I posted a thread here a few weeks ago asking for help coding a URI parser. I didn't get much help, but anyway, here it is; feel free to use it, modify it, delete it, or whatever. I coded it to the exact spec of RFC 2396, and I've thoroughly tested it, and it's 101% rock-solid.
Feel free to critique my coding as well, as long as it's constructive. And if you find an error, please let me know!!
here it is:
Filename: uri.c
Code:
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "uri.h"
/* Lower-case the NUL-terminated string at c in place.
   Only the letters 'A'..'Z' are touched (ASCII encoding assumed, where the
   lower-case letter sits 32 positions above its upper-case form). */
void strlwr(char * c)
{
    char * p;

    for (p = c; *p != '\0'; p++)
    {
        if (*p >= 'A' && *p <= 'Z')
        {
            *p += 'a' - 'A';
        }
    }
}
/* Upper-case the NUL-terminated string at c in place.
   Only the letters 'a'..'z' are touched (ASCII encoding assumed, where the
   upper-case letter sits 32 positions below its lower-case form). */
void strupr(char * c)
{
    char * p;

    for (p = c; *p != '\0'; p++)
    {
        if (*p >= 'a' && *p <= 'z')
        {
            *p -= 'a' - 'A';
        }
    }
}
/* Release every allocation referenced from a NULL-terminated array of
   pointers, clearing each slot as it goes. The array itself is not freed;
   iteration stops at the first NULL slot. */
void freeptrs(void ** ptrs)
{
    void ** slot;

    for (slot = ptrs; *slot != NULL; slot++)
    {
        free(*slot);
        *slot = NULL;
    }
}
/* True when c is one of the RFC 2396 "reserved" characters:
   ; / ? : @ & = + $ ,  */
bool is_reserved(char c)
{
    switch (c)
    {
    case ';':
    case '/':
    case '?':
    case ':':
    case '@':
    case '&':
    case '=':
    case '+':
    case '$':
    case ',':
        return true;
    default:
        return false;
    }
}
/* True when c is one of the RFC 2396 "mark" characters:
   - _ . ! ~ * ' ( )  */
bool is_mark(char c)
{
    switch (c)
    {
    case '-':
    case '_':
    case '.':
    case '!':
    case '~':
    case '*':
    case '\'':
    case '(':
    case ')':
        return true;
    default:
        return false;
    }
}
/* True when c is an upper-case letter 'A'..'Z' (ASCII encoding assumed). */
bool is_upalpha(char c)
{
    return (c >= 'A' && c <= 'Z');
}
/* True when c is a lower-case letter 'a'..'z' (ASCII encoding assumed). */
bool is_lowalpha(char c)
{
    return (c >= 'a' && c <= 'z');
}
/* True when c is a decimal digit '0'..'9'. */
bool is_digit(char c)
{
    return (c >= '0' && c <= '9');
}
/* True when c is a letter of either case (is_upalpha or is_lowalpha). */
bool is_alpha(char c)
{
    return (is_upalpha(c) || is_lowalpha(c));
}
/* True when c is a letter or a decimal digit (is_alpha or is_digit). */
bool is_alphanum(char c)
{
    return (is_alpha(c) || is_digit(c));
}
/* True when c is "unreserved": an alphanumeric or one of the mark characters. */
bool is_unreserved(char c)
{
    return (is_alphanum(c) || is_mark(c));
}
/* True when c is a hexadecimal digit: 0-9, A-F or a-f. */
bool is_hex(char c)
{
    if (is_digit(c))
    {
        return true;
    }
    if (c >= 'A' && c <= 'F')
    {
        return true;
    }
    return (c >= 'a' && c <= 'f');
}
/* True when c points at a "%HH" escape: a percent sign followed by two
   hexadecimal digits. The && chain short-circuits, so c[1] is read only
   after c[0] matched '%', and c[2] only after c[1] was a hex digit. */
bool is_escaped(char * c)
{
    return (c[0] == '%' && is_hex(c[1]) && is_hex(c[2]));
}
/* Release the strings owned by an authority and reset every member
   (pointers to NULL, port to 0). The authority object itself is NOT freed.
   The pointer must not be NULL. */
void freeauthority(authority * authority)
{
    /* free(NULL) is a no-op, so unset members need no guard */
    free(authority->reg_name);
    authority->reg_name = NULL;

    free(authority->userinfo);
    authority->userinfo = NULL;

    free(authority->host);
    authority->host = NULL;

    authority->port = 0;
}
/* Release a parsedURI together with everything it owns: the scheme,
   opaque_part and query strings, the NULL-terminated segment array with its
   strings, and the authority. Passing NULL is allowed and does nothing.
   The caller's own pointer is passed by value and therefore is not reset. */
void freeparsedURI(parsedURI * parseduri)
{
    if (parseduri == NULL)
    {
        return;
    }

    /* free(NULL) is a no-op, so plain string members need no guard */
    free(parseduri->scheme);
    free(parseduri->opaque_part);
    free(parseduri->query);

    if (parseduri->abspathsegs != NULL)
    {
        /* first the segment strings, then the array that held them */
        freeptrs((void**)parseduri->abspathsegs);
        free(parseduri->abspathsegs);
    }

    if (parseduri->autho != NULL)
    {
        /* first the authority's strings, then the authority object */
        freeauthority(parseduri->autho);
        free(parseduri->autho);
    }

    free(parseduri);
}
/* Put a parsedURI into its empty state by setting every pointer member to
   NULL. Nothing is freed; intended for a freshly allocated object. */
void nullparsedURI(parsedURI * parseduri)
{
    /* the three plain strings */
    parseduri->scheme = parseduri->opaque_part = parseduri->query = NULL;
    /* the nested authority and the segment array */
    parseduri->autho = NULL;
    parseduri->abspathsegs = NULL;
}
/* Put an authority into its empty state: all strings NULL, port 0.
   Nothing is freed; intended for a freshly allocated object. */
void nullauthority(authority * authority)
{
    authority->reg_name = authority->userinfo = authority->host = NULL;
    authority->port = 0;
}
/* Helper for verify_chop_abspath: free the first `count` segment strings and
   then the segment array itself. */
static void abspath_discard_segs(char ** segs, unsigned long count)
{
    unsigned long i;

    for (i = 0; i < count; i++)
    {
        free(segs[i]);
    }
    free(segs);
}

/* Validate the path text in [firstchar, lastchar] (both inclusive; the text
   is expected to start AFTER the leading '/' of the abs_path) and split it
   on '/' into separately allocated segment strings.

   Returns a NULL-terminated array of NUL-terminated strings; the caller owns
   the array and every string (freeptrs() followed by free() releases them).
   Returns NULL when the range is empty, when it contains a character that is
   not legal in a path segment, when it contains an empty segment ("//"), or
   when memory cannot be allocated.

   A single trailing '/' does not produce an extra empty segment. */
char ** verify_chop_abspath(char * firstchar,char * lastchar)
{
    char ** segs;
    char * cur;
    char * segend;
    unsigned long numsegs = 1;
    unsigned long i;
    size_t seglen;

    /* first make sure the string is at least 1 char long */
    if (lastchar < firstchar) return (NULL);

    /* validation pass: every character must be '/', a pchar, or ';'
       (RFC 2396 section 3.3: segment = *pchar *( ";" param )) */
    for (cur = firstchar; cur <= lastchar; cur++)
    {
        if (*cur == '/')
        {
            if (cur < lastchar)
            {
                /* "//" would be an empty segment, which is rejected */
                if (*(cur + 1) == '/') return (NULL);
                numsegs++;
            }
        }
        else if (*cur == '%')
        {
            /* the whole %HH escape has to lie inside the range */
            if ((lastchar - cur) < 2 || !is_escaped(cur)) return (NULL);
        }
        else if (!(is_unreserved(*cur) || *cur == ':' || *cur == ';' || *cur == '@'
                   || *cur == '&' || *cur == '=' || *cur == '+' || *cur == '$'
                   || *cur == ','))
        {
            /* invalid abspath */
            return (NULL);
        }
    }

    /* allocate the pointer array, with one extra slot for the terminator */
    segs = (char**)malloc(sizeof(char*) * (numsegs + 1));
    if (segs == NULL) return (NULL);
    segs[numsegs] = NULL;

    /* copy pass: one allocation per segment */
    cur = firstchar;
    for (i = 0; i < numsegs; i++)
    {
        segend = cur;
        /* bounds test first so nothing past lastchar is examined */
        while (segend <= lastchar && *segend != '/') segend++;

        seglen = (size_t)(segend - cur);
        segs[i] = (char*)malloc(seglen + 1);
        if (segs[i] == NULL)
        {
            /* undo the partial result instead of leaking it */
            abspath_discard_segs(segs, i);
            return (NULL);
        }
        memcpy(segs[i], cur, seglen);
        segs[i][seglen] = '\0';

        /* step over the '/' that ended this segment */
        cur = segend + 1;
    }
    return (segs);
}
/* Helper for verify_fill_authority: heap copy of the characters in
   [first, last] (inclusive) as a NUL-terminated string, or NULL when the
   allocation fails. Requires last >= first. */
static char * authority_dup_range(const char * first, const char * last)
{
    size_t len = (size_t)(last - first) + 1;
    char * copy = (char*)malloc(len + 1);

    if (copy != NULL)
    {
        memcpy(copy, first, len);
        copy[len] = '\0';
    }
    return copy;
}

/* Helper for verify_fill_authority: drop whatever has been filled in so far
   and store the whole authority text as reg_name instead. Returns autho, or
   NULL (with autho released) when the copy cannot be allocated. */
static authority * authority_as_reg_name(authority * autho, char * firstchar, char * lastchar)
{
    freeauthority(autho);
    autho->reg_name = authority_dup_range(firstchar, lastchar);
    if (autho->reg_name == NULL)
    {
        free(autho);
        return NULL;
    }
    return autho;
}

/* Helper for verify_fill_authority: 1 when [first, last] is an acceptable
   host, 0 otherwise. A host is non-empty and made of alphanumerics, '-' and
   '.'; a '-' must have an alphanumeric on both sides, a '.' may not come
   first and may not be followed by another '.'. */
static int authority_host_ok(const char * first, const char * last)
{
    const char * p;

    if (last < first) return 0;
    for (p = first; p <= last; p++)
    {
        if (*p == '-')
        {
            /* edge tests come first so no neighbour outside the range is read */
            if (p == first || p == last) return 0;
            if (!is_alphanum(*(p - 1)) || !is_alphanum(*(p + 1))) return 0;
        }
        else if (*p == '.')
        {
            if (p == first) return 0;
            if (p < last && *(p + 1) == '.') return 0;
        }
        else if (!is_alphanum(*p))
        {
            return 0;
        }
    }
    return 1;
}

/* Validate the authority text in [firstchar, lastchar] (both inclusive) and
   break it into [<userinfo>'@']<host>[':'<port>].

   Returns a newly allocated authority owned by the caller (release it with
   freeauthority() followed by free()). When the text is legal but does not
   fit the userinfo/host/port form, the whole text is stored in reg_name and
   the other members stay NULL/0. A port that does not fit the 16-bit port
   member (greater than 65535) is handled the same way, so the value is never
   silently truncated.

   Returns NULL when the range is empty, when it contains a character that is
   not legal in an authority, or when memory cannot be allocated. */
authority * verify_fill_authority(char * firstchar,char * lastchar)
{
    authority * autho;
    char * cur;
    char * hostfirst;
    char * hostlast;
    /* these delimiters separate the authority into <userinfo>@<host>:<port> */
    char * pos_of_first_atsign = NULL;
    char * pos_of_last_colon = NULL;
    unsigned long num_atsigns = 0;
    unsigned long num_colons = 0;
    unsigned long portval = 0;
    int has_port_part;

    /* first make sure the string is at least 1 char long */
    if (lastchar < firstchar) return NULL;

    /* check that the string is fully legal, noting the delimiters on the way */
    for (cur = firstchar; cur <= lastchar; cur++)
    {
        if (!(is_unreserved(*cur) || is_escaped(cur) || *cur == '$' || *cur == ','
              || *cur == ':' || *cur == ';' || *cur == '@' || *cur == '&'
              || *cur == '=' || *cur == '+'))
        {
            /* invalid authority string */
            return NULL;
        }
        if (*cur == '@')
        {
            num_atsigns++;
            if (num_atsigns == 1) pos_of_first_atsign = cur;
        }
        else if (*cur == ':')
        {
            num_colons++;
            pos_of_last_colon = cur;
        }
    }

    /* allocate and null the autho */
    autho = (authority*)malloc(sizeof(authority));
    if (autho == NULL) return NULL;
    nullauthority(autho);

    /* more than one '@', or several ':' without an '@': registry name only */
    if (num_atsigns > 1 || (num_colons > 1 && num_atsigns == 0))
    {
        return authority_as_reg_name(autho, firstchar, lastchar);
    }

    /* it is a combination of [<userinfo>'@'][<host>[':'<port>]] */
    hostfirst = firstchar;
    if (num_atsigns > 0)
    {
        /* copy the userinfo, unless nothing precedes the '@' */
        if (pos_of_first_atsign > firstchar)
        {
            autho->userinfo = authority_dup_range(firstchar, pos_of_first_atsign - 1);
            if (autho->userinfo == NULL)
            {
                free(autho);
                return NULL;
            }
        }
        hostfirst = pos_of_first_atsign + 1;
    }

    /* the last ':' separates host and port only when it lies in the hostport
       part, i.e. there is no '@' at all or the ':' comes after it */
    has_port_part = (num_colons != 0 &&
                     (num_atsigns == 0 || pos_of_last_colon > pos_of_first_atsign));
    hostlast = has_port_part ? (pos_of_last_colon - 1) : lastchar;

    /* check and copy the host */
    if (!authority_host_ok(hostfirst, hostlast))
    {
        return authority_as_reg_name(autho, firstchar, lastchar);
    }
    autho->host = authority_dup_range(hostfirst, hostlast);
    if (autho->host == NULL)
    {
        freeauthority(autho);
        free(autho);
        return NULL;
    }

    /* attempt to extract a port number; an empty port leaves it at 0 */
    if (has_port_part)
    {
        for (cur = pos_of_last_colon + 1; cur <= lastchar; cur++)
        {
            if (!is_digit(*cur))
            {
                return authority_as_reg_name(autho, firstchar, lastchar);
            }
            portval = portval * 10 + (unsigned long)(*cur - '0');
            if (portval > 65535UL)
            {
                /* does not fit the port member: keep the text verbatim */
                return authority_as_reg_name(autho, firstchar, lastchar);
            }
        }
        autho->port = (unsigned short)portval;
    }
    return (autho);
}
/* Helper for parseURI: heap copy of `len` characters starting at `first`,
   NUL-terminated, or NULL when the allocation fails. */
static char * parseuri_dup_span(const char * first, size_t len)
{
    char * copy = (char*)malloc(len + 1);

    if (copy != NULL)
    {
        memcpy(copy, first, len);
        copy[len] = '\0';
    }
    return copy;
}

/* Parse the NUL-terminated absolute URI in `URI` into a newly allocated
   parsedURI. The input string is only read, never modified.

   Result layout:
     - scheme is always set (converted to lower case);
     - "scheme:<text not starting with '/'>" fills opaque_part only;
     - "scheme:/path[?query]" fills abspathsegs and, when present, query;
     - "scheme://authority[/path][?query]" fills autho and, when present,
       abspathsegs and query.

   Returns NULL when URI is NULL, when it is not a valid URI in the sense
   checked here, or when memory cannot be allocated. A non-NULL result is
   owned by the caller and must be released with freeparsedURI(). */
parsedURI * parseURI(char * URI)
{
    char * cur;
    char * end;
    parsedURI * parseduri;

    if (URI == NULL) return (NULL);

    parseduri = (parsedURI *)malloc(sizeof(parsedURI));
    if (parseduri == NULL) return (NULL);
    /* null all the members so the failure path can free a partial result */
    nullparsedURI(parseduri);

    /* start with the scheme: a letter followed by letters, digits, + - . */
    cur = URI;
    if (!is_alpha(*cur)) goto fail;
    cur++;
    while (is_alpha(*cur) || is_digit(*cur) || *cur == '+' || *cur == '-' || *cur == '.') cur++;
    /* the scheme must end in ':' and something must follow the ':' */
    if (*cur != ':' || *(cur + 1) == '\0') goto fail;

    parseduri->scheme = parseuri_dup_span(URI, (size_t)(cur - URI));
    if (parseduri->scheme == NULL) goto fail;
    /* convert scheme to lower-case */
    strlwr(parseduri->scheme);

    /* go on to the character after the ':' */
    cur++;
    if (*cur != '/')
    {
        /* no slash after the colon: the rest of the URI is opaque, PROVIDED
           it does not contain any illegal characters */
        end = cur;
        while (*end != '\0' && (is_reserved(*end) || is_unreserved(*end) || is_escaped(end))) end++;
        if (*end != '\0') goto fail;

        parseduri->opaque_part = parseuri_dup_span(cur, (size_t)(end - cur));
        if (parseduri->opaque_part == NULL) goto fail;
        /* an opaque URI has no separate path or query */
        return (parseduri);
    }

    /* hierarchical part */
    cur++;
    if (*cur == '\0' || *cur == '?') goto fail;   /* nothing after the '/' */

    if (*cur != '/')
    {
        /* no net path: everything up to '?' or the end is the abs_path */
        end = cur;
        while (*end != '\0' && *end != '?') end++;
        parseduri->abspathsegs = verify_chop_abspath(cur, end - 1);
        if (!parseduri->abspathsegs) goto fail;
    }
    else
    {
        /* "//": a net path, starting with the authority */
        cur++;
        if (*cur == '\0' || *cur == '?') goto fail;   /* nothing after "//" */

        end = cur;
        while (*end != '\0' && *end != '/' && *end != '?') end++;
        parseduri->autho = verify_fill_authority(cur, end - 1);
        if (!parseduri->autho) goto fail;

        /* the abs_path, when something follows the '/' after the authority */
        if (*end == '/' && *(end + 1) != '\0' && *(end + 1) != '?')
        {
            cur = ++end;
            while (*end != '\0' && *end != '?') end++;
            parseduri->abspathsegs = verify_chop_abspath(cur, end - 1);
            if (!parseduri->abspathsegs) goto fail;
        }
    }

    /* the query: everything after the first '?', when not empty */
    cur = strchr(URI, '?');
    if (cur && *(cur + 1) != '\0')
    {
        cur++;
        end = cur;
        while (*end != '\0' && (is_reserved(*end) || is_unreserved(*end) || is_escaped(end))) end++;
        if (*end != '\0') goto fail;

        parseduri->query = parseuri_dup_span(cur, (size_t)(end - cur));
        if (parseduri->query == NULL) goto fail;
    }
    return (parseduri);

fail:
    /* releases whatever had been filled in so far */
    freeparsedURI(parseduri);
    return (NULL);
}
Filename: uri.h
Code:
#ifndef _URI_H
#define _URI_H
/* Authority component of a hierarchical URI, as filled in by
   verify_fill_authority(). Either reg_name is set and the other members are
   NULL/0, or the text was split into userinfo/host/port and reg_name is NULL.
   Every string is heap-allocated; freeauthority() releases them. */
typedef struct _authority
{
char * reg_name; /* whole authority text, used when it is not of the form [userinfo@]host[:port] */
char * userinfo; /* text before the first '@'; NULL when absent or empty */
char * host; /* host name text; NULL when reg_name is used */
unsigned short port; /* set to zero if no port specified */
}
authority;
/* Result of parseURI(). Members that do not apply to the parsed URI are NULL.
   The whole object, including everything it points to, is released with
   freeparsedURI(). */
typedef struct _parsedURI
{
char * scheme; /* scheme name, converted to lower case */
char * opaque_part; /* text after "scheme:" when it does not start with '/' */
authority * autho; /* authority of a "scheme://..." URI */
char ** abspathsegs; /* NULL-terminated array of path segment strings */
char * query; /* text after the first '?' of a hierarchical URI; NULL when absent or empty */
}
parsedURI;
/* The predicates below return bool. Pull the type in here so that this
   header is self-contained when compiled as C (the include is harmless when
   the file is compiled as C++). */
#include <stdbool.h>
/* general purpose functions */
/* in-place ASCII case conversion of a NUL-terminated string */
void strlwr(char * c);
void strupr(char * c);
/* frees and clears every entry of a NULL-terminated pointer array (not the array itself) */
void freeptrs(void ** ptrs);
/* RFC 2396 character classes */
bool is_reserved(char c);
bool is_mark(char c);
bool is_upalpha(char c);
bool is_lowalpha(char c);
bool is_digit(char c);
bool is_alpha(char c);
bool is_alphanum(char c);
bool is_unreserved(char c);
bool is_hex(char c);
/* true when c points at '%' followed by two hexadecimal digits */
bool is_escaped(char * c);
/* frees the members of an authority and resets them; the object itself is kept */
void freeauthority(authority * authority);
/* frees a parsedURI and everything it owns; NULL is accepted */
void freeparsedURI(parsedURI * parseduri);
/* reset every member to NULL/0 without freeing anything */
void nullparsedURI(parsedURI * parseduri);
void nullauthority(authority * authority);
/* validate and split the inclusive character range [firstchar, lastchar];
   both return NULL when the range is empty or invalid */
char ** verify_chop_abspath(char * firstchar,char * lastchar);
authority * verify_fill_authority(char * firstchar,char * lastchar);
/* parse a NUL-terminated URI; returns NULL when it is invalid, otherwise a
   result the caller releases with freeparsedURI() */
parsedURI * parseURI(char * URI);
#endif
Filename: example.c
Code:
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "uri.h"
/* Example driver: reads one whitespace-delimited URI from standard input,
   parses it with parseURI() and prints every component that was found.
   Returns 0 normally, 1 when the input buffer cannot be allocated. */
int main(void)
{
    parsedURI * parseduri = NULL;
    char * buff = (char*)malloc(100);

    if (buff == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return(1);
    }

    printf("Enter URI:");
    /* the field width keeps scanf inside the 100-byte buffer (99 characters
       plus the terminator); when nothing could be read, parseduri stays NULL
       and the input is reported as not parsed */
    if (scanf("%99s", buff) == 1)
    {
        parseduri = parseURI(buff);
    }

    if (parseduri)
    {
        printf("URI was parsed\n");
        if (parseduri->scheme)
        {
            printf("scheme:");
            puts(parseduri->scheme);
        }
        if (parseduri->opaque_part)
        {
            printf("opaque_part:");
            puts(parseduri->opaque_part);
        }
        if (parseduri->autho)
        {
            if (parseduri->autho->reg_name)
            {
                printf("reg_name:");
                puts(parseduri->autho->reg_name);
            }
            if (parseduri->autho->userinfo)
            {
                printf("userinfo:");
                puts(parseduri->autho->userinfo);
            }
            if (parseduri->autho->host)
            {
                printf("host:");
                puts(parseduri->autho->host);
            }
            if (parseduri->autho->port)
            {
                printf("port:%hu\n", parseduri->autho->port);
            }
        }
        if (parseduri->abspathsegs)
        {
            char ** curseg = parseduri->abspathsegs;
            while (*curseg)
            {
                printf("Absolute path segment:");
                puts(*curseg);
                curseg++;
            }
        }
        if (parseduri->query)
        {
            printf("query:");
            puts(parseduri->query);
        }
    }
    else
    {
        printf("URI was not parsed\n");
    }

    /* freeparsedURI accepts NULL, so no guard is needed here */
    freeparsedURI(parseduri);
    free(buff);
    return(0);
}