LinuxQuestions.org
Visit Jeremy's Blog.
Home Forums Tutorials Articles Register
Go Back   LinuxQuestions.org > Forums > Non-*NIX Forums > Programming
User Name
Password
Programming This forum is for all programming questions.
The question does not have to be directly related to Linux and any language is fair game.

Notices


Reply
  Search this Thread
Old 05-02-2004, 10:39 PM   #1
nodger
Member
 
Registered: Oct 2003
Location: Ireland
Distribution: Slackware 9.1, Ubuntu
Posts: 192

Rep: Reputation: 30
Talking URI parser


I posted a thread here a few weeks ago asking for help coding a URI parser. I didn't get much help, but anyway, here it is; feel free to use it, modify it, delete it, or whatever. I coded it to the exact spec of RFC 2396, and I've thoroughly tested it, and it's 101% rock-solid.

Feel free to critique my coding as well, as long as it's constructive. And if you find an error, please let me know!!

here it is:

Filename: uri.c

Code:
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "uri.h"

/* Lower-case the NUL-terminated string at c in place.
   Only the letters 'A'..'Z' are altered; every other byte is left as is. */
void strlwr(char * c)
{
	char * cur;

	for (cur=c; *cur!='\0'; cur++)
	{
		if (*cur>='A' && *cur<='Z')
		{
			*cur=(char)(*cur+('a'-'A'));
		}
	}
}

/* Upper-case the NUL-terminated string at c in place.
   Only the letters 'a'..'z' are altered; every other byte is left as is. */
void strupr(char * c)
{
	char * cur;

	for (cur=c; *cur!='\0'; cur++)
	{
		if (*cur>='a' && *cur<='z')
		{
			*cur=(char)(*cur-('a'-'A'));
		}
	}
}

/* Release every allocation referenced by a NULL-terminated array of
   pointers, overwriting each slot with NULL as it goes.  The array itself
   is not freed; that remains the caller's job. */
void freeptrs(void ** ptrs)
{
	void ** slot;

	for (slot=ptrs; *slot!=NULL; slot++)
	{
		free(*slot);
		*slot=NULL;
	}
}

/* True when c is one of the reserved characters  ; / ? : @ & = + $ ,  */
bool is_reserved(char c)
{
	switch (c)
	{
		case ';':
		case '/':
		case '?':
		case ':':
		case '@':
		case '&':
		case '=':
		case '+':
		case '$':
		case ',':
			return true;
		default:
			return false;
	}
}

/* True when c is one of the "mark" characters  - _ . ! ~ * ' ( )  */
bool is_mark(char c)
{
	const char * m;

	/* the scan stops before the terminator, so '\0' never matches */
	for (m="-_.!~*'()"; *m!='\0'; m++)
	{
		if (*m==c) return true;
	}
	return false;
}

/* True when c is an upper-case letter 'A'..'Z'. */
bool is_upalpha(char c)
{
	return (c>='A' && c<='Z');
}

/* True when c is a lower-case letter 'a'..'z'. */
bool is_lowalpha(char c)
{
	return (c>='a' && c<='z');
}

/* True when c is a decimal digit '0'..'9'. */
bool is_digit(char c)
{
	return (c>='0' && c<='9');
}

/* True when c is a letter of either case. */
bool is_alpha(char c)
{
	return (is_upalpha(c) || is_lowalpha(c));
}

/* True when c is a letter or a decimal digit. */
bool is_alphanum(char c)
{
	return (is_alpha(c) || is_digit(c));
}

/* True when c is "unreserved": a letter, a digit, or a mark character. */
bool is_unreserved(char c)
{
	return (is_alphanum(c) || is_mark(c));
}

/* True when c is a hexadecimal digit: 0-9, A-F or a-f. */
bool is_hex(char c)
{
	if (is_digit(c)) return true;
	if (c>='A' && c<='F') return true;
	return (c>='a' && c<='f');
}

/* True when c points at a percent-escape: '%' followed by two hex digits.
   Evaluation is left to right and stops at the first failing test, so
   c[1] is only read after c[0]=='%', and c[2] only after c[1] is hex. */
bool is_escaped(char * c)
{
	return (c[0]=='%' && is_hex(c[1]) && is_hex(c[2]));
}



/* Release the strings owned by an authority and reset it to the empty
   state (all pointers NULL, port 0).  The struct itself is not freed. */
void freeauthority(authority * authority)
{
	/* free(NULL) is a no-op, so unset members need no guard */
	free(authority->reg_name);
	authority->reg_name=NULL;

	free(authority->userinfo);
	authority->userinfo=NULL;

	free(authority->host);
	authority->host=NULL;

	authority->port=0;
}

/* Release a parsedURI and everything it owns: the scheme, opaque part and
   query strings, the path-segment array together with each segment, and
   the authority.  A NULL argument is accepted and ignored.
   The pointer is passed by value, so the caller's copy is left dangling
   after this call and must not be used again. */
void freeparsedURI(parsedURI * parseduri)
{
	if (parseduri==NULL) return;

	free(parseduri->scheme);
	parseduri->scheme=NULL;

	free(parseduri->opaque_part);
	parseduri->opaque_part=NULL;

	if (parseduri->abspathsegs!=NULL)
	{
		/* free each segment first, then the pointer array itself */
		freeptrs((void**)parseduri->abspathsegs);
		free(parseduri->abspathsegs);
		parseduri->abspathsegs=NULL;
	}

	free(parseduri->query);
	parseduri->query=NULL;

	if (parseduri->autho!=NULL)
	{
		freeauthority(parseduri->autho);
		free(parseduri->autho);
		parseduri->autho=NULL;
	}

	free(parseduri);
}

/* Put a parsedURI into the empty state: every member pointer NULL.
   Nothing is freed, so this is meant for freshly allocated structs. */
void nullparsedURI(parsedURI * parseduri)
{
	/* plain string members */
	parseduri->scheme=parseduri->opaque_part=parseduri->query=NULL;

	/* structured members */
	parseduri->autho=NULL;
	parseduri->abspathsegs=NULL;
}

/* Put an authority into the empty state: all strings NULL and port 0.
   Nothing is freed, so this is meant for freshly allocated structs. */
void nullauthority(authority * authority)
{
	authority->reg_name=authority->userinfo=authority->host=NULL;
	authority->port=0;
}

/* Validate the path text in [firstchar,lastchar] and split it on '/'.
 *
 * firstchar is the first character after the path's leading '/', and
 * lastchar is the last character of the path (both inclusive).  The bytes
 * at lastchar+1 and lastchar+2 may be inspected while validating, so the
 * span must be part of a NUL-terminated string.
 *
 * Accepted characters are '/', unreserved characters, %XX escapes and
 * : @ & = + $ ,   Two consecutive slashes are rejected.
 * NOTE(review): ';' is not in the accepted set, so path parameters are
 * rejected here - confirm that is intended.
 *
 * Returns a malloc'd, NULL-terminated array of malloc'd segment strings
 * (a trailing '/' does not produce a final empty segment), or NULL when
 * the span is empty, contains an illegal character, or memory runs out.
 * The caller releases the result with freeptrs() followed by free().
 */
char ** verify_chop_abspath(char * firstchar,char * lastchar)
{
	char ** segs;
	char * mrkr1,* mrkr2;
	unsigned long numsegs=1;
	unsigned long i;
	size_t seglen;
	
	/* first make sure the string is at least 1 char long */
	if (lastchar<firstchar) return (NULL);
	
	/* validation pass: stop at the first illegal character and count
	   the segments on the way */
	mrkr1=firstchar;
	while (mrkr1<=lastchar && !(*mrkr1=='/' && *(mrkr1+1)=='/') && 
	(*mrkr1=='/'||is_unreserved(*mrkr1)||is_escaped(mrkr1)||*mrkr1==':'
	||*mrkr1=='@'||*mrkr1=='&'||*mrkr1=='='||*mrkr1=='+'||*mrkr1=='$'||*mrkr1==','))
	{
		/* a slash that is not the very last character opens a new segment */
		if (*mrkr1=='/' && mrkr1<lastchar) numsegs++;
		mrkr1++;
	}
	if (mrkr1<=lastchar)
	{
		/* invalid abspath */
		return (NULL);
	}
	
	/* allocate memory for the pointers plus the NULL terminator */
	segs=(char**)malloc(sizeof(char*)*(numsegs+1));
	if (segs==NULL) return (NULL);
	
	/* keep the array NULL-terminated at every step so a partial result
	   can be handed to freeptrs() if an allocation fails */
	segs[0]=NULL;
	
	/* segment allocating and copying loop */
	mrkr1=firstchar;
	for (i=0; i<numsegs; i++)
	{
		mrkr2=mrkr1;
		while (mrkr2<=lastchar && *mrkr2!='/') mrkr2++;
		seglen=(size_t)(mrkr2-mrkr1);
		
		segs[i]=(char*)malloc(seglen+1);
		if (segs[i]==NULL)
		{
			/* out of memory: release what was built so far */
			freeptrs((void**)segs);
			free(segs);
			return (NULL);
		}
		memcpy(segs[i],mrkr1,seglen);
		segs[i][seglen]='\0';
		segs[i+1]=NULL;
		
		/* skip the slash that ended this segment */
		mrkr1=mrkr2+1;
	}
	return (segs);
}

/* Copy the inclusive span [first,last] into a new NUL-terminated string.
   Returns NULL when the allocation fails. */
static char * authority_dup_span(const char * first,const char * last)
{
	size_t len=(size_t)(last-first)+1;
	char * copy=(char*)malloc(len+1);
	
	if (copy!=NULL)
	{
		memcpy(copy,first,len);
		copy[len]='\0';
	}
	return (copy);
}

/* Drop whatever components autho holds and store the whole authority text
   as reg_name instead.  Returns autho, or NULL (with autho freed) when the
   copy cannot be allocated. */
static authority * authority_as_reg_name(authority * autho,const char * firstchar,const char * lastchar)
{
	freeauthority(autho);
	autho->reg_name=authority_dup_span(firstchar,lastchar);
	if (autho->reg_name==NULL)
	{
		free(autho);
		return (NULL);
	}
	return (autho);
}

/* Validate the authority text in [firstchar,lastchar] (both inclusive) and
 * break it into its components.
 *
 * The text may contain unreserved characters, %XX escapes and
 * $ , : ; @ & = +   anything else makes the authority invalid.  The byte
 * after lastchar may be inspected while validating, so the span must be
 * part of a NUL-terminated string.
 *
 * When the text fits [<userinfo>'@']<host>[':'<port>] the userinfo, host
 * and port members are filled in (port stays 0 when absent or empty).
 * Otherwise - more than one '@', several ':' without an '@', a malformed
 * host, a non-numeric port or a port above 65535 - the whole text is
 * stored in reg_name and the other members stay empty.
 *
 * Returns a malloc'd authority the caller releases with freeauthority()
 * followed by free(), or NULL when the span is empty, contains an illegal
 * character, or memory runs out.
 */
authority * verify_fill_authority(char * firstchar,char * lastchar)
{
	authority * autho;
	char * mrkr1,* mrkr2,* mrkr3;
	
	/* these delimiters are used to separate the authority into <userinfo>@<host>:<port> format. */
	char * pos_of_first_atsign=NULL;
	char * pos_of_last_colon=NULL;
	unsigned long num_atsigns=0;
	unsigned long num_colons=0;
	unsigned long port=0;
	bool has_port;
	
	/* first make sure the string is at least 1 char long */
	if (lastchar<firstchar) return (NULL);
	
	/* check that the string is fully legal, noting the delimiters */
	mrkr1=firstchar;
	while (mrkr1<=lastchar && (is_unreserved(*mrkr1)||is_escaped(mrkr1)||*mrkr1=='$'||*mrkr1==','||*mrkr1==':'
	||*mrkr1==';'||*mrkr1=='@'||*mrkr1=='&'||*mrkr1=='='||*mrkr1=='+')) 
	{
		if (*mrkr1=='@') 
		{
			num_atsigns++;
			if (num_atsigns==1) pos_of_first_atsign=mrkr1;
		}
		else if (*mrkr1==':')
		{
			num_colons++;
			pos_of_last_colon=mrkr1;
		}
		mrkr1++;
	}
	if (mrkr1<=lastchar)
	{
		/* invalid authority string */
		return (NULL);
	}
	
	/* allocate and null the autho */
	autho=(authority*)malloc(sizeof(authority));
	if (autho==NULL) return (NULL);
	nullauthority(autho);
	
	/* these shapes cannot be <userinfo>@<host>:<port> */
	if (num_atsigns>1 || (num_colons>1 && num_atsigns==0))
	{
		return (authority_as_reg_name(autho,firstchar,lastchar));
	}
	
	/* it is a combination of [<userinfo>'@'][<host>[':'<port>]] */
	mrkr1=firstchar;
	
	/* copy the userinfo (an empty one is left as NULL) */
	if (num_atsigns>0)
	{
		if (pos_of_first_atsign>firstchar)
		{
			autho->userinfo=authority_dup_span(firstchar,pos_of_first_atsign-1);
			if (autho->userinfo==NULL)
			{
				freeauthority(autho);
				free(autho);
				return (NULL);
			}
		}
		mrkr1=pos_of_first_atsign+1;
	}
	
	/* mrkr1 now points to the first character in the hostport.
	   A colon only introduces a port when it comes after the '@';
	   num_atsigns is 0 or 1 at this point. */
	has_port=(num_colons!=0 && (num_atsigns==0 || pos_of_last_colon>pos_of_first_atsign));
	mrkr2=has_port ? pos_of_last_colon-1 : lastchar;
	
	/* check the host, which occupies [mrkr1,mrkr2] */
	for (mrkr3=mrkr1; mrkr3<=mrkr2; mrkr3++)
	{
		if (*mrkr3=='-')
		{
			/* a hyphen must sit between two alphanumerics */
			if (mrkr3==mrkr1 || mrkr3==mrkr2 ||
			!is_alphanum(*(mrkr3-1)) || !is_alphanum(*(mrkr3+1))) break;
		}
		else if (*mrkr3=='.')
		{
			/* no leading dot and no two dots in a row */
			if (mrkr3==mrkr1 || (mrkr3<mrkr2 && *(mrkr3+1)=='.')) break;
		}
		else if (!is_alphanum(*mrkr3)) break;
	}
	
	/* an empty host, or one that failed the check, means reg_name only */
	if (!(mrkr2>=mrkr1 && mrkr3>mrkr2))
	{
		return (authority_as_reg_name(autho,firstchar,lastchar));
	}
	
	/* copy the host */
	autho->host=authority_dup_span(mrkr1,mrkr2);
	if (autho->host==NULL)
	{
		freeauthority(autho);
		free(autho);
		return (NULL);
	}
	
	/* attempt to extract a port number */
	if (has_port)
	{
		for (mrkr1=pos_of_last_colon+1; mrkr1<=lastchar && is_digit(*mrkr1); mrkr1++)
		{
			port=port*10+(unsigned long)(*mrkr1-'0');
			/* stop early: the value no longer fits the port member */
			if (port>65535UL) break;
		}
		if (mrkr1<=lastchar)
		{
			/* non-digit or out-of-range port: reg_name only */
			return (authority_as_reg_name(autho,firstchar,lastchar));
		}
		autho->port=(unsigned short)port;
	}
	
	return (autho);
}

/* Copy the half-open span [first,end) into a new NUL-terminated string.
   Returns NULL when the allocation fails. */
static char * uri_dup_range(const char * first,const char * end)
{
	size_t len=(size_t)(end-first);
	char * copy=(char*)malloc(len+1);
	
	if (copy!=NULL)
	{
		memcpy(copy,first,len);
		copy[len]='\0';
	}
	return (copy);
}

/* Parse an absolute URI held in a NUL-terminated string.
 *
 * The URI must start with <scheme>':' followed by at least one character.
 * The scheme is stored lower-cased.  What follows the colon decides the
 * shape of the result:
 *   - no '/' after the colon: the rest is stored as opaque_part;
 *   - a single '/': the rest, up to any '?', is an absolute path and is
 *     split into abspathsegs;
 *   - "//": an authority follows (stored in autho), optionally followed
 *     by an absolute path.  An empty authority, as in "scheme:///path",
 *     is rejected.
 * For the two hierarchical shapes, non-empty text after the first '?' is
 * stored as query.
 *
 * Returns a malloc'd parsedURI the caller releases with freeparsedURI(),
 * or NULL when URI is NULL, is not a valid URI, or memory runs out.
 */
parsedURI * parseURI(char * URI)
{
	char * mrkr1,*mrkr2;
	parsedURI * parseduri;
	
	if (URI==NULL) return (NULL);
	
	parseduri=(parsedURI *)malloc(sizeof(parsedURI));
	if (parseduri==NULL) return (NULL);
	
	/* null all the members so the failure path can free them blindly */
	nullparsedURI(parseduri);
	
	/* start with the scheme: a letter, then letters, digits, '+', '-', '.' */
	mrkr1=URI;
	if (!is_alpha(*mrkr1)) goto fail;
	mrkr1++;
	while (is_alpha(*mrkr1) || is_digit(*mrkr1) ||
	*mrkr1=='+' || *mrkr1=='-' || *mrkr1=='.') mrkr1++;
	
	/* the scheme must end in ':' and something must follow it */
	if (*mrkr1!=':' || *(mrkr1+1)=='\0') goto fail;
	
	/* a valid scheme has been found (for now) */
	parseduri->scheme=uri_dup_range(URI,mrkr1);
	if (parseduri->scheme==NULL) goto fail;
	
	/* convert scheme to lower-case */
	strlwr(parseduri->scheme);
	
	/* go on to the character after the ':' */
	mrkr1++;
	
	if (*mrkr1!='/')
	{
		/* no slash after the colon. this means the rest of the URI is opaque PROVIDED the rest
		of the URI does not contain any illegal characters */
		mrkr2=mrkr1;
		while (*mrkr2!='\0' && (is_reserved(*mrkr2)||is_unreserved(*mrkr2)||is_escaped(mrkr2))) mrkr2++;
		if (*mrkr2!='\0') goto fail; /* oops...an illegal character */
		
		/* valid opaque_part; no query is extracted for this shape */
		parseduri->opaque_part=uri_dup_range(mrkr1,mrkr2);
		if (parseduri->opaque_part==NULL) goto fail;
		return (parseduri);
	}
	
	/* hierarchical part: step over the first slash */
	mrkr1++;
	if (*mrkr1=='\0'||*mrkr1=='?') goto fail; /* nothing else...invalid. */
	
	if (*mrkr1!='/')
	{
		/* no net path: everything up to the query is the abspath */
		mrkr2=mrkr1;
		while (*mrkr2!='\0' && *mrkr2!='?') mrkr2++;
		parseduri->abspathsegs=verify_chop_abspath(mrkr1,mrkr2-1);
		if (!parseduri->abspathsegs) goto fail; /* invalid abspath */
	}
	else
	{
		/* possibly a net path: step over the second slash */
		mrkr1++;
		if (*mrkr1=='\0'||*mrkr1=='?') goto fail; /* nothing else...invalid. */
		
		/* the authority runs up to the next '/', '?' or the end */
		mrkr2=mrkr1;
		while (*mrkr2!='\0' && *mrkr2!='/' && *mrkr2!='?') mrkr2++;
		parseduri->autho=verify_fill_authority(mrkr1,mrkr2-1);
		if (!parseduri->autho) goto fail; /* invalid authority */
		
		/* do the abspath, if there is anything after the slash */
		if (*mrkr2=='/' && *(mrkr2+1)!='\0' && *(mrkr2+1)!='?')
		{
			mrkr1=++mrkr2;
			while (*mrkr2!='\0' && *mrkr2!='?') mrkr2++;
			parseduri->abspathsegs=verify_chop_abspath(mrkr1,mrkr2-1);
			if (!parseduri->abspathsegs) goto fail; /* invalid abspath */
		}
	}
	
	/* time for the query: whatever follows the first '?', if non-empty */
	mrkr1=strchr(URI,'?');
	if (mrkr1 && *(mrkr1+1)!='\0')
	{
		mrkr1++;
		mrkr2=mrkr1;
		while (*mrkr2!='\0' && (is_reserved(*mrkr2)||is_unreserved(*mrkr2)||is_escaped(mrkr2))) mrkr2++;
		if (*mrkr2!='\0') goto fail; /* invalid query */
		
		/* allocate for and copy the query */
		parseduri->query=uri_dup_range(mrkr1,mrkr2);
		if (parseduri->query==NULL) goto fail;
	}
	return (parseduri);
	
fail:
	/* single cleanup path: releases whatever was built so far */
	freeparsedURI(parseduri);
	return (NULL);
}
Filename: uri.h

Code:
#ifndef _URI_H

#define _URI_H

/* The authority component of a URI.  Either reg_name alone is set, or
   host is set with userinfo and port optionally alongside it. */
typedef struct _authority
{
	char *         reg_name;  /* whole authority text when it is not <userinfo>@<host>:<port> */
	char *         userinfo;  /* text before the '@', NULL if none */
	char *         host;      /* host name, NULL when reg_name is used */
	unsigned short port;      /* set to zero if no port specified */
} authority;

/* The result of parseURI().  Members that do not apply to a given URI
   are left NULL. */
typedef struct _parsedURI
{
	char *      scheme;       /* scheme name, lower-cased */
	char *      opaque_part;  /* text after "<scheme>:" when it does not begin with '/' */
	authority * autho;        /* authority, present for "<scheme>://..." URIs */
	char **     abspathsegs;  /* NULL-terminated array of path segments */
	char *      query;        /* text after the '?', without the '?' itself */
} parsedURI;

/* general purpose functions */
/* NOTE(review): these prototypes use `bool`, but no <stdbool.h> include is
   visible in this header - confirm how the file is meant to be compiled. */

/* in-place case conversion of a NUL-terminated string */
void strlwr(char * c);
void strupr(char * c);

/* frees each element of a NULL-terminated pointer array (not the array) */
void freeptrs(void ** ptrs);

/* single-character classification */
bool is_reserved(char c);
bool is_mark(char c);
bool is_upalpha(char c);
bool is_lowalpha(char c);
bool is_digit(char c);
bool is_alpha(char c);
bool is_alphanum(char c);
bool is_unreserved(char c);
bool is_hex(char c);

/* true when c points at '%' followed by two hex digits */
bool is_escaped(char * c);

/* freeauthority releases the members only; freeparsedURI releases the
   members and the struct itself */
void freeauthority(authority * authority);
void freeparsedURI(parsedURI * parseduri);

/* set every member to NULL / 0 without freeing anything */
void nullparsedURI(parsedURI * parseduri);
void nullauthority(authority * authority);

/* validate and split the inclusive span [firstchar,lastchar];
   both return NULL when the span is empty or invalid */
char ** verify_chop_abspath(char * firstchar,char * lastchar);
authority * verify_fill_authority(char * firstchar,char * lastchar);

/* parse a NUL-terminated URI; returns NULL when it is not valid */
parsedURI * parseURI(char * URI);
#endif
Filename: example.c
Code:
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "uri.h"


/* Example driver: reads one whitespace-delimited URI from standard input,
   parses it and prints every component that was found.
   Returns 0 after a parse attempt and 1 when no input could be read. */
int main(void)
{
	parsedURI * parseduri;
	char buff[100];
	
	printf("Enter URI:");
	
	/* %99s leaves room for the terminator, so long input cannot overrun buff */
	if (scanf("%99s",buff)!=1)
	{
		printf("URI was not parsed\n");
		return(1);
	}
	
	parseduri=parseURI(buff);
	if (parseduri)
	{
		printf("URI was parsed\n");
		if (parseduri->scheme)
		{
			printf("scheme:");
			puts(parseduri->scheme);
		}
		if (parseduri->opaque_part)
		{
			printf("opaque_part:");
			puts(parseduri->opaque_part);
		}
		if (parseduri->autho)
		{
			if (parseduri->autho->reg_name)
			{
				printf("reg_name:");
				puts(parseduri->autho->reg_name);
			}
			if (parseduri->autho->userinfo)
			{
				printf("userinfo:");
				puts(parseduri->autho->userinfo);
			}
			if (parseduri->autho->host)
			{
				printf("host:");
				puts(parseduri->autho->host);
			}
			/* a port of zero means none was given */
			if (parseduri->autho->port)
			{
				printf("port:%hu\n",parseduri->autho->port);
			}
		}
		if (parseduri->abspathsegs)
		{
			/* the segment array is NULL-terminated */
			char ** curseg=parseduri->abspathsegs;
			while (*curseg)
			{
				printf("Absolute path segment:");
				puts(*curseg);
				curseg++;
			}
		}
		if (parseduri->query)
		{
			printf("query:");
			puts(parseduri->query);
		}	
	}
	else
	{
		printf("URI was not parsed\n");
	}
	
	/* freeparsedURI ignores a NULL argument, so no guard is needed */
	freeparsedURI(parseduri);
	return(0);
}
 
Old 05-03-2004, 10:56 AM   #2
infamous41md
Member
 
Registered: Mar 2003
Posts: 804

Rep: Reputation: 30
nice! I'm going to check it out later tonight after class
 
  


Reply



Posting Rules
You may not post new threads
You may not post replies
You may not post attachments
You may not edit your posts

BB code is On
Smilies are On
[IMG] code is Off
HTML code is Off



Similar Threads
Thread Thread Starter Forum Replies Last Post
checking for XML::Parser... configure: error: XML::Parser perl module is required for kornerr Linux - General 11 11-16-2008 07:24 AM
CUPS: What URI to use when a username is needed without a password elluva Linux - Networking 0 02-26-2005 10:19 AM
Squid external_acl with Request-URI Trano Linux - Software 0 01-20-2005 10:38 PM
CUPS --> URI settings carboncopy Slackware 1 08-13-2003 12:48 AM
konqueror: how to send current page location and title in uri?? grease Linux - General 2 06-07-2003 10:40 PM

LinuxQuestions.org > Forums > Non-*NIX Forums > Programming

All times are GMT -5. The time now is 12:09 AM.

Main Menu
Advertisement
My LQ
Write for LQ
LinuxQuestions.org is looking for people interested in writing Editorials, Articles, Reviews, and more. If you'd like to contribute content, let us know.
Main Menu
Syndicate
RSS1  Latest Threads
RSS1  LQ News
Twitter: @linuxquestions
Open Source Consulting | Domain Registration