[SOLVED] need help to further *optimize* my small C program that produces absolute path

konsolebox · 08-10-2009, 08:22 AM

it's sourced or can be called if it's the main caller.

konsolebox · 08-11-2009, 01:15 AM

Quote:

Originally Posted by ntubski

If we're still in the micro-optimization game

I think I still am.

Quote:

Originally Posted by ntubski

, then I would suggest optimizing for the common case: directories which are not "." or "..". This way for most of the time the code doesn't have to look out for dots, just slash and end of string. EDIT: And by putting a '/' at the end of workBuffer a lot of checks for end of string can be removed. By the way, I uncovered a bug when I tried this out:

Code:

 workBufferSize += cwdLength  + 1; // +1 is needed for the '/' between cwd and sourceArg

Code:

    // ensure that workBuffer ends in "/", to reduce checks for '\0'
    // exercise for reader: rewrite so realloc isn't needed
    endWithSlash = (sourceArg[sourceArgSize-2] == '/');
    if (!endWithSlash) {
        workBufferSize += 1;
        workBuffer = realloc(workBuffer, workBufferSize);
        workBuffer[workBufferSize-2] = '/';
        workBuffer[workBufferSize-1] = '\0';
    }

    t = 0;

    for (p = workBuffer + 1; *p; ++p) {
        if (p[0] == '.') {
            if (p[1] == '/') {
                // "." means same dir
                p += 1;
                continue;
            } else if (p[1] == '.') {
                if (p[2] == '/') {
                    // ".." means go up one dir
                    if (t > 0) --t;
                    p += 2;
                    continue;
                }
            }
        } else if (p[0] == '/') {
            continue;
        }

        // not "." or ".."
        CHECK_TOKENS(t);
        tokens[t++] = p;
        do {
            ++p;
        } while (p[0] != '/');
        p[0] = '\0';
    }

    for (i = 0; i < t; i++) {
        putchar('/');
        fputs(tokens[i], stdout);
    }
    if (endWithSlash) putchar('/');
    else if (t == 0) fputs("/.", stdout);
    putchar('\n');

I tried to incorporate your changes to the original code but there was something that did not work right. When I tried to do something like:

Code:

./getabspath ../abc

the code will only print out <parent>/a

But anyway the suggestion is helpful and I'm still finding a way how to use it to improve the code although I still can't.

However also, when I was looking at the original code, I found a solution in which the code will no longer need malloc()s. That's by immediately binding the tokens of the cwd to the tokens array. When I made the changes, I also included some of the tweaks that was recommended in this thread. Here's my current code:

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CWDBUFFERSIZE  512
#define MAXTOKENS      64
#define CHECKTOKENS

void reportMaxTokensReached (void);

int main (int argc, char ** argv) {
	register char * p;

	char * token;
	char * tokens [MAXTOKENS];
	unsigned int t;

	char cwd [CWDBUFFERSIZE];
	unsigned int cwdLength;

	unsigned int dotsCount;
	unsigned int hasNonDots;

	unsigned int i;

	#ifdef CHECKTOKENS
		#define TOKENSPLUS(T) \
			if (t == MAXTOKENS) { \
				reportMaxTokensReached(); \
				return 1; \
			} \
			tokens[t++] = T;
	#else
		#define TOKENSPLUS(T) \
			tokens[t++] = T;
	#endif

 	if (argc == 1) 	{
		if (getcwd(cwd, CWDBUFFERSIZE)) {
			puts((char const *) cwd);
			return 0;
		} else {
			return 1;
		}
	}

	t = 0;

	if (*argv[1] != '/') {
		if (! getcwd(cwd, CWDBUFFERSIZE))
			return 1;

		if (cwd[1]) {
			tokens[0] = p = cwd + 1;
			t++;

			while (*++p) {
				if (*p == '/') {
					*p = '\0';
					TOKENSPLUS(++p);	// everything after '/' is certainly not '\0'
				}
			}
		}
	}

	dotsCount = 0;
	hasNonDots = 0;

	for (p = token = argv[1]; *p; ++p) {
		switch (*p) {
		case '/':
			if (hasNonDots) {
				TOKENSPLUS(token);
			} else if (dotsCount > 1) {
				if (dotsCount == 2) {
					if (t)
						--t;
				} else {
					TOKENSPLUS(token);
				}
			}

			dotsCount = 0;
			hasNonDots = 0;
			*p = '\0';
			token = p + 1;
			continue;
		case '.':
			++dotsCount;
			continue;
		default:
			++hasNonDots;
		}
	}

	if (hasNonDots) {
		TOKENSPLUS(token);
	} else if (dotsCount > 1) {
		if (dotsCount == 2) {
			if (t)
				--t;
		} else {
			TOKENSPLUS(token);
		}
	}

	if (p[-1]) {
		// last character is not a forward slash ('/')

		if (t) {
			for (i = 0; i < t; i++) {
				putchar('/');
				fputs(tokens[i], stdout);
			}

			puts("");
		} else {
			puts("/.");
		}
	} else {
		if (t) {
			for (i = 0; i < t; i++) {
				putchar('/');
				fputs(tokens[i], stdout);
			}
		}

		puts("/");
	}

	return 0;
}

void reportMaxTokensReached (void) {
	printf("path has too many tokens.\n");
}

Cheers.

ntubski · 08-11-2009, 05:58 PM

Quote:

Originally Posted by konsolebox

the code will only print out <parent>/a

Did you fix the bug I pointed out?

Quote:

Code:

	#ifdef CHECKTOKENS
		#define TOKENSPLUS(T) \
			if (t == MAXTOKENS) { \
				reportMaxTokensReached(); \
				return 1; \
			} \
			tokens[t++] = T;
	#else
		#define TOKENSPLUS(T) \
			tokens[t++] = T;
	#endif

Still don't believe in inline functions?

If you want to avoid malloc'ing and checking the length of argv[1], the trick with the slash doesn't work, but here's the code with the other changes.

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CWDBUFFERSIZE  512
#define MAXTOKENS      64
#define CHECKTOKENS

#ifdef CHECKTOKENS
static void reportMaxTokensReached (int num_tokens) {
    if (num_tokens >= MAXTOKENS) {
        fprintf(stderr, "path has too many tokens.\n");
        exit(1);
    }
}
#  define CHECK_TOKENS(t) reportMaxTokensReached(t)
#else
#  define CHECK_TOKENS(t)
#endif

int main (int argc, char ** argv) {
    register char *p;
    char const *tokens [MAXTOKENS];
    unsigned int t;

    char cwd [CWDBUFFERSIZE];

    int endWithSlash;

    unsigned int i;

#define TOKENSPLUS(T) CHECK_TOKENS(t); tokens[t++] = T;

    if (argc == 1) {
        if (getcwd(cwd, CWDBUFFERSIZE)) {
            puts((char const *) cwd);
            return 0;
        } else {
            return 1;
        }
    }

    t = 0;

    if (*argv[1] != '/') {
        if (! getcwd(cwd, CWDBUFFERSIZE))
            return 1;

        if (cwd[1]) {
            tokens[0] = p = cwd + 1;
            t++;

            while (*++p) {
                if (*p == '/') {
                    *p = '\0';
                    TOKENSPLUS(++p);// everything after '/' is certainly not '\0'
                }
            }
        }
    }

    endWithSlash = 1;

    for (p = argv[1]; *p; ++p) {
        char const*const token = p;
        if (*p == '.') {
            ++p;
            if (*p == '.') {
                ++p;
                if ((*p == '\0' || *p == '/') && t > 0) {
                    --t;
                }
            }
        }
        if (*p == '\0') {
            endWithSlash = 0;
            break;
        }
        if (*p == '/') continue;

        // not "." or ".."
        TOKENSPLUS(token);
        for (;;) {
            ++p;
            if (*p == '/')  {
                *p = '\0';
                break;
            } else if (*p == '\0') {
                endWithSlash = 0;
                goto done_tokenizing; // break outer loop
            }
        }
    }
 done_tokenizing:

    for (i = 0; i < t; i++) {
        putchar('/');
        fputs(tokens[i], stdout);
    }
    puts(endWithSlash? "/" 
         : (t > 0)? "" 
         : "/.");

    return 0;
}

konsolebox · 08-13-2009, 04:07 AM

Quote:

Originally Posted by ntubski

Did you fix the bug I pointed out?

You mean realloc() or the one I just said?

Quote:

Originally Posted by ntubski

Still don't believe in inline functions?

I'm just not used to it and I was a bit of in a hurry that time.

Quote:

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CWDBUFFERSIZE  512
#define MAXTOKENS      64
#define CHECKTOKENS

#ifdef CHECKTOKENS
static void reportMaxTokensReached (int num_tokens) {
    if (num_tokens >= MAXTOKENS) {
        fprintf(stderr, "path has too many tokens.\n");
        exit(1);
    }
}
#  define CHECK_TOKENS(t) reportMaxTokensReached(t)
#else
#  define CHECK_TOKENS(t)
#endif

int main (int argc, char ** argv) {
    register char *p;
    char const *tokens [MAXTOKENS];
    unsigned int t;

    char cwd [CWDBUFFERSIZE];

    int endWithSlash;

    unsigned int i;

#define TOKENSPLUS(T) CHECK_TOKENS(t); tokens[t++] = T;

    if (argc == 1) {
        if (getcwd(cwd, CWDBUFFERSIZE)) {
            puts((char const *) cwd);
            return 0;
        } else {
            return 1;
        }
    }

    t = 0;

    if (*argv[1] != '/') {
        if (! getcwd(cwd, CWDBUFFERSIZE))
            return 1;

        if (cwd[1]) {
            tokens[0] = p = cwd + 1;
            t++;

            while (*++p) {
                if (*p == '/') {
                    *p = '\0';
                    TOKENSPLUS(++p);// everything after '/' is certainly not '\0'
                }
            }
        }
    }

    endWithSlash = 1;

    for (p = argv[1]; *p; ++p) {
        char const*const token = p;
        if (*p == '.') {
            ++p;
            if (*p == '.') {
                ++p;
                if ((*p == '\0' || *p == '/') && t > 0) {
                    --t;
                }
            }
        }
        if (*p == '\0') {
            endWithSlash = 0;
            break;
        }
        if (*p == '/') continue;

        // not "." or ".."
        TOKENSPLUS(token);
        for (;;) {
            ++p;
            if (*p == '/')  {
                *p = '\0';
                break;
            } else if (*p == '\0') {
                endWithSlash = 0;
                goto done_tokenizing; // break outer loop
            }
        }
    }
 done_tokenizing:

    for (i = 0; i < t; i++) {
        putchar('/');
        fputs(tokens[i], stdout);
    }
    puts(endWithSlash? "/" 
         : (t > 0)? "" 
         : "/.");

    return 0;
}

If we really want to use goto statement there are lots of them that's waiting to be included. Lately I've been so curious with the compiler's output that I've been chasing its optimized outputs with my code and now I'm wondering why strcpy() is not expanding as an internal builtin even if I directly use __builtin_strcpy() and set the compilation to include new instructions like sse, sse2, sse3, ssse3, and sse4 and also use the -fhosted option. I don't know why it doesn't expand to a non-call set of instructions while strlen() expands perfectly. If I'll be able to know how to expand this routine to a builtin, it will not only help me with the code but also help me with the optimization of my whole system. It will also be helpful to the future programs that I might make.

Currently, my plan is to install many versions of gcc to my system especially newer ones like 4.4.1 and 4.5 if possible but I need optimize my system first before it.

About the new code, I'll have a copy of it again and look at it later when I have extra time or when I'm studying the compiler.

I haven't forgotten the scripts by the way. The internals of the script is already complete as expected and no longer require any revision but sadly the interface (callable functions and options) did not, so I thought maybe I need to rest for a while and try to finish it again later when I'm ready. Also I already have choices for the license or agreement that I'll use. Probably it's CC's attribution 3.0, CC0 1.0 or simply just public domain. I'll choose after I have a complete review of the three.

konsolebox · 08-14-2009, 04:23 AM

Quote:

Originally Posted by ntubski

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CWDBUFFERSIZE  512
#define MAXTOKENS      64
#define CHECKTOKENS

#ifdef CHECKTOKENS
static void reportMaxTokensReached (int num_tokens) {
    if (num_tokens >= MAXTOKENS) {
        fprintf(stderr, "path has too many tokens.\n");
        exit(1);
    }
}
#  define CHECK_TOKENS(t) reportMaxTokensReached(t)
#else
#  define CHECK_TOKENS(t)
#endif

int main (int argc, char ** argv) {
    register char *p;
    char const *tokens [MAXTOKENS];
    unsigned int t;

    char cwd [CWDBUFFERSIZE];

    int endWithSlash;

    unsigned int i;

#define TOKENSPLUS(T) CHECK_TOKENS(t); tokens[t++] = T;

    if (argc == 1) {
        if (getcwd(cwd, CWDBUFFERSIZE)) {
            puts((char const *) cwd);
            return 0;
        } else {
            return 1;
        }
    }

    t = 0;

    if (*argv[1] != '/') {
        if (! getcwd(cwd, CWDBUFFERSIZE))
            return 1;

        if (cwd[1]) {
            tokens[0] = p = cwd + 1;
            t++;

            while (*++p) {
                if (*p == '/') {
                    *p = '\0';
                    TOKENSPLUS(++p);// everything after '/' is certainly not '\0'
                }
            }
        }
    }

    endWithSlash = 1;

    for (p = argv[1]; *p; ++p) {
        char const*const token = p;
        if (*p == '.') {
            ++p;
            if (*p == '.') {
                ++p;
                if ((*p == '\0' || *p == '/') && t > 0) {
                    --t;
                }
            }
        }
        if (*p == '\0') {
            endWithSlash = 0;
            break;
        }
        if (*p == '/') continue;

        // not "." or ".."
        TOKENSPLUS(token);
        for (;;) {
            ++p;
            if (*p == '/')  {
                *p = '\0';
                break;
            } else if (*p == '\0') {
                endWithSlash = 0;
                goto done_tokenizing; // break outer loop
            }
        }
    }
 done_tokenizing:

    for (i = 0; i < t; i++) {
        putchar('/');
        fputs(tokens[i], stdout);
    }
    puts(endWithSlash? "/" 
         : (t > 0)? "" 
         : "/.");

    return 0;
}

I didn't notice the changes immediately. You've made the code so much simpler. What happened to the two variables? Dude you really are a genius.

. Thank you very much!

ntubski · 08-14-2009, 02:51 PM

Quote:

Originally Posted by konsolebox

You mean realloc() or the one I just said?

I mean the missing + 1 in

Code:

workBufferSize += (cwdLength = strlen(cwd));

I realize I posted a different version of this in my earlier post, oops.

Quote:

If we really want to use goto statement there are lots of them that's waiting to be included.

The goto statement is not there for optimization, if that's what you're thinking. C lacks something like Java's labeled break, so I use goto with a comment. I don't think there is a clearer way of breaking out of nested loops.

Quote:

Lately I've been so curious with the compiler's output that I've been chasing its optimized outputs with my code and now I'm wondering why strcpy() is not expanding as an internal builtin even if I directly use __builtin_strcpy() and set the compilation to include new instructions like sse, sse2, sse3, ssse3, and sse4 and also use the -fhosted option. I don't know why it doesn't expand to a non-call set of instructions while strlen() expands perfectly.

According to http://en.wikibooks.org/wiki/GNU_C_C..._Architecture:

Quote:

In case of strcpy(), the compiler checks the size argument and uses the optimized built-in version of strcpy() if the argument is constant.

I'm not very familiar with sse instructions, but don't they work in blocks of 16 bytes at a time? I don't think they would be useful unless the string length is guaranteed to be divisible by 16.

Quote:

What happened to the two variables?

Basically you had a Finite state machine where the state was in dotsCount and hasNonDots variables. I just moved the state into the instruction pointer. Like going from 1.2 Automata based style program to 1.1 Traditional (imperative) program.

konsolebox · 08-18-2009, 12:10 PM

Quote:

Originally Posted by ntubski

The goto statement is not there for optimization, if that's what you're thinking. C lacks something like Java's labeled break, so I use goto with a comment. I don't think there is a clearer way of breaking out of nested loops.

Yes I also thought so as I was also looking for something like 'break 2'.

Quote:

Originally Posted by ntubski

According to http://en.wikibooks.org/wiki/GNU_C_C..._Architecture:
I'm not very familiar with sse instructions, but don't they work in blocks of 16 bytes at a time? I don't think they would be useful unless the string length is guaranteed to be divisible by 16.

Not sure if that's really the case since builtins of memcpy() and strlen() work and they don't depend on 16 bytes blocks.

Quote:

Originally Posted by ntubski

Basically you had a Finite state machine where the state was in dotsCount and hasNonDots variables. I just moved the state into the instruction pointer. Like going from 1.2 Automata based style program to 1.1 Traditional (imperative) program.

But still you made a big step for optimizing the code

.

konsolebox · 08-18-2009, 12:38 PM

Final prototype? Sorry I can't help but use gotos here

Code:

/* getabspath.c
 *
 * The program gets a path from an argument and prints its absolute form to the
 * standard output with no newline.  If no argument was passed, the program
 * prints the path of the current working directory.
 *
 * The code was created with speed in mind and is best compiled with -Os.
 *
 * Thanks go to the following people in linuxquestions.org:
 *   johnsfine for technical discussions regarding compilers esp. optimizations,
 *   SciYro, Sergei Steshenko, and catkin for comments and suggestions,
 *   wje_lq for referring links to helpful documents and
 *   ntubski for suggestions, simplifications and enhancements
 *
 * Arranged by konsolebox
 * Copyright Free / Public Domain
 * Aug. 17, 2009
 */


/* CONFIGURATIONS */

#define CONFIG_CWDBUFFERSIZE 512
#define CONFIG_MAXTOKENS 64
#define CONFIG_CHECKTOKENS

/* if set, gather all tokens to a single string then print it;
   write() or fwrite() will be used instead of fputs() */

#define CONFIG_ONESHOT

/* use fwrite() instead of write() */

#undef CONFIG_USEFWRITE

/* use fputs() instead of fwrite() or write() in static null-terminated strings */

#undef CONFIG_USEFPUTS

/* use variable length arrays instead of alloca() and malloc() */

#define CONFIG_USEVARLENGTHARRAYS

/* if CONFIG_USEVARLENGTHARRAYS is not set, use alloca() instead of malloc() */

#define CONFIG_USEALLOCA

/* if one of CONFIG_USEVARLENGTHARRAYS and CONFIG_USEALLOCA is set,
   check the size of the buffer before allocating */

#define CONFIG_MAXOUTPUTBUFFERSIZE 1024
#define CONFIG_CHECKOUTPUTBUFFERSIZE

/* use strcpy() instead of memcpy() */

#undef CONFIG_USESTRCPY


/* REFERENCE OF DEFAULTS */

// CONFIG_CWDBUFFERSIZE         512
// CONFIG_MAXTOKENS             64
// CONFIG_CHECKTOKENS           defined
// CONFIG_ONESHOT               defined
// CONFIG_USEFWRITE             undefined
// CONFIG_USEFPUTS              undefined
// CONFIG_USEVARLENGTHARRAYS    defined
// CONFIG_USEALLOCA             defined
// CONFIG_MAXOUTPUTBUFFERSIZE   1024
// CONFIG_CHECKOUTPUTBUFFERSIZE defined
// CONFIG_USESTRCPY             undefined


/* APPLICATION */

#include <unistd.h>  // getcwd(), write()

#if defined(CONFIG_ONESHOT)
#	include <string.h>  // strcpy(), memcpy()
#endif

#if (defined(CONFIG_ONESHOT) && !defined(CONFIG_USEVARLENGTHARRAYS) && !defined(CONFIG_USEALLOCA))
#	include <stdlib.h>  // malloc(), free()
#endif

#if (!defined(CONFIG_ONESHOT) || defined(CONFIG_USEFWRITE) || defined(CONFIG_USEFPUTS))
#	include <stdio.h>  // fputs(), fwrite(), stdout, stderr
#endif

#ifdef CONFIG_USEFWRITE
#	define STDOUT stdout
#	define STDERR stderr
#	define WRITE(F, T, S) fwrite((const void *) T, S, 1, F);
#else
#	define STDOUT 1
#	define STDERR 2
#	define WRITE(F, T, S) write(F, (const void *) T, S);
#endif

#ifdef CONFIG_USEFPUTS
#	define PRINT(T) fputs(T, stdout);
#	define REPORTFAILURE(T) fputs(T, stderr);
#else
#	define PRINT(T) WRITE(STDOUT, T, (sizeof(T) - sizeof((char) '\0')));
#	define REPORTFAILURE(T) WRITE(STDERR, T, (sizeof(T) - sizeof((char) '\0')));
#endif


int main (int argc, char ** argv) {
	register char * p;
	char * tokens [CONFIG_MAXTOKENS];
	unsigned int t;
	char * base;
	char cwd [CONFIG_CWDBUFFERSIZE];
	int endWithSlash;  // register_or_static int ...;

	/* just print the current directory if no argument was passed
       or if argument is empty */

	if (argc < 2 || *(base = argv[1]) == '\0') {
		if (getcwd(cwd, CONFIG_CWDBUFFERSIZE) == 0x00)
			goto failedToGetCwd;

		register unsigned int l = strlen(cwd);

		if (l == 1) {
			WRITE(STDOUT, "/.", 2);
			goto return0;
		}

		WRITE(STDOUT, cwd, l);
		goto return0;
	}

	/* split everything to tokens */

	{

#ifdef CONFIG_CHECKTOKENS
#	define TOKENSPLUS(T) \
			if (t == CONFIG_MAXTOKENS) \
				goto failOnTooManyTokens; \
			tokens[t++] = T;
#else
#	define TOKENSPLUS(T) \
			tokens[t++] = T;
#endif

		t = 0;
		endWithSlash = 1;

		p = base;
		register char c = *p;

		if (c != '/') {
			/* path is relative; include the current directory */

			if (getcwd(cwd, CONFIG_CWDBUFFERSIZE) == 0x00)
				goto failedToGetCwd;

			if (cwd[1]) {
				// push p; push c;

				tokens[0] = p = &cwd[1];
				++t;

				for (;;) {
					c = *++p;

					if (c == '\0')
						break;

					if (c != '/')
						continue;

					*p = '\0';

					/* since everything after '/' is always not '\0' ... */

					TOKENSPLUS(++p);

					continue;
				}

				/* would have been optional if another set of registers
				   was used above but it's ok */

				// pop c; pop p;

				p = base;
				c = *p;

				goto tokenLoop_dotCheck;
			}

			goto tokenLoop_dotCheck;
		}

	tokenLoop_start:

		c = *++p;

		if (c == '\0')
			goto tokenLoop_finish;

		if (c == '/')
			goto tokenLoop_start;

	tokenLoop_dotCheck:

		if (c == '.') {
			c = *++p;

			if (c == '/')
				goto tokenLoop_start;

			if (c == '\0')
				goto tokenLoop_finish_noEndSlash;

			if (c == '.') {
				c = *++p;

				if (c == '/') {
					if (t)
						--t;
					goto tokenLoop_start;
				}

				if (c == '\0') {
					if (t)
						--t;
					goto tokenLoop_finish_noEndSlash;
				}

				TOKENSPLUS(&p[-2]);

				goto tokenLoop_skipPlus;
			}

			TOKENSPLUS(&p[-1]);

			goto tokenLoop_skipPlus;
		}

		TOKENSPLUS(p);

	tokenLoop_skipPlus:

		do {
			c = *++p;

			if (c == '/') {
				*p = '\0';
				goto tokenLoop_start;
			}
		} while (c);

	tokenLoop_finish_noEndSlash:

		endWithSlash = 0;

	tokenLoop_finish:
		;
	}

	/* send to stdout */

	if (t) {
#if defined(CONFIG_ONESHOT)
		/* recombine all the tokens to an output buffer and print it */

#	if defined(CONFIG_USEVARLENGTHARRAYS)
		unsigned int tokens_length [t];

#	elif defined(CONFIG_USEALLOCA)
		unsigned int * tokens_length = alloca(t);

#	else
		unsigned int tokens_length [CONFIG_MAXTOKENS];

#	endif

		unsigned int i = 0;
		unsigned int outputBufferSize = 0;

		do {
			register unsigned int l = strlen(tokens[i]);
			tokens_length[i] = l;
			outputBufferSize += l;
		} while (++i < t);

#	ifdef CONFIG_USESTRCPY
		outputBufferSize += t + 2;
		  // t * sizeof((char) '/') [+ sizeof((char) '/')] + sizeof((char) '/0')
#	else
		outputBufferSize += t + 1;
		  // t * sizeof((char) '/') [+ sizeof((char) '/'])
#	endif

#if (defined(CONFIG_CHECKOUTPUTBUFFERSIZE) && (defined(CONFIG_USEVARLENGTHARRAYS) || defined(CONFIG_USEALLOCA)))
		if (outputBufferSize > CONFIG_MAXOUTPUTBUFFERSIZE)
			goto failOnLargeOutputBufferSize;
#endif

#	if defined(CONFIG_USEVARLENGTHARRAYS)
		char outputBuffer[outputBufferSize];

#	elif defined(CONFIG_USEALLOCA)
		char * outputBuffer = alloca(outputBufferSize);

#	else
#		define FLAG_USINGMALLOC

		char * outputBuffer;

		if ((outputBuffer = (char *) malloc(outputBufferSize)) == 0x00)
			goto failedToMallocOutputBuffer;
#	endif

		i = 0;
		p = outputBuffer;

		do {
			*p++ = '/';
			register unsigned int l = tokens_length[i];

#	ifdef CONFIG_USESTRCPY
			strcpy(p, tokens[i]);
#	else
			memcpy(p, tokens[i], l);
#	endif

			p += l;
		} while (++i < t);

		if (endWithSlash) {
			*p++ = '/';
		}

		WRITE(STDOUT, outputBuffer, (p - outputBuffer));

#	ifdef FLAG_USINGMALLOC
		free(outputBuffer);
#	endif

#else
		unsigned int i = 0;

		do {
			p = tokens[i];

			if (p == base) {
				fputs("/", stdout);
				fputs(p, stdout);
				continue;
			}

			--p;
			*p = '/';
			fputs(p, stdout);
		} while (++i < t);

		if (endWithSlash == 0)
			goto return0;

		fputs("/", stdout);
#endif

		goto return0;
	}

	if (endWithSlash) {
		PRINT("/");
		goto return0;
	}

	PRINT("/.");
	goto return0;

failedToGetCwd:
	REPORTFAILURE("failed to get current path.\n");
	goto return1;

failOnTooManyTokens:
	REPORTFAILURE("argument expands to too many tokens.\n");
#ifdef CONFIG_CHECKOUTPUTBUFFERSIZE
	goto return1;

failOnLargeOutputBufferSize:
	REPORTFAILURE("argument expands to too many tokens.\n");
#endif
#ifdef FLAG_USINGMALLOC
	goto return1;

failedToMallocOutputBuffer:
	REPORTFAILURE("unable to allocate space for output buffer\n");
#endif

return1:
	return 1;

return0:
	return 0;
}

I can't really use inlines as they are not supported by default in compilers. It's better this way and it's still the same anyway so it's ok.

Thanks for the help everyone. Finally. Now this is what I meant

.

ntubski · 08-18-2009, 06:19 PM

Quote:

Not sure if that's really the case since builtins of memcpy() and strlen() work and they don't depend on 16 bytes blocks.

My point was not that they can't be inlined unless 16 byte blocks are used, my point was that the inlined code probably won't make use of sse instructions since it has no guarantees that the data is in 16 byte blocks. And when I lookat the code generated by the __builtin_strlen and __builtin_memcpy they don't use sse instructions. I also noticed that __builtin_memcpy wasn't expanded unless I passed a constant for the length argument.

Quote:

* The code was created with speed in mind and is best compiled with -Os.

But -Os prefers small size over speed...

Quote:

Sorry I can't help but use gotos here

I think you've made things needlessly obfuscated, and I'm skeptical as to whether there are any speed benefits from this. Since the program finishes so quickly there's no way to perform useful benchmarks anyways.

Quote:

Now this is what I meant

.

This code just makes me sad.

konsolebox · 08-18-2009, 11:55 PM

Quote:

Originally Posted by ntubski

My point was not that they can't be inlined unless 16 byte blocks are used, my point was that the inlined code probably won't make use of sse instructions since it has no guarantees that the data is in 16 byte blocks. And when I lookat the code generated by the __builtin_strlen and __builtin_memcpy they don't use sse instructions. I also noticed that __builtin_memcpy wasn't expanded unless I passed a constant for the length argument.

I saw it expand with an instruction like REPZ? Is it not a SSE instruction?

Quote:

Originally Posted by ntubski

But -Os prefers small size over speed...

The gotos are there so that there will be the same output as -O2 even though -O2 is not used. With this, the code will no longer be dependent to optimizations. I would have still preferred -O2 but there is a problem with it. I think it should no longer repeat same sequences of return statements as it is not really critical with speed; it will only make the code bigger. -Os here in this program I think will mean more speed and not lesser size.

Quote:

I think you've made things needlessly obfuscated, and I'm skeptical as to whether there are any speed benefits from this. Since the program finishes so quickly there's no way to perform useful benchmarks anyways.

This code just makes me sad.

I hope you reconsider this with my explanation above. I just want to make it *explicitly* fast as much as possible and not depend with compilers. If it's wrong, I'll try revising the code again.

ntubski · 08-20-2009, 01:49 PM

Quote:

Originally Posted by konsolebox

I saw it expand with an instruction like REPZ? Is it not a SSE instruction?

nope.

Quote:

I just want to make it *explicitly* fast as much as possible and not depend with compilers.

You're solving a problem you don't have: the entire program takes close to 0 seconds to run, these "optimizations" can therefore at most save roughly 0 seconds of runtime, so it's obviously a bad tradeoff. Furthermore, since you can't measure accurately it's even possible that you have created a slower program.

konsolebox · 08-21-2009, 03:42 AM

Quote:

Originally Posted by ntubski

nope.

But how come it's available in -march=i686 and -march=pentium4 and not in generic? Regardless, memcpy and strlen still expands.

Quote:

Furthermore, since you can't measure accurately it's even possible that you have created a slower program.

Like in what part of the code was not accurate? We can always make it better if that's the only problem.

Quote:

You're solving a problem you don't have: the entire program takes close to 0 seconds to run, these "optimizations" can therefore at most save roughly 0 seconds of runtime, so it's obviously a bad tradeoff.

I think I'm no longer being reasonable. Perhaps. But we really can't be sure if the optimization of the program is not going to give any benefit since the program will be called many times in my script.

But regardless of that, I don't think that there's something really wrong that's going to happen if I don't revoke what it was that I have decided. I still think that it's just better to finish what it was that I started with what I have decided.

ntubski · 08-21-2009, 05:41 PM

Quote:

But how come it's available in -march=i686 and -march=pentium4 and not in generic? Regardless, memcpy and strlen still expands.

I guess various instructions have different efficiency depending on the cpu.

Code:

$ cat builtins.c
#include <stdio.h>

char str[80], str2[80];
int len;

int main()
{
    fgets(str, sizeof str, stdin);

    len = __builtin_strlen(str);
    __builtin_memcpy(str, str2, len);
    return 0;
}
$ gcc -g -march=pentium4 builtins.c -o builtins-pentium4 -O3
$ gcc -g -march=i386 builtins.c -o builtins-386 -O3
$ gcc -g -march=i686 builtins.c -o builtins-686 -O3
$ objdump --prefix-addresses -S builtins-pentium4 | less
...
    len = __builtin_strlen(str);
08048453 <main+0x2f> mov    $0x80496a0,%edi
08048458 <main+0x34> xor    %eax,%eax
0804845a <main+0x36> mov    $0xffffffff,%ecx
0804845f <main+0x3b> repnz scas %es:(%edi),%al
08048461 <main+0x3d> not    %ecx
08048463 <main+0x3f> sub    $0x1,%ecx
08048466 <main+0x42> mov    %ecx,0x8049680
    __builtin_memcpy(str, str2, len);
0804846c <main+0x48> mov    %ecx,0x8(%esp)
08048470 <main+0x4c> movl   $0x8049700,0x4(%esp)
08048478 <main+0x54> movl   $0x80496a0,(%esp)
0804847f <main+0x5b> call   08048358 <memcpy@plt>
...
$ objdump --prefix-addresses -S builtins-386 | less
...
    len = __builtin_strlen(str);
08048419 <main+0x25> mov    $0x8049660,%edx
0804841e <main+0x2a> mov    %edx,%edi
08048420 <main+0x2c> xor    %eax,%eax
08048422 <main+0x2e> mov    $0xffffffff,%ecx
08048427 <main+0x33> repnz scas %es:(%edi),%al
08048429 <main+0x35> not    %ecx
0804842b <main+0x37> dec    %ecx
0804842c <main+0x38> mov    %ecx,0x8049640
    __builtin_memcpy(str, str2, len);
08048432 <main+0x3e> mov    $0x80496c0,%esi
08048437 <main+0x43> mov    %edx,%edi
08048439 <main+0x45> rep movsb %ds:(%esi),%es:(%edi)
...
$ objdump --prefix-addresses -S builtins-686 | less
...
    len = __builtin_strlen(str);
0804845e <main+0x2e> mov    $0x80496e0,%ecx
08048463 <main+0x33> mov    (%ecx),%eax
08048465 <main+0x35> add    $0x4,%ecx
08048468 <main+0x38> lea    -0x1010101(%eax),%edx
0804846e <main+0x3e> not    %eax
08048470 <main+0x40> and    %eax,%edx
08048472 <main+0x42> and    $0x80808080,%edx
08048478 <main+0x48> je     08048463 <main+0x33>
0804847a <main+0x4a> mov    %edx,%eax
0804847c <main+0x4c> shr    $0x10,%eax
0804847f <main+0x4f> test   $0x8080,%edx
08048485 <main+0x55> cmove  %eax,%edx
08048488 <main+0x58> lea    0x2(%ecx),%eax
0804848b <main+0x5b> cmove  %eax,%ecx
0804848e <main+0x5e> add    %dl,%dl
08048490 <main+0x60> sbb    $0x3,%ecx
08048493 <main+0x63> sub    $0x80496e0,%ecx
08048499 <main+0x69> mov    %ecx,0x80496c0
    __builtin_memcpy(str, str2, len);
0804849f <main+0x6f> mov    %ecx,0x8(%esp)
080484a3 <main+0x73> movl   $0x8049740,0x4(%esp)
080484ab <main+0x7b> movl   $0x80496e0,(%esp)
080484b2 <main+0x82> call   08048358 <memcpy@plt>
...

konsolebox · 08-29-2009, 10:00 AM

Quote:

Originally Posted by ntubski

Why don't you ask for help optimizing the shell script...

Just made the first upload of the scripts. You might want to take a look at it here: http://sourceforge.net/projects/loader/.

ntubski · 08-29-2009, 12:38 PM

Whoa

There's a lot of code there, I sincerely doubt that replacing the abspath functions will make a noticeable difference, but it's hard to say without some typical input files to test things with. If you are serious about optimizing I would go back to my earlier suggestion: rewrite the program to handle multiple paths per run, perhaps using a fifo or socket.

I'm wondering why you have a hashing function in your compiler, aren't all awk arrays hash tables already?