LinuxQuestions.org

LinuxQuestions.org (/questions/)
-   Programming (https://www.linuxquestions.org/questions/programming-9/)
-   -   need help to further *optimize* my small C program that produces absolute path (https://www.linuxquestions.org/questions/programming-9/need-help-to-further-%2Aoptimize%2A-my-small-c-program-that-produces-absolute-path-744726/)

konsolebox 08-03-2009 06:22 AM

need help to further *optimize* my small C program that produces absolute path
 
I need to further optimize this C program of mine that produces the absolute path of any path that is passed as an argument to it. All kinds of path are accepted (including but not only both absolute but indirect and relative paths). If the entered path ends with a slash ('/'), the output will also end in a slash or else it ends with the name of the last directory or "/.".

Here's the code.

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CWDBUFFERSIZE  512
#define MAXTOKENS      64
#define CHECKMAXTOKENS

void reportMaxTokensReached (void);

int main (int argc, char const ** argv) {
        register char * p;

        char * workBuffer;
        unsigned int workBufferSize;

        char * token;
        char * tokens [MAXTOKENS];
        unsigned int t;

        char cwd [CWDBUFFERSIZE];
        unsigned int cwdLength;

        char const * sourceArg;
        unsigned int sourceArgSize;

        unsigned int dotsCount;
        unsigned int hasNonDots;
        unsigned int isRelative;

        unsigned int i;

        if
        (
                argc == 1
        ||
                // We place the assignments here because we won't need the parameters assigned
                // if 'argc == 1'.
                //
                // I hope also that there's a way to make sourceArgSize be immediately set to
                // +1 of strlen(...) without changing the value that's comparable to 0 but I
                // can't find a way.  Perhaps C is just C and that's it.
                //
                (sourceArgSize = (strlen(sourceArg = argv[1]))) == 0
        )
        {
                if (getcwd(cwd, CWDBUFFERSIZE)) {
                        puts((char const *) cwd);
                        return 0;
                } else {
                        return 1;
                }
        }

        workBufferSize = ++sourceArgSize;

        if ((isRelative = (*sourceArg != '/'))) {
                if (getcwd(cwd, CWDBUFFERSIZE)) {
                        workBufferSize += (cwdLength = strlen(cwd));
                } else {
                        return 1;
                }
        }

        if ((workBuffer = (char *) malloc(workBufferSize)) == 0x00) {
                fprintf(stderr, "failed to allocate memory.\n");
                return 1;
        }

        if (isRelative) {
                memcpy(workBuffer, cwd, cwdLength);

                p = workBuffer + cwdLength;

                *(p++) = '/';

                memcpy(p, sourceArg, sourceArgSize);
        } else {
                memcpy(workBuffer, sourceArg, sourceArgSize);
        }

        dotsCount = 0;
        hasNonDots = 0;
        t = 0;

        for (p = token = workBuffer; *p; ++p) {
                switch (*p) {
                case '/':
                        if (hasNonDots) {
                                #ifdef CHECKMAXTOKENS
                                if (t == MAXTOKENS) {
                                        reportMaxTokensReached();
                                        return 1;
                                }
                                #endif

                                tokens[t++] = token;
                        } else if (dotsCount) {
                                switch (dotsCount) {
                                case 1:
                                        break;
                                case 2:
                                        if (t)
                                                --t;
                                        break;
                                default:
                                        #ifdef CHECKMAXTOKENS
                                        if (t == MAXTOKENS) {
                                                reportMaxTokensReached();
                                                return 1;
                                        }
                                        #endif

                                        tokens[t++] = token;
                                }
                        }

                        dotsCount  = 0;
                        hasNonDots = 0;
                        *p = '\0';

                        token = p; ++token;        // token = p + 1;
                                            // OR 'token = ++p;' but will increase size of loop block, faster processing?
                        continue;
                case '.':
                        ++dotsCount;
                        continue;
                default:
                        ++hasNonDots; // faster than 'hasNonDots = 1;' ?
                }
        }

        if (hasNonDots) {
                #ifdef CHECKMAXTOKENS
                if (t == MAXTOKENS) {
                        reportMaxTokensReached();
                        return 1;
                }
                #endif

                tokens[t++] = token;
        } else if (dotsCount) {
                switch (dotsCount) {
                case 1:
                        break;
                case 2:
                        if (t)
                                --t;
                        break;
                default:
                        #ifdef CHECKMAXTOKENS
                        if (t == MAXTOKENS) {
                                reportMaxTokensReached();
                                return 1;
                        }
                        #endif

                        tokens[t++] = token;
                }
        }

        if (*(--p)) {
                // last character is not a forward slash ('/')

                if (t) {
                        for (i = 0; i < (const unsigned int) t; i++) {
                                putchar('/');
                                fputs(tokens[i], stdout);
                        }

                        puts("");
                } else {
                        puts("/.");
                }
        } else {
                if (t) {
                        for (i = 0; i < (const unsigned int) t; i++) {
                                putchar('/');
                                fputs(tokens[i], stdout);
                        }
                }

                puts("/");
        }

        return 0;
}

void reportMaxTokensReached (void) {
        printf("path has too many tokens.\n");
}

I need to optimize this without being dependent to the compiler and also not being dependent to asm codes as although I want the code to be currently optimized for x86+ systems, I still want it to be easily portable to other systems if required later.

Thanks in advance for any help.

ntubski 08-03-2009 08:27 PM

Why do you need to optimize it? If you are only converting 1 path it seems to me that this will be fast enough (as in, less than an eye blink). If you are converting many paths probably more time is being wasted in process startup than actual runtime of your code, I would suggest changing your program to convert multiple paths per run.

Quote:

Code:

  if
        (
        argc == 1
        ||
        // We place the assignments here because we won't need the parameters assigned
        // if 'argc == 1'.
        //
        // I hope also that there's a way to make sourceArgSize be immediately set to
        // +1 of strlen(...) without changing the value that's comparable to 0 but I
        // can't find a way.  Perhaps C is just C and that's it.
        //
        (sourceArgSize = (strlen(sourceArg = argv[1]))) == 0
        )


Putting complicated expressions into an if is bad style. Are you afraid of using too many variables? An optimizing compiler won't waste any more memory on them.

Quote:

token = p; ++token;// token = p + 1;
// OR 'token = ++p;' but will increase size of loop block, faster processing?
I would expect the first 2 to produce the same code, but I prefer the 2nd since it is simpler. I'm not sure what your comment about the 3rd means, but it looks to me like it would cause the skipping of a character and therefore is simply incorrect.

Quote:

++hasNonDots; // faster than 'hasNonDots = 1;' ?
I would expect the "= 1" to be faster since it doesn't have to read the variable first. Also the adding to the variable every time could theoretically lead to overflow (okay, various system limitations would prevent the string from ever being that long, but I think "= 1" is more correct).

Quote:

Code:

            if (hasNonDots) {
#ifdef CHECKMAXTOKENS
                if (t == MAXTOKENS) {
                    reportMaxTokensReached();
                    return 1;
                }
#endif

                tokens[t++] = token;
            } else if (dotsCount) {
                switch (dotsCount) {
                case 1:
                    break;
                case 2:
                    if (t)
                        --t;
                    break;
                default:
#ifdef CHECKMAXTOKENS
                    if (t == MAXTOKENS) {
                        reportMaxTokensReached();
                        return 1;
                    }
#endif

                    tokens[t++] = token;
                }
            }


This is somewhat long and confusing, how about
Code:

          if (!hasNonDots && dotsCount == 2 && t > 0) {
              --t;            /* ".." means go up one directory */
          } else if (hasNonDots || dotsCount >= 3) {
              CHECK_TOKENS(t);
              tokens[t++] = token;
          }

And since this is duplicated after the loop, you should put it into a function (with inlining that shouldn't slow anything down).
CHECK_TOKENS is #defined as:
Code:

#ifdef CHECKMAXTOKENS
static void reportMaxTokensReached (unsigned int num_tokens) {
    if (num_tokens >= MAXTOKENS) {
        fprintf(stderr, "path has too many tokens.\n");
        exit(1);
    }
}
#  define CHECK_TOKENS(t) reportMaxTokensReached(t)
#else
#  define CHECK_TOKENS(t)
#endif

Quote:

if (*(--p)) {
Why have side effects when you don't use them? I would prefer *(p-1) or p[-1].

Quote:

i < (const unsigned int) t;
I don't see what that cast is good for, t is already an unsigned int, casting an LR-value to const doesn't change anything...

Sergei Steshenko 08-03-2009 09:57 PM

The good question would be: "Why to optimize at all in this case ?". Since code deals with file paths, most likely it will be called between file operations, which are much slower than the code anyway.

konsolebox 08-04-2009 02:34 AM

Quote:

Originally Posted by ntubski (Post 3630259)
Why do you need to optimize it? If you are only converting 1 path it seems to me that this will be fast enough (as in, less than an eye blink). If you are converting many paths probably more time is being wasted in process startup than actual runtime of your code, I would suggest changing your program to convert multiple paths per run.

First, thanks for the big reply.

Regarding the conversion of multiple paths at once, it won't be applicable for my purpose since the program will be mostly used in a script that will require only one conversion per call.

Now regarding about the speed, it's probably already fast enough but I just want it be in its fullest. Not really a reason perhaps. Just my taste. After all the shell script I'm creating is time critical and I feel that it's still slow.

I was also thinking about the process startup that probably also includes loading of the code. I think it's really dependent on the kind of system and also, small code doesn't really make much speed gain than a larger code when it comes to loading since io functions mostly occur one and a time and a 512K code won't make much difference with a 1024K code.

Quote:

Putting complicated expressions into an if is bad style. Are you afraid of using too many variables? An optimizing compiler won't waste any more memory on them.
I see no other way to put the code and no, I'm not afraid to sacrifice memory more likely if it's for the sake of speed. After all, in most compilers local variables are allocated by just decrementing the stack pointer to reserve a space in the stack.

It's not about additional memory usage but additional assignment statements.

Quote:

I would expect the first 2 to produce the same code, but I prefer the 2nd since it is simpler.
About why I'd prefer ++ instead of + 1, the thing is just in the expected compilation output. ADD is slower and bigger than INC right?

Quote:

I'm not sure what your comment about the 3rd means, but it looks to me like it would cause the skipping of a character and therefore is simply incorrect.
Of course with that you'll expect to remove the ++p instruction in the for loop header and include it in all the inclusion blocks.

Quote:

I would expect the "= 1" to be faster since it doesn't have to read the variable first. Also the adding to the variable every time could theoretically lead to overflow (okay, various system limitations would prevent the string from ever being that long, but I think "= 1" is more correct).
Again it's about the use of ADD or INC. Also, as I've said, I don't want to be dependent of the compiler. Not all compilers might recognize + 1 as just an increment and use INC and not ADD in the output form.

Quote:

This is somewhat long and confusing, how about
Code:

          if (!hasNonDots && dotsCount == 2 && t > 0) {
              --t;            /* ".." means go up one directory */
          } else if (hasNonDots || dotsCount >= 3) {
              CHECK_TOKENS(t);
              tokens[t++] = token;
          }

And since this is duplicated after the loop, you should put it into a function (with inlining that shouldn't slow anything down).
Speed and logic first before readability or design but with proper balance with design only if possible. That's my approach to this program. I want to optimize this program in its best as a whole and I think separating optimizations on another "inline" might just make it slower.

Quote:

CHECK_TOKENS is #defined as:
Code:

#ifdef CHECKMAXTOKENS
static void reportMaxTokensReached (unsigned int num_tokens) {
    if (num_tokens >= MAXTOKENS) {
        fprintf(stderr, "path has too many tokens.\n");
        exit(1);
    }
}
#  define CHECK_TOKENS(t) reportMaxTokensReached(t)
#else
#  define CHECK_TOKENS(t)
#endif


No I don't want to do that. Non-terminating function calls in a loop will just make things slower.
Quote:

Why have side effects when you don't use them? I would prefer *(p-1) or p[-1].
Just like ADD and INC, it's only about the difference of SUB and DEC. Also with p[-1], the pointer would probably first be transferred to another variable before deducting it. Probably not true all the time if the system or the compiler can use offsets but the one I use sounds faster for most situations.
Quote:

I don't see what that cast is good for, t is already an unsigned int, casting an L-value to const doesn't change anything...
Just tell the compiler not to check the variable again and again on the loop. More likely tell the compiler to put store the value first to a faster location. Not all compilers are dumb or recognizes this but it might just be helpful.

In any way, thanks again.

konsolebox 08-04-2009 02:45 AM

Quote:

Originally Posted by Sergei Steshenko (Post 3630304)
The good question would be: "Why to optimize at all in this case ?". Since code deals with file paths, most likely it will be called between file operations, which are much slower than the code anyway.

I'll be using it in a shell script program that manipulates paths one at a time. The script is still slow and probably there's no other way to make it faster because of dependencies and limitations of shells but just in case that a little additional speed will help...

SciYro 08-04-2009 02:55 AM

I have a few comments on how to make it faster I believe:

1) Dont optimize, and fix your code. It really is a mess, and optimizing by guessing usually has the reverse effect. Instead, make your algorithims better and stop worrying about squezing all the CPU out of it.

2) As a side note, using a switch statement usually produces less effiecent code for the CPU, using a switch statement in a loop while trying to make that loop faster is probably bad.

3) Stop making so many library calls. You seem to be making calls to do output left and right, when each call has a overhead. Instead, try to only make one output call when you know what to output, that should make things a bit faster.

konsolebox 08-04-2009 03:18 AM

Quote:

Originally Posted by SciYro (Post 3630479)
I have a few comments on how to make it faster I believe:

1) Dont optimize, and fix your code. It really is a mess, and optimizing by guessing usually has the reverse effect. Instead, make your algorithims better and stop worrying about squezing all the CPU out of it.

I make clean and readable code when I program non-speed-critical programs like C++ OOPs but not this time. I already decided to chose speed before readability of code logic and to me this is not really mind-squeezing compared to other programs I studied or made so I guess I'll just have to finish this. Besides, I always can create another replica of this in a easier and cleaner form if I like, can't I.

Quote:

2) As a side note, using a switch statement usually produces less effiecent code for the CPU, using a switch statement in a loop while trying to make that loop faster is probably bad.
I haven't yet focused in analyzing the program using gdb and I'll be doing that later. I'll see what really happens then. Nice suggestion btw.

Quote:

3) Stop making so many library calls. You seem to be making calls to do output left and right, when each call has a overhead. Instead, try to only make one output call when you know what to output, that should make things a bit faster.
Probably the real reason why I posted this code. I don't want to make multiple library calls but I still can't tell how I'll be able to manipulate the tokens to make them combine into one whole string and make only one library-system call to print it. I could use similar other routines like memcpy but isn't that a library call as well? This time I don't know what's best to do. It seems that asm is the only thing that can really make it faster. Or are there some things that I don't know. Should I make my own routines? Wouldn't that be dependent to a limited number of systems?

catkin 08-04-2009 03:20 AM

A 50% time saving in part of the system that runs 2% of the time saves 1%; a 25% time saving in part of the system that runs 80% of the time saves 20%. Work smart, not hard! Identify the bottlenecks and optimise those; thg rest is of little consequence. Unless, of course, you are doing it for the pleasure in which case do what you like!

konsolebox 08-04-2009 03:30 AM

Quote:

Originally Posted by catkin (Post 3630502)
A 50% time saving in part of the system that runs 2% of the time saves 1%; a 25% time saving in part of the system that runs 80% of the time saves 20%. Work smart, not hard! Identify the bottlenecks and optimise those; thg rest is of little consequence. Unless, of course, you are doing it for the pleasure in which case do what you like!

Your suggestion is a little bit confusing but I like it. I think I know what you mean but I just follow the requirement of my program. Not really have to be more than that. Just the requirement :).

Of course there's also a bit of pleasure or else it will all only be nothing but pain. Not all but don't most hackers do take pleasure with their programming? ^^,

wje_lq 08-04-2009 04:37 AM

The prudent programmer, before optimizing anything, is advised to browse here , which is a very short read, and here, which is a very fun one.

SciYro 08-04-2009 04:47 AM

By 'library calls' I ment calls that went to the kernel, such as outputting text. String/array calls should be fine, and probably faster then a equivelent you can come up with.

As for your insistence upon speed, I gave your program a pass thru with 'time', and I see no real diffrence between your code and the command 'readlink -f'. Naturally, with such a small program doing such a simple job, itll probably take longer to start your program then to run it. I dont think optimizing this program is your best option for making things run faster, or that there is much you can do to optimize things as they are.

Sergei Steshenko 08-04-2009 04:51 AM

Quote:

Originally Posted by konsolebox (Post 3630468)
...
After all the shell script I'm creating is time critical and I feel that it's still slow.
...

The rest is irrelevant. Shells ('bash', 'csh', 'tcsh') interpret your script line by line, each time reparsing the line being interpreted, so this is what takes most of the resources.

If your shell script feels slow, either switch to a better language (Perl/Python) or write your whole program in C/C++.

And don't estimate modern compiler optimization strengths - it does really smart things, and it does them even when you use, say, array notation with indexes and pointer one - the former is safer overall.

You are wasting your time optimizing this "C" code.

johnsfine 08-04-2009 08:32 AM

Quote:

Originally Posted by konsolebox (Post 3630468)
Now regarding about the speed, it's probably already fast enough but I just want it be in its fullest.

I think it is a useful exercise for learning purposes to optimize code that doesn't need it. But you still should have some better perspective about which aspects you're intentionally ignoring vs. which you're misunderstanding.

The load time will far exceed the execution time, so none of optimizations will have visible effect and if you wanted effective optimization you would focus entirely on reducing the size and the external linkage. But I think it is fair to ignore that.

Significant parts of the code are only executed once, so their execution time is dominated by the cache misses of the first execution. So the speed of the instructions doesn't matter. The size matters and to a lessor extent the first pass branch mispredictions matter. Both can be improved by (among other things) doing some tail merges better by hand than the compiler can. Do you want to ignore the performance issues of "one time" code (any code that is executed so rarely that it is dominated by cache misses)? Lots of big programs have their total execution time dominated by such code (not actually one time code, but effectively one time code because it is always dropped from the cache before being executed again). A performance fanatic should learn how to optimize that code differently than inner loop code. For example
Code:

                memcpy(p, sourceArg, sourceArgSize);
        } else {
                memcpy(workBuffer, sourceArg, sourceArgSize);
        }

The compiler can do only a very partial tail merge there. You could do a complete tail merge (probably at the cost of one extra mov instruction in the final executed code). In inner loop code the extra mov might balance the savings of one well predicted branch and your manual tail merge nets to no benefit. In code dominated by cache misses the manual tail merge is a big win and the extra mov is trivial. You should understand the difference between the code that is fetched and the code that is executed and which matters when.

Quote:

About why I'd prefer ++ instead of + 1, the thing is just in the expected compilation output. ADD is slower and bigger than INC right?
Do you know x86 assembler? Have you looked at the generated code in x86 assembler.

ADD is bigger (but not inherently slower) than INC. Sometimes (such as with cache misses) bigger itself means slower. Sometimes it doesn't. The compiler rarely generates INC or DEC instructions, even if they would be better and even if you have distorted your code to bias toward them. Your several distortions toward INC each either make no difference or make the generated code worse. If you insist on controlling the asm code you have to write asm code.
A=B+1; will always generate code at least as good as A=B;++A;. Usually they will be the same. Occasionally the second will be worse. If the compiler would be willing to use INC for the second, it will also use INC for the first.
When you know X was zero (so you have this choice) ++X; in asm code will sometimes be faster than X=1; (less often than you think). But the C compiler will almost never let you achieve that optimization. In C code X=1; will usually be faster, including situations where ++X; would be faster in asm code.

Quote:

Again it's about the use of ADD or INC. Also, as I've said, I don't want to be dependent of the compiler. Not all compilers might recognize + 1 as just an increment and use INC and not ADD in the output form.
You can't micro optimize C code without a particular compiler and architecture in mind. If you want code to be fast in some as yet unknown compiler and architecture you're more likely to get it right by just coding what you mean: X=1; if that is what you mean, A=B+1; if that is what you mean.

Quote:

I want to optimize this program in its best as a whole and I think separating optimizations on another "inline" might just make it slower.
Compilers do a good job of inlining in a way that allows optimization across the boundary of the calling and called code. If defining and using an inline function makes your code more readable, do it. It won't make the code slower.

Quote:

Also with p[-1], the pointer would probably first be transferred to another variable before deducting it.
I didn't search for that spot in your code, but you are betraying a serious lack of understanding of x86 asm code. The addressing modes are very powerful and p[-1] would typically be done without any extra instruction for even the -1, much less transferring p to another register.

Consider x=p-1; ... *x ...; If x is not used after that, the compiler will probably figure out you mean ... p[-1] ...; and generate exactly the same optimal code. If you use x later or involve a -- instead of the -1 you run a significant chance of confusing the compiler and generating worse code.

Sometimes a DEC instruction would be faster than letting the compiler use the advanced addressing mode. I've tried to micro optimize that case. You can't win, short of writing the asm code yourself. Either you confuse the compiler enough that it doesn't use the advance addressing mode but does something else much worse than the DEC, or you fail to confuse it and it does the advanced addressing mode. But I don't think you would even recognize the case where the DEC would be better. It isn't the case you probably expect. Don't try to micro optimize unless you know more than the compiler writer knew.

konsolebox 08-04-2009 09:12 AM

Wow there's nothing much I can say. Thank you very much for this great info. You've been frank but informative and that's what I really needed. It's kind of not easy for to be online this time but I'll save this page and think about all the things you said and decide what's best for my program for the next following days. I'll also consider the capabalities of modern compilers. I'll make a report once I'm finally done.

konsolebox 08-04-2009 09:20 AM

Quote:

Originally Posted by Sergei Steshenko (Post 3630576)
The rest is irrelevant. Shells ('bash', 'csh', 'tcsh') interpret your script line by line, each time reparsing the line being interpreted, so this is what takes most of the resources.

If your shell script feels slow, either switch to a better language (Perl/Python) or write your whole program in C/C++.

No I can't switch because the script is really meant for shell scripting and not on any other language and the only thing left to make it quicker is to create a program like this. I know that it won't really make much difference but I'm just trying.
Quote:

And don't estimate modern compiler optimization strengths - it does really smart things, and it does them even when you use, say, array notation with indexes and pointer one - the former is safer overall.

You are wasting your time optimizing this "C" code.
I know that and I know how compilers optimize code as I've already seen the output. Yes I know it might only make the code less understood by the compiler and make it harder to optimize but I'm still learning anyway and I haven't known what is the true thing that happens. I won't feel comfortable unless I really knew it myself.

Clarity: no I don't actually know how compilers work. I've only the difference seen the outputs that varies depending on the optimization level used. :-). I'm no genius. :D

konsolebox 08-04-2009 09:21 AM

Again guys if you find a good alternative to the library calls in the code just tell me. Asm is a good option but not yet. I want to know what's best to have it done.. in C. Thanks again all.

johnsfine 08-04-2009 09:40 AM

Quote:

Originally Posted by SciYro (Post 3630479)
2) As a side note, using a switch statement usually produces less effiecent code for the CPU, using a switch statement in a loop while trying to make that loop faster is probably bad.

If you need the actual more than two way dispatch, switch might be the most speed optimal way to do it and it is usually the most readable.

I'm not sure whether you are suggesting redesign to avoid needing a three way branch or you are suggesting that two if statements would be faster for a three way branch (it usually isn't).

I haven't looked carefully enough to be certain but contrast
Code:

                switch (*p) {
                case '/':  ...
                case '.':  ...
                default:  ... }

with
Code:

          switch (dotsCount) {
                case 1:  ...
                case 2:  ...
                default:  ... }

The first of those looks fundamental to the underlying problem and there is nothing the programmer knows about this three way switch that the compiler doesn't know. So I don't think you could code anything better than the switch.
In the second case the programmer may know things about the likely or possible values that would allow a double if statement to be faster than a switch, and the whole three way branch might not be as fundamental to the problem. So I'm not sure, but maybe something faster than the switch could be coded.

A switch is often less efficient than if statements because the compiler is usually forced to generate a "default" path even if you don't make one explicit and even if the problem doesn't require it. But in these examples the problem does require a default path and it is explicit in the source code.

ntubski 08-04-2009 08:38 PM

Quote:

Originally Posted by konsolebox (Post 3630473)
I'll be using it in a shell script program that manipulates paths one at a time. The script is still slow and probably there's no other way to make it faster because of dependencies and limitations of shells but just in case that a little additional speed will help...

Why don't you ask for help optimizing the shell script...

Quote:

Originally Posted by SciYro
By 'library calls' I ment calls that went to the kernel, such as outputting text. String/array calls should be fine, and probably faster then a equivelent you can come up with.

The libc I/O routines do buffering in order to avoid multiple system calls. The output of this program is usually pretty short, there is probably only a single system I/O call.

konsolebox 08-07-2009 12:43 PM

Quote:

Originally Posted by ntubski (Post 3631636)
Why don't you ask for help optimizing the shell script...

Lately I've been busy finishing the whole project. Ok I'll think about posting the code and asking for help when I'm finished. It's just too bad that I still can't decide what's the best license for the code. Maybe not use a license at all and use CC0. Or just send it to the public domain. Right now I still can't find any license that gives total safety for the software and freedom for the users. Maybe there's none.
Quote:

Originally Posted by ntubski (Post 3631636)
The libc I/O routines do buffering in order to avoid multiple system calls. The output of this program is usually pretty short, there is only probably only a single system I/O call.

Now I guess that's the end of the line. Perhaps there's nothing more I can do to make the code faster.


By the way I made some comparisons with my original code and a version of the code where I made modifications that includes 'p[-1]' and 'token = p + 1'.

The gcc compiler (verson 4.1.2) generates the same output from both codes when optimization flag is enabled.

But when the outputs are not optimized, the modified version appeared to be faster. Both still have the same number of instructions and both also use increment statements (INCL and INC) but there was a difference in size and the targets. The compiler understood my original code as something that implies direct modification to the memory (like INCL) and not something that uses the registers first. With this form, it's probably slower right? It was probably quicker if the compiler did not recognize p + 1 as "INC AX".

You guys are right. Maybe I really shouldn't over-underestimate compilers.

P[-1] by the way also appears to yield faster code. ntubski and johnsfine was right all along. I'll change my code with those and have it finalized. Thanks for the help guys. I'll take a note of all the things you posted here.


And by the way I still can't understand some of johnsfine's statements. I know the concept of caches but what are cache misses and what's a partial tail merge? :)

Maybe I'll study those when I have time and after I finish the project. The first working prototype of its first and maybe final protocol (at last after many attempts to make it complete and perfect according to its requirement) is soon to be finished. ;)

johnsfine 08-07-2009 01:19 PM

Quote:

Originally Posted by konsolebox (Post 3634945)
I know the concept of caches but what are cache misses

Any time the thing you want doesn't happen to be in the cache and must be brought in from slower memory.

Quote:

what's a partial tail merge?
Both compilers (advanced optimization switches) and compiler theory documents are inconsistent. But two related concepts use the term "tail merge".

1) Flow of control. If the last action before a point where flow of control joins is identical on both sides, the compiler can reduce code size and maybe cache misses by joining earlier than the source code indicates. This is most common and most obvious in the join after if/then/else.
Compare:
Code:

if (a) { foo(x,y,z); } else { p=something; foo(p,y,z); }
vs.
Code:

p=x; if (!a) { p=something; } foo(p,y,z);
If those two sequences have the same net effect, the programmer probably knows it, but the compiler might not understand later uses of p well enough to know it, but anyway probably can't make that transformation.

In the first case, the last operation before joining is calling foo() so the compiler could join it early, but the benefits are tiny and an optimizer would be unlikely to guess joining early was right.

Since parameters are normally set up right to left, there is a more obvious early join in
Code:

if (a) { foo(x,y,z); } else { p=something; foo(x,y,p); }
The compiler can join after setting up z or p and use common code to set up y and x and then call. That is something the compiler could optimize but the programmer couldn't.

Now consider
Code:

p=x; if (a) {foo(p,y,z);} else { p=something; foo(p,y,z);}
Now the compiler can see the whole potential merge easily and can even see how to save a branch instruction by merging away the whole then part. But usually if you want to go that far to help the optimizer you would go the rest of the way and merge it yourself.

2) The other meaning of "tail merge" is when the last operation of a function (before returning) is to call a function with a compatible parameter stack and the same return value (turns out to be very common in advanced C++ programming). The compiler can change the code to jump to the other function instead of calling it, and thus save the extra layer of return.

konsolebox 08-08-2009 08:14 AM

Got it. I now know what you mean. It seems that there are parts of the code that only compilers can optimize.

By the way, regarding my second switch statement, do you think it's better if I just do something like:
Code:

} else if (dotsCount && dotsCount != 1) {
  if (dotsCount == 2) {
    ...
  } else {
    ...
  }
}

or

Code:

} else if (dotsCount) {
  if (dotsCount != 1) {
    if (dotsCount == 2) {
      ...
    } else {
      ...
    }
  }
}

?

johnsfine 08-08-2009 08:30 AM

I don't feel like studying your source code enough to decide the optimal way to branch on dotsCount. But I doubt that either of the above would be better than just using a switch.

What's wrong with
Code:

} else if (dotsCount >= 2) {
  if (dotsCount == 2) {
    ...
  } else {
    ...
  }
}

That is probably better than a switch. The optimizer usually figures out in that construct that one of the two comparisons is redundant so you have two branches based on one comparison.

konsolebox 08-08-2009 08:42 AM

Thanks for figuring that out. I was always focusing on something like 'n |/&/^ m' so i forgot simple things like > or >=.

ntubski 08-08-2009 08:02 PM

If we're still in the micro-optimization game, then I would suggest optimizing for the common case: directories which are not "." or "..". This way for most of the time the code doesn't have to look out for dots, just slash and end of string. EDIT: And by putting a '/' at the end of workBuffer a lot of checks for end of string can be removed. By the way, I uncovered a bug when I tried this out:
Code:

workBufferSize += cwdLength + 1; // +1 is needed for the '/' between cwd and sourceArg


Code:

    // ensure that workBuffer ends in "/", to reduce checks for '\0'
    // exercise for reader: rewrite so realloc isn't needed
    endWithSlash = (sourceArg[sourceArgSize-2] == '/');
    if (!endWithSlash) {
        workBufferSize += 1;
        workBuffer = realloc(workBuffer, workBufferSize);
        workBuffer[workBufferSize-2] = '/';
        workBuffer[workBufferSize-1] = '\0';
    }

    t = 0;

    for (p = workBuffer + 1; *p; ++p) {
        if (p[0] == '.') {
            if (p[1] == '/') {
                // "." means same dir
                p += 1;
                continue;
            } else if (p[1] == '.') {
                if (p[2] == '/') {
                    // ".." means go up one dir
                    if (t > 0) --t;
                    p += 2;
                    continue;
                }
            }
        } else if (p[0] == '/') {
            continue;
        }

        // not "." or ".."
        CHECK_TOKENS(t);
        tokens[t++] = p;
        do {
            ++p;
        } while (p[0] != '/');
        p[0] = '\0';
    }

    for (i = 0; i < t; i++) {
        putchar('/');
        fputs(tokens[i], stdout);
    }
    if (endWithSlash) putchar('/');
    else if (t == 0) fputs("/.", stdout);
    putchar('\n');


konsolebox 08-10-2009 02:18 AM

Thanks for the fix. That was a bug indeed. I'll take a look at the new code as well. Later.

Sergei Steshenko 08-10-2009 03:20 AM

Quote:

Originally Posted by konsolebox (Post 3637390)
Thanks for the fix. That was a bug indeed. I'll take a look at the new code as well. Later.

For example, Perl has a bunch of modules for path manipulation.

Your performance bottleneck is not "C", it's your shell script.

konsolebox 08-10-2009 03:53 AM

Quote:

Originally Posted by Sergei Steshenko (Post 3637435)
Your performance bottleneck is not "C", it's your shell script.

I already know that before I posted the code in this thread. I just want to finish the code optimized and leave it like that forever and never be bothered to play around with it again. The program is not only useful for my current script. It can also be used in my other scripts and many other things. I just want to make it fast that's all.

Some folks say that I'm wasting so much time to this code but actually I only created the code in an hour or two and posted it. I also took time making replies to this thread and analyzing the code but it was fun anyway.

I spent most of the time on my scripts and not on this program.

Now regarding my shell script, I think it is already in its most optimized form but I know that I can never be sure with it so I also plan to post it later but the script is part of a project and I still can't find a proper license for it. There's nothing much in the project but I just want to be careful.

Regardless, thanks for your reminders.

Sergei Steshenko 08-10-2009 06:02 AM

Quote:

Originally Posted by konsolebox (Post 3637475)
I already know that before I posted the code in this thread. I just want to finish the code optimized and leave it like that forever and never be bothered to play around with it again. The program is not only useful for my current script. It can also be used in my other scripts and many other things. I just want to make it fast that's all.

Some folks say that I'm wasting so much time to this code but actually I only created the code in an hour or two and posted it. I also took time making replies to this thread and analyzing the code but it was fun anyway.

I spent most of the time on my scripts and not on this program.

Now regarding my shell script, I think it is already in its most optimized form but I know that I can never be sure with it so I also plan to post it later but the script is part of a project and I still can't find a proper license for it. There's nothing much in the project but I just want to be careful.

Regardless, thanks for your reminders.


And why is the script a shell one in the first place ?

konsolebox 08-10-2009 07:15 AM

Quote:

Originally Posted by Sergei Steshenko
And why is the script a shell one in the first place ?

It's a helper script for shell scripts.

Sergei Steshenko 08-10-2009 07:47 AM

Quote:

Originally Posted by konsolebox (Post 3637707)
It's a helper script for shell scripts.

Is it source'd or called ?

konsolebox 08-10-2009 08:22 AM

it's sourced or can be called if it's the main caller.

konsolebox 08-11-2009 01:15 AM

Quote:

Originally Posted by ntubski (Post 3636191)
If we're still in the micro-optimization game

I think I still am. :)
Quote:

Originally Posted by ntubski (Post 3636191)
, then I would suggest optimizing for the common case: directories which are not "." or "..". This way for most of the time the code doesn't have to look out for dots, just slash and end of string. EDIT: And by putting a '/' at the end of workBuffer a lot of checks for end of string can be removed. By the way, I uncovered a bug when I tried this out:
Code:

workBufferSize += cwdLength + 1; // +1 is needed for the '/' between cwd and sourceArg


Code:

    // ensure that workBuffer ends in "/", to reduce checks for '\0'
    // exercise for reader: rewrite so realloc isn't needed
    endWithSlash = (sourceArg[sourceArgSize-2] == '/');
    if (!endWithSlash) {
        workBufferSize += 1;
        workBuffer = realloc(workBuffer, workBufferSize);
        workBuffer[workBufferSize-2] = '/';
        workBuffer[workBufferSize-1] = '\0';
    }

    t = 0;

    for (p = workBuffer + 1; *p; ++p) {
        if (p[0] == '.') {
            if (p[1] == '/') {
                // "." means same dir
                p += 1;
                continue;
            } else if (p[1] == '.') {
                if (p[2] == '/') {
                    // ".." means go up one dir
                    if (t > 0) --t;
                    p += 2;
                    continue;
                }
            }
        } else if (p[0] == '/') {
            continue;
        }

        // not "." or ".."
        CHECK_TOKENS(t);
        tokens[t++] = p;
        do {
            ++p;
        } while (p[0] != '/');
        p[0] = '\0';
    }

    for (i = 0; i < t; i++) {
        putchar('/');
        fputs(tokens[i], stdout);
    }
    if (endWithSlash) putchar('/');
    else if (t == 0) fputs("/.", stdout);
    putchar('\n');


I tried to incorporate your changes to the original code but there was something that did not work right. When I tried to do something like:
Code:

./getabspath ../abc
the code will only print out <parent>/a

But anyway the suggestion is helpful and I'm still finding a way how to use it to improve the code although I still can't.

However also, when I was looking at the original code, I found a solution in which the code will no longer need malloc()s. That's by immediately binding the tokens of the cwd to the tokens array. When I made the changes, I also included some of the tweaks that was recommended in this thread. Here's my current code:
Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CWDBUFFERSIZE  512
#define MAXTOKENS      64
#define CHECKTOKENS

void reportMaxTokensReached (void);

int main (int argc, char ** argv) {
        register char * p;

        char * token;
        char * tokens [MAXTOKENS];
        unsigned int t;

        char cwd [CWDBUFFERSIZE];
        unsigned int cwdLength;

        unsigned int dotsCount;
        unsigned int hasNonDots;

        unsigned int i;

        #ifdef CHECKTOKENS
                #define TOKENSPLUS(T) \
                        if (t == MAXTOKENS) { \
                                reportMaxTokensReached(); \
                                return 1; \
                        } \
                        tokens[t++] = T;
        #else
                #define TOKENSPLUS(T) \
                        tokens[t++] = T;
        #endif

        if (argc == 1)        {
                if (getcwd(cwd, CWDBUFFERSIZE)) {
                        puts((char const *) cwd);
                        return 0;
                } else {
                        return 1;
                }
        }

        t = 0;

        if (*argv[1] != '/') {
                if (! getcwd(cwd, CWDBUFFERSIZE))
                        return 1;

                if (cwd[1]) {
                        tokens[0] = p = cwd + 1;
                        t++;

                        while (*++p) {
                                if (*p == '/') {
                                        *p = '\0';
                                        TOKENSPLUS(++p);        // everything after '/' is certainly not '\0'
                                }
                        }
                }
        }

        dotsCount = 0;
        hasNonDots = 0;

        for (p = token = argv[1]; *p; ++p) {
                switch (*p) {
                case '/':
                        if (hasNonDots) {
                                TOKENSPLUS(token);
                        } else if (dotsCount > 1) {
                                if (dotsCount == 2) {
                                        if (t)
                                                --t;
                                } else {
                                        TOKENSPLUS(token);
                                }
                        }

                        dotsCount = 0;
                        hasNonDots = 0;
                        *p = '\0';
                        token = p + 1;
                        continue;
                case '.':
                        ++dotsCount;
                        continue;
                default:
                        ++hasNonDots;
                }
        }

        if (hasNonDots) {
                TOKENSPLUS(token);
        } else if (dotsCount > 1) {
                if (dotsCount == 2) {
                        if (t)
                                --t;
                } else {
                        TOKENSPLUS(token);
                }
        }

        if (p[-1]) {
                // last character is not a forward slash ('/')

                if (t) {
                        for (i = 0; i < t; i++) {
                                putchar('/');
                                fputs(tokens[i], stdout);
                        }

                        puts("");
                } else {
                        puts("/.");
                }
        } else {
                if (t) {
                        for (i = 0; i < t; i++) {
                                putchar('/');
                                fputs(tokens[i], stdout);
                        }
                }

                puts("/");
        }

        return 0;
}

void reportMaxTokensReached (void) {
        printf("path has too many tokens.\n");
}

Cheers.

ntubski 08-11-2009 05:58 PM

Quote:

Originally Posted by konsolebox (Post 3638784)
the code will only print out <parent>/a

Did you fix the bug I pointed out?

Quote:

Code:

        #ifdef CHECKTOKENS
                #define TOKENSPLUS(T) \
                        if (t == MAXTOKENS) { \
                                reportMaxTokensReached(); \
                                return 1; \
                        } \
                        tokens[t++] = T;
        #else
                #define TOKENSPLUS(T) \
                        tokens[t++] = T;
        #endif


Still don't believe in inline functions? :rolleyes:


If you want to avoid malloc'ing and checking the length of argv[1], the trick with the slash doesn't work, but here's the code with the other changes.
Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CWDBUFFERSIZE  512
#define MAXTOKENS      64
#define CHECKTOKENS

#ifdef CHECKTOKENS
static void reportMaxTokensReached (int num_tokens) {
    if (num_tokens >= MAXTOKENS) {
        fprintf(stderr, "path has too many tokens.\n");
        exit(1);
    }
}
#  define CHECK_TOKENS(t) reportMaxTokensReached(t)
#else
#  define CHECK_TOKENS(t)
#endif

int main (int argc, char ** argv) {
    register char *p;
    char const *tokens [MAXTOKENS];
    unsigned int t;

    char cwd [CWDBUFFERSIZE];

    int endWithSlash;

    unsigned int i;

#define TOKENSPLUS(T) CHECK_TOKENS(t); tokens[t++] = T;

    if (argc == 1) {
        if (getcwd(cwd, CWDBUFFERSIZE)) {
            puts((char const *) cwd);
            return 0;
        } else {
            return 1;
        }
    }

    t = 0;

    if (*argv[1] != '/') {
        if (! getcwd(cwd, CWDBUFFERSIZE))
            return 1;

        if (cwd[1]) {
            tokens[0] = p = cwd + 1;
            t++;

            while (*++p) {
                if (*p == '/') {
                    *p = '\0';
                    TOKENSPLUS(++p);// everything after '/' is certainly not '\0'
                }
            }
        }
    }

    endWithSlash = 1;

    for (p = argv[1]; *p; ++p) {
        char const*const token = p;
        if (*p == '.') {
            ++p;
            if (*p == '.') {
                ++p;
                if ((*p == '\0' || *p == '/') && t > 0) {
                    --t;
                }
            }
        }
        if (*p == '\0') {
            endWithSlash = 0;
            break;
        }
        if (*p == '/') continue;

        // not "." or ".."
        TOKENSPLUS(token);
        for (;;) {
            ++p;
            if (*p == '/')  {
                *p = '\0';
                break;
            } else if (*p == '\0') {
                endWithSlash = 0;
                goto done_tokenizing; // break outer loop
            }
        }
    }
 done_tokenizing:

    for (i = 0; i < t; i++) {
        putchar('/');
        fputs(tokens[i], stdout);
    }
    puts(endWithSlash? "/"
        : (t > 0)? ""
        : "/.");

    return 0;
}


konsolebox 08-13-2009 04:07 AM

Quote:

Originally Posted by ntubski (Post 3639814)
Did you fix the bug I pointed out?

You mean realloc() or the one I just said?
Quote:

Originally Posted by ntubski (Post 3639814)
Still don't believe in inline functions? :rolleyes:

I'm just not used to it and I was a bit of in a hurry that time. :)
Quote:

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CWDBUFFERSIZE  512
#define MAXTOKENS      64
#define CHECKTOKENS

#ifdef CHECKTOKENS
static void reportMaxTokensReached (int num_tokens) {
    if (num_tokens >= MAXTOKENS) {
        fprintf(stderr, "path has too many tokens.\n");
        exit(1);
    }
}
#  define CHECK_TOKENS(t) reportMaxTokensReached(t)
#else
#  define CHECK_TOKENS(t)
#endif

int main (int argc, char ** argv) {
    register char *p;
    char const *tokens [MAXTOKENS];
    unsigned int t;

    char cwd [CWDBUFFERSIZE];

    int endWithSlash;

    unsigned int i;

#define TOKENSPLUS(T) CHECK_TOKENS(t); tokens[t++] = T;

    if (argc == 1) {
        if (getcwd(cwd, CWDBUFFERSIZE)) {
            puts((char const *) cwd);
            return 0;
        } else {
            return 1;
        }
    }

    t = 0;

    if (*argv[1] != '/') {
        if (! getcwd(cwd, CWDBUFFERSIZE))
            return 1;

        if (cwd[1]) {
            tokens[0] = p = cwd + 1;
            t++;

            while (*++p) {
                if (*p == '/') {
                    *p = '\0';
                    TOKENSPLUS(++p);// everything after '/' is certainly not '\0'
                }
            }
        }
    }

    endWithSlash = 1;

    for (p = argv[1]; *p; ++p) {
        char const*const token = p;
        if (*p == '.') {
            ++p;
            if (*p == '.') {
                ++p;
                if ((*p == '\0' || *p == '/') && t > 0) {
                    --t;
                }
            }
        }
        if (*p == '\0') {
            endWithSlash = 0;
            break;
        }
        if (*p == '/') continue;

        // not "." or ".."
        TOKENSPLUS(token);
        for (;;) {
            ++p;
            if (*p == '/')  {
                *p = '\0';
                break;
            } else if (*p == '\0') {
                endWithSlash = 0;
                goto done_tokenizing; // break outer loop
            }
        }
    }
 done_tokenizing:

    for (i = 0; i < t; i++) {
        putchar('/');
        fputs(tokens[i], stdout);
    }
    puts(endWithSlash? "/"
        : (t > 0)? ""
        : "/.");

    return 0;
}


If we really want to use goto statement there are lots of them that's waiting to be included. Lately I've been so curious with the compiler's output that I've been chasing its optimized outputs with my code and now I'm wondering why strcpy() is not expanding as an internal builtin even if I directly use __builtin_strcpy() and set the compilation to include new instructions like sse, sse2, sse3, ssse3, and sse4 and also use the -fhosted option. I don't know why it doesn't expand to a non-call set of instructions while strlen() expands perfectly. If I'll be able to know how to expand this routine to a builtin, it will not only help me with the code but also help me with the optimization of my whole system. It will also be helpful to the future programs that I might make.

Currently, my plan is to install many versions of gcc to my system especially newer ones like 4.4.1 and 4.5 if possible but I need optimize my system first before it.

About the new code, I'll have a copy of it again and look at it later when I have extra time or when I'm studying the compiler.

I haven't forgotten the scripts by the way. The internals of the script is already complete as expected and no longer require any revision but sadly the interface (callable functions and options) did not, so I thought maybe I need to rest for a while and try to finish it again later when I'm ready. Also I already have choices for the license or agreement that I'll use. Probably it's CC's attribution 3.0, CC0 1.0 or simply just public domain. I'll choose after I have a complete review of the three.

konsolebox 08-14-2009 04:23 AM

Quote:

Originally Posted by ntubski (Post 3639814)
Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CWDBUFFERSIZE  512
#define MAXTOKENS      64
#define CHECKTOKENS

#ifdef CHECKTOKENS
static void reportMaxTokensReached (int num_tokens) {
    if (num_tokens >= MAXTOKENS) {
        fprintf(stderr, "path has too many tokens.\n");
        exit(1);
    }
}
#  define CHECK_TOKENS(t) reportMaxTokensReached(t)
#else
#  define CHECK_TOKENS(t)
#endif

int main (int argc, char ** argv) {
    register char *p;
    char const *tokens [MAXTOKENS];
    unsigned int t;

    char cwd [CWDBUFFERSIZE];

    int endWithSlash;

    unsigned int i;

#define TOKENSPLUS(T) CHECK_TOKENS(t); tokens[t++] = T;

    if (argc == 1) {
        if (getcwd(cwd, CWDBUFFERSIZE)) {
            puts((char const *) cwd);
            return 0;
        } else {
            return 1;
        }
    }

    t = 0;

    if (*argv[1] != '/') {
        if (! getcwd(cwd, CWDBUFFERSIZE))
            return 1;

        if (cwd[1]) {
            tokens[0] = p = cwd + 1;
            t++;

            while (*++p) {
                if (*p == '/') {
                    *p = '\0';
                    TOKENSPLUS(++p);// everything after '/' is certainly not '\0'
                }
            }
        }
    }

    endWithSlash = 1;

    for (p = argv[1]; *p; ++p) {
        char const*const token = p;
        if (*p == '.') {
            ++p;
            if (*p == '.') {
                ++p;
                if ((*p == '\0' || *p == '/') && t > 0) {
                    --t;
                }
            }
        }
        if (*p == '\0') {
            endWithSlash = 0;
            break;
        }
        if (*p == '/') continue;

        // not "." or ".."
        TOKENSPLUS(token);
        for (;;) {
            ++p;
            if (*p == '/')  {
                *p = '\0';
                break;
            } else if (*p == '\0') {
                endWithSlash = 0;
                goto done_tokenizing; // break outer loop
            }
        }
    }
 done_tokenizing:

    for (i = 0; i < t; i++) {
        putchar('/');
        fputs(tokens[i], stdout);
    }
    puts(endWithSlash? "/"
        : (t > 0)? ""
        : "/.");

    return 0;
}


I didn't notice the changes immediately. You've made the code so much simpler. What happened to the two variables? Dude you really are a genius. :D. Thank you very much!

ntubski 08-14-2009 02:51 PM

Quote:

Originally Posted by konsolebox (Post 3641595)
You mean realloc() or the one I just said?

I mean the missing + 1 in
Code:

workBufferSize += (cwdLength = strlen(cwd));
I realize I posted a different version of this in my earlier post, oops. :o

Quote:

If we really want to use goto statement there are lots of them that's waiting to be included.
The goto statement is not there for optimization, if that's what you're thinking. C lacks something like Java's labeled break, so I use goto with a comment. I don't think there is a clearer way of breaking out of nested loops.

Quote:

Lately I've been so curious with the compiler's output that I've been chasing its optimized outputs with my code and now I'm wondering why strcpy() is not expanding as an internal builtin even if I directly use __builtin_strcpy() and set the compilation to include new instructions like sse, sse2, sse3, ssse3, and sse4 and also use the -fhosted option. I don't know why it doesn't expand to a non-call set of instructions while strlen() expands perfectly.
According to http://en.wikibooks.org/wiki/GNU_C_C..._Architecture:
Quote:

In case of strcpy(), the compiler checks the size argument and uses the optimized built-in version of strcpy() if the argument is constant.
I'm not very familiar with sse instructions, but don't they work in blocks of 16 bytes at a time? I don't think they would be useful unless the string length is guaranteed to be divisible by 16.

Quote:

What happened to the two variables?
Basically you had a Finite state machine where the state was in dotsCount and hasNonDots variables. I just moved the state into the instruction pointer. Like going from 1.2 Automata based style program to 1.1 Traditional (imperative) program.

konsolebox 08-18-2009 12:10 PM

Quote:

Originally Posted by ntubski (Post 3643582)
The goto statement is not there for optimization, if that's what you're thinking. C lacks something like Java's labeled break, so I use goto with a comment. I don't think there is a clearer way of breaking out of nested loops.

Yes I also thought so as I was also looking for something like 'break 2'.
Quote:

Originally Posted by ntubski (Post 3643582)
According to http://en.wikibooks.org/wiki/GNU_C_C..._Architecture:
I'm not very familiar with sse instructions, but don't they work in blocks of 16 bytes at a time? I don't think they would be useful unless the string length is guaranteed to be divisible by 16.

Not sure if that's really the case since builtins of memcpy() and strlen() work and they don't depend on 16 bytes blocks.
Quote:

Originally Posted by ntubski (Post 3643582)
Basically you had a Finite state machine where the state was in dotsCount and hasNonDots variables. I just moved the state into the instruction pointer. Like going from 1.2 Automata based style program to 1.1 Traditional (imperative) program.

But still you made a big step for optimizing the code :).

konsolebox 08-18-2009 12:38 PM

Final prototype? Sorry I can't help but use gotos here :)

Code:

/* getabspath.c
 *
 * The program gets a path from an argument and prints its absolute form to the
 * standard output with no newline.  If no argument was passed, the program
 * prints the path of the current working directory.
 *
 * The code was created with speed in mind and is best compiled with -Os.
 *
 * Thanks go to the following people in linuxquestions.org:
 *  johnsfine for technical discussions regarding compilers esp. optimizations,
 *  SciYro, Sergei Steshenko, and catkin for comments and suggestions,
 *  wje_lq for referring links to helpful documents and
 *  ntubski for suggestions, simplifications and enhancements
 *
 * Arranged by konsolebox
 * Copyright Free / Public Domain
 * Aug. 17, 2009
 */


/* CONFIGURATIONS */

#define CONFIG_CWDBUFFERSIZE 512
#define CONFIG_MAXTOKENS 64
#define CONFIG_CHECKTOKENS

/* if set, gather all tokens to a single string then print it;
  write() or fwrite() will be used instead of fputs() */

#define CONFIG_ONESHOT

/* use fwrite() instead of write() */

#undef CONFIG_USEFWRITE

/* use fputs() instead of fwrite() or write() in static null-terminated strings */

#undef CONFIG_USEFPUTS

/* use variable length arrays instead of alloca() and malloc() */

#define CONFIG_USEVARLENGTHARRAYS

/* if CONFIG_USEVARLENGTHARRAYS is not set, use alloca() instead of malloc() */

#define CONFIG_USEALLOCA

/* if one of CONFIG_USEVARLENGTHARRAYS and CONFIG_USEALLOCA is set,
  check the size of the buffer before allocating */

#define CONFIG_MAXOUTPUTBUFFERSIZE 1024
#define CONFIG_CHECKOUTPUTBUFFERSIZE

/* use strcpy() instead of memcpy() */

#undef CONFIG_USESTRCPY


/* REFERENCE OF DEFAULTS */

// CONFIG_CWDBUFFERSIZE        512
// CONFIG_MAXTOKENS            64
// CONFIG_CHECKTOKENS          defined
// CONFIG_ONESHOT              defined
// CONFIG_USEFWRITE            undefined
// CONFIG_USEFPUTS              undefined
// CONFIG_USEVARLENGTHARRAYS    defined
// CONFIG_USEALLOCA            defined
// CONFIG_MAXOUTPUTBUFFERSIZE  1024
// CONFIG_CHECKOUTPUTBUFFERSIZE defined
// CONFIG_USESTRCPY            undefined


/* APPLICATION */

#include <unistd.h>  // getcwd(), write()

#if defined(CONFIG_ONESHOT)
#        include <string.h>  // strcpy(), memcpy()
#endif

#if (defined(CONFIG_ONESHOT) && !defined(CONFIG_USEVARLENGTHARRAYS) && !defined(CONFIG_USEALLOCA))
#        include <stdlib.h>  // malloc(), free()
#endif

#if (!defined(CONFIG_ONESHOT) || defined(CONFIG_USEFWRITE) || defined(CONFIG_USEFPUTS))
#        include <stdio.h>  // fputs(), fwrite(), stdout, stderr
#endif

#ifdef CONFIG_USEFWRITE
#        define STDOUT stdout
#        define STDERR stderr
#        define WRITE(F, T, S) fwrite((const void *) T, S, 1, F);
#else
#        define STDOUT 1
#        define STDERR 2
#        define WRITE(F, T, S) write(F, (const void *) T, S);
#endif

#ifdef CONFIG_USEFPUTS
#        define PRINT(T) fputs(T, stdout);
#        define REPORTFAILURE(T) fputs(T, stderr);
#else
#        define PRINT(T) WRITE(STDOUT, T, (sizeof(T) - sizeof((char) '\0')));
#        define REPORTFAILURE(T) WRITE(STDERR, T, (sizeof(T) - sizeof((char) '\0')));
#endif


int main (int argc, char ** argv) {
        register char * p;
        char * tokens [CONFIG_MAXTOKENS];
        unsigned int t;
        char * base;
        char cwd [CONFIG_CWDBUFFERSIZE];
        int endWithSlash;  // register_or_static int ...;

        /* just print the current directory if no argument was passed
      or if argument is empty */

        if (argc < 2 || *(base = argv[1]) == '\0') {
                if (getcwd(cwd, CONFIG_CWDBUFFERSIZE) == 0x00)
                        goto failedToGetCwd;

                register unsigned int l = strlen(cwd);

                if (l == 1) {
                        WRITE(STDOUT, "/.", 2);
                        goto return0;
                }

                WRITE(STDOUT, cwd, l);
                goto return0;
        }

        /* split everything to tokens */

        {

#ifdef CONFIG_CHECKTOKENS
#        define TOKENSPLUS(T) \
                        if (t == CONFIG_MAXTOKENS) \
                                goto failOnTooManyTokens; \
                        tokens[t++] = T;
#else
#        define TOKENSPLUS(T) \
                        tokens[t++] = T;
#endif

                t = 0;
                endWithSlash = 1;

                p = base;
                register char c = *p;

                if (c != '/') {
                        /* path is relative; include the current directory */

                        if (getcwd(cwd, CONFIG_CWDBUFFERSIZE) == 0x00)
                                goto failedToGetCwd;

                        if (cwd[1]) {
                                // push p; push c;

                                tokens[0] = p = &cwd[1];
                                ++t;

                                for (;;) {
                                        c = *++p;

                                        if (c == '\0')
                                                break;

                                        if (c != '/')
                                                continue;

                                        *p = '\0';

                                        /* since everything after '/' is always not '\0' ... */

                                        TOKENSPLUS(++p);

                                        continue;
                                }

                                /* would have been optional if another set of registers
                                  was used above but it's ok */

                                // pop c; pop p;

                                p = base;
                                c = *p;

                                goto tokenLoop_dotCheck;
                        }

                        goto tokenLoop_dotCheck;
                }

        tokenLoop_start:

                c = *++p;

                if (c == '\0')
                        goto tokenLoop_finish;

                if (c == '/')
                        goto tokenLoop_start;

        tokenLoop_dotCheck:

                if (c == '.') {
                        c = *++p;

                        if (c == '/')
                                goto tokenLoop_start;

                        if (c == '\0')
                                goto tokenLoop_finish_noEndSlash;

                        if (c == '.') {
                                c = *++p;

                                if (c == '/') {
                                        if (t)
                                                --t;
                                        goto tokenLoop_start;
                                }

                                if (c == '\0') {
                                        if (t)
                                                --t;
                                        goto tokenLoop_finish_noEndSlash;
                                }

                                TOKENSPLUS(&p[-2]);

                                goto tokenLoop_skipPlus;
                        }

                        TOKENSPLUS(&p[-1]);

                        goto tokenLoop_skipPlus;
                }

                TOKENSPLUS(p);

        tokenLoop_skipPlus:

                do {
                        c = *++p;

                        if (c == '/') {
                                *p = '\0';
                                goto tokenLoop_start;
                        }
                } while (c);

        tokenLoop_finish_noEndSlash:

                endWithSlash = 0;

        tokenLoop_finish:
                ;
        }

        /* send to stdout */

        if (t) {
#if defined(CONFIG_ONESHOT)
                /* recombine all the tokens to an output buffer and print it */

#        if defined(CONFIG_USEVARLENGTHARRAYS)
                unsigned int tokens_length [t];

#        elif defined(CONFIG_USEALLOCA)
                unsigned int * tokens_length = alloca(t);

#        else
                unsigned int tokens_length [CONFIG_MAXTOKENS];

#        endif

                unsigned int i = 0;
                unsigned int outputBufferSize = 0;

                do {
                        register unsigned int l = strlen(tokens[i]);
                        tokens_length[i] = l;
                        outputBufferSize += l;
                } while (++i < t);

#        ifdef CONFIG_USESTRCPY
                outputBufferSize += t + 2;
                  // t * sizeof((char) '/') [+ sizeof((char) '/')] + sizeof((char) '/0')
#        else
                outputBufferSize += t + 1;
                  // t * sizeof((char) '/') [+ sizeof((char) '/'])
#        endif

#if (defined(CONFIG_CHECKOUTPUTBUFFERSIZE) && (defined(CONFIG_USEVARLENGTHARRAYS) || defined(CONFIG_USEALLOCA)))
                if (outputBufferSize > CONFIG_MAXOUTPUTBUFFERSIZE)
                        goto failOnLargeOutputBufferSize;
#endif

#        if defined(CONFIG_USEVARLENGTHARRAYS)
                char outputBuffer[outputBufferSize];

#        elif defined(CONFIG_USEALLOCA)
                char * outputBuffer = alloca(outputBufferSize);

#        else
#                define FLAG_USINGMALLOC

                char * outputBuffer;

                if ((outputBuffer = (char *) malloc(outputBufferSize)) == 0x00)
                        goto failedToMallocOutputBuffer;
#        endif

                i = 0;
                p = outputBuffer;

                do {
                        *p++ = '/';
                        register unsigned int l = tokens_length[i];

#        ifdef CONFIG_USESTRCPY
                        strcpy(p, tokens[i]);
#        else
                        memcpy(p, tokens[i], l);
#        endif

                        p += l;
                } while (++i < t);

                if (endWithSlash) {
                        *p++ = '/';
                }

                WRITE(STDOUT, outputBuffer, (p - outputBuffer));

#        ifdef FLAG_USINGMALLOC
                free(outputBuffer);
#        endif

#else
                unsigned int i = 0;

                do {
                        p = tokens[i];

                        if (p == base) {
                                fputs("/", stdout);
                                fputs(p, stdout);
                                continue;
                        }

                        --p;
                        *p = '/';
                        fputs(p, stdout);
                } while (++i < t);

                if (endWithSlash == 0)
                        goto return0;

                fputs("/", stdout);
#endif

                goto return0;
        }

        if (endWithSlash) {
                PRINT("/");
                goto return0;
        }

        PRINT("/.");
        goto return0;

failedToGetCwd:
        REPORTFAILURE("failed to get current path.\n");
        goto return1;

failOnTooManyTokens:
        REPORTFAILURE("argument expands to too many tokens.\n");
#ifdef CONFIG_CHECKOUTPUTBUFFERSIZE
        goto return1;

failOnLargeOutputBufferSize:
        REPORTFAILURE("argument expands to too many tokens.\n");
#endif
#ifdef FLAG_USINGMALLOC
        goto return1;

failedToMallocOutputBuffer:
        REPORTFAILURE("unable to allocate space for output buffer\n");
#endif

return1:
        return 1;

return0:
        return 0;
}

I can't really use inlines as they are not supported by default in compilers. It's better this way and it's still the same anyway so it's ok.

Thanks for the help everyone. Finally. Now this is what I meant :).

ntubski 08-18-2009 06:19 PM

Quote:

Not sure if that's really the case since builtins of memcpy() and strlen() work and they don't depend on 16 bytes blocks.
My point was not that they can't be inlined unless 16 byte blocks are used, my point was that the inlined code probably won't make use of sse instructions since it has no guarantees that the data is in 16 byte blocks. And when I lookat the code generated by the __builtin_strlen and __builtin_memcpy they don't use sse instructions. I also noticed that __builtin_memcpy wasn't expanded unless I passed a constant for the length argument.

Quote:

* The code was created with speed in mind and is best compiled with -Os.
But -Os prefers small size over speed... :scratch:


Quote:

Sorry I can't help but use gotos here
I think you've made things needlessly obfuscated, and I'm skeptical as to whether there are any speed benefits from this. Since the program finishes so quickly there's no way to perform useful benchmarks anyways.

Quote:

Now this is what I meant :).
:( This code just makes me sad. :cry:

konsolebox 08-18-2009 11:55 PM

Quote:

Originally Posted by ntubski (Post 3648663)
My point was not that they can't be inlined unless 16 byte blocks are used, my point was that the inlined code probably won't make use of sse instructions since it has no guarantees that the data is in 16 byte blocks. And when I lookat the code generated by the __builtin_strlen and __builtin_memcpy they don't use sse instructions. I also noticed that __builtin_memcpy wasn't expanded unless I passed a constant for the length argument.

I saw it expand with an instruction like REPZ? Is it not a SSE instruction?
Quote:

Originally Posted by ntubski (Post 3648663)
But -Os prefers small size over speed... :scratch:

The gotos are there so that there will be the same output as -O2 even though -O2 is not used. With this, the code will no longer be dependent to optimizations. I would have still preferred -O2 but there is a problem with it. I think it should no longer repeat same sequences of return statements as it is not really critical with speed; it will only make the code bigger. -Os here in this program I think will mean more speed and not lesser size.

Quote:

I think you've made things needlessly obfuscated, and I'm skeptical as to whether there are any speed benefits from this. Since the program finishes so quickly there's no way to perform useful benchmarks anyways.

:( This code just makes me sad. :cry:
I hope you reconsider this with my explanation above. I just want to make it *explicitly* fast as much as possible and not depend with compilers. If it's wrong, I'll try revising the code again.

ntubski 08-20-2009 01:49 PM

Quote:

Originally Posted by konsolebox (Post 3648899)
I saw it expand with an instruction like REPZ? Is it not a SSE instruction?

nope.

Quote:

I just want to make it *explicitly* fast as much as possible and not depend with compilers.
You're solving a problem you don't have: the entire program takes close to 0 seconds to run, these "optimizations" can therefore at most save roughly 0 seconds of runtime, so it's obviously a bad tradeoff. Furthermore, since you can't measure accurately it's even possible that you have created a slower program.

konsolebox 08-21-2009 03:42 AM

Quote:

Originally Posted by ntubski (Post 3651254)
nope.

But how come it's available in -march=i686 and -march=pentium4 and not in generic? Regardless, memcpy and strlen still expands.
Quote:

Furthermore, since you can't measure accurately it's even possible that you have created a slower program.
Like in what part of the code was not accurate? We can always make it better if that's the only problem.
Quote:

You're solving a problem you don't have: the entire program takes close to 0 seconds to run, these "optimizations" can therefore at most save roughly 0 seconds of runtime, so it's obviously a bad tradeoff.
I think I'm no longer being reasonable. Perhaps. But we really can't be sure if the optimization of the program is not going to give any benefit since the program will be called many times in my script.

But regardless of that, I don't think that there's something really wrong that's going to happen if I don't revoke what it was that I have decided. I still think that it's just better to finish what it was that I started with what I have decided.

ntubski 08-21-2009 05:41 PM

Quote:

But how come it's available in -march=i686 and -march=pentium4 and not in generic? Regardless, memcpy and strlen still expands.
I guess various instructions have different efficiency depending on the cpu.
Code:

$ cat builtins.c
#include <stdio.h>

char str[80], str2[80];
int len;

int main()
{
    fgets(str, sizeof str, stdin);

    len = __builtin_strlen(str);
    __builtin_memcpy(str, str2, len);
    return 0;
}
$ gcc -g -march=pentium4 builtins.c -o builtins-pentium4 -O3
$ gcc -g -march=i386 builtins.c -o builtins-386 -O3
$ gcc -g -march=i686 builtins.c -o builtins-686 -O3
$ objdump --prefix-addresses -S builtins-pentium4 | less
...
    len = __builtin_strlen(str);
08048453 <main+0x2f> mov    $0x80496a0,%edi
08048458 <main+0x34> xor    %eax,%eax
0804845a <main+0x36> mov    $0xffffffff,%ecx
0804845f <main+0x3b> repnz scas %es:(%edi),%al
08048461 <main+0x3d> not    %ecx
08048463 <main+0x3f> sub    $0x1,%ecx
08048466 <main+0x42> mov    %ecx,0x8049680
    __builtin_memcpy(str, str2, len);
0804846c <main+0x48> mov    %ecx,0x8(%esp)
08048470 <main+0x4c> movl  $0x8049700,0x4(%esp)
08048478 <main+0x54> movl  $0x80496a0,(%esp)
0804847f <main+0x5b> call  08048358 <memcpy@plt>
...
$ objdump --prefix-addresses -S builtins-386 | less
...
    len = __builtin_strlen(str);
08048419 <main+0x25> mov    $0x8049660,%edx
0804841e <main+0x2a> mov    %edx,%edi
08048420 <main+0x2c> xor    %eax,%eax
08048422 <main+0x2e> mov    $0xffffffff,%ecx
08048427 <main+0x33> repnz scas %es:(%edi),%al
08048429 <main+0x35> not    %ecx
0804842b <main+0x37> dec    %ecx
0804842c <main+0x38> mov    %ecx,0x8049640
    __builtin_memcpy(str, str2, len);
08048432 <main+0x3e> mov    $0x80496c0,%esi
08048437 <main+0x43> mov    %edx,%edi
08048439 <main+0x45> rep movsb %ds:(%esi),%es:(%edi)
...
$ objdump --prefix-addresses -S builtins-686 | less
...
    len = __builtin_strlen(str);
0804845e <main+0x2e> mov    $0x80496e0,%ecx
08048463 <main+0x33> mov    (%ecx),%eax
08048465 <main+0x35> add    $0x4,%ecx
08048468 <main+0x38> lea    -0x1010101(%eax),%edx
0804846e <main+0x3e> not    %eax
08048470 <main+0x40> and    %eax,%edx
08048472 <main+0x42> and    $0x80808080,%edx
08048478 <main+0x48> je    08048463 <main+0x33>
0804847a <main+0x4a> mov    %edx,%eax
0804847c <main+0x4c> shr    $0x10,%eax
0804847f <main+0x4f> test  $0x8080,%edx
08048485 <main+0x55> cmove  %eax,%edx
08048488 <main+0x58> lea    0x2(%ecx),%eax
0804848b <main+0x5b> cmove  %eax,%ecx
0804848e <main+0x5e> add    %dl,%dl
08048490 <main+0x60> sbb    $0x3,%ecx
08048493 <main+0x63> sub    $0x80496e0,%ecx
08048499 <main+0x69> mov    %ecx,0x80496c0
    __builtin_memcpy(str, str2, len);
0804849f <main+0x6f> mov    %ecx,0x8(%esp)
080484a3 <main+0x73> movl  $0x8049740,0x4(%esp)
080484ab <main+0x7b> movl  $0x80496e0,(%esp)
080484b2 <main+0x82> call  08048358 <memcpy@plt>
...


konsolebox 08-29-2009 10:00 AM

Quote:

Originally Posted by ntubski (Post 3631636)
Why don't you ask for help optimizing the shell script...

Just made the first upload of the scripts. You might want to take a look at it here: http://sourceforge.net/projects/loader/.

ntubski 08-29-2009 12:38 PM

Whoa :eek:

There's a lot of code there, I sincerely doubt that replacing the abspath functions will make a noticeable difference, but it's hard to say without some typical input files to test things with. If you are serious about optimizing I would go back to my earlier suggestion: rewrite the program to handle multiple paths per run, perhaps using a fifo or socket.

I'm wondering why you have a hashing function in your compiler, aren't all awk arrays hash tables already?

konsolebox 08-30-2009 08:32 AM

Quote:

Originally Posted by ntubski (Post 3662135)
Whoa :eek:

Took me really much time to finish this. :)
Quote:

There's a lot of code there, I sincerely doubt that replacing the abspath functions will make a noticeable difference, but it's hard to say without some typical input files to test things with.
That's probably the reason why I really can't have much for a choice. :)
I have test scripts but I didn't include it in the package. It's not really necessary to include them I guess.
Quote:

If you are serious about optimizing I would go back to my earlier suggestion: rewrite the program to handle multiple paths per run, perhaps using a fifo or socket.
Probably but I'm not sure if that's really applicable since no part of the code requires more than 1 path at a time.
Quote:

I'm wondering why you have a hashing function in your compiler, aren't all awk arrays hash tables already?
Do you mean makehash()? I just use it to create hashes like the ones created by md5sum and the likes. I use it to create functions that will represent call files.

By the way I finally finished the Simpler version as well. Loader-Simpler includes the extended functions loadx(), includex() and callx(). They acknowledge patterns and expressions.

The last will be the full and non-stripped version but it's the most difficult and surely it will not be soon.. though.

konsolebox 08-31-2009 04:26 AM

Quote:

Originally Posted by ntubski (Post 3662135)
If you are serious about optimizing I would go back to my earlier suggestion: rewrite the program to handle multiple paths per run, perhaps using a fifo or socket.

I think I know now what you mean but I think it would be better if I put that on a different package since it appears to be more like a hack and I'm not sure if it will be stable as a generic implementation for shells.

ntubski 08-31-2009 05:40 PM

Quote:

Originally Posted by konsolebox (Post 3662791)
Quote:

There's a lot of code there, I sincerely doubt that replacing the abspath functions will make a noticeable difference, but it's hard to say without some typical input files to test things with.
That's probably the reason why I really can't have much for a choice. :)

Um, I can't parse this sentence. :confused:
Quote:

I have test scripts but I didn't include it in the package. It's not really necessary to include them I guess.
But you have some example that runs slow (which is why you asked for optimization help), right? It would be interesting to see that.

konsolebox 09-01-2009 04:56 AM

Quote:

Originally Posted by ntubski (Post 3664583)
Um, I can't parse this sentence. :confused:

never mind that then.. sorry if my english is bad :D
Quote:

But you have some example that runs slow (which is why you asked for optimization help), right? It would be interesting to see that.
they're pretty unorderly but ok.. i'll post it later.

konsolebox 09-02-2009 03:28 AM

Quote:

Originally Posted by ntubski (Post 3664583)
But you have some example that runs slow (which is why you asked for optimization help), right? It would be interesting to see that.

About the scripts? Not really. I'm pretty happy with what they are right now and they appear to be already stable and consistent. After many revisions, I finally found their final form (probably the most optimized and most balanced form). And nothing has changed much since the first release (only descriptions and documents except for the generic loader where I didn't test its failure function).

But of course and nevertheless, I can never be sure of that and suggestions like yours (esp. from someone that helped me a lot already) will always be welcomed.

Just made the cleaner version of the test scripts and uploaded them. Same site at sourceforge in the extra folder. Good luck and enjoy :D.


All times are GMT -5. The time now is 08:58 AM.