LinuxQuestions.org - Fastest MEMMEM!?

- Programming (https://www.linuxquestions.org/questions/programming-9/)

- - Fastest MEMMEM!? (https://www.linuxquestions.org/questions/programming-9/fastest-memmem-4175481626/)

Fastest MEMMEM!?

Hi *nix guys,
here I want to ask for help in finding/sharing the fastest search function when a block is needed to be found in another block, since I am a Windows (don't ask) user and have little experience in *nix console, yes the MEMMEM which is missing in C libraries under Windows (who knows why).

Simply put, do you know a better function than mine, 'Railgun_Sekireigan_Bari', I wrote an article at:
http://www.codeproject.com/Articles/...-function-in-C

The source is this:

Code:

// Version: This is Railgun_Sekireigan revision 2, copyleft 2013-Oct-16, Kaze

// Purpose: This is optimized strstr-like (memmem in fact) function for short needles up to 255 bytes ... and beyond.

// Caution: For better speed the case 'if (cbPattern==1)' was removed, so Pattern must be longer than 1 char.

#define _rotl_KAZE(x, n) (((x) << (n)) | ((x) >> (32-(n))))

#define HaystackThresholdSekirei 961 // Quadruplet works up to this value, if bigger then BMH2 takes over.

#define NeedleThresholdBIGSekirei 12+40 // Should be bigger than 'HasherezadeOrder'. BMH2 works up to this value (inclusive).

#define HashTableSizeSekirei 17-1 // In fact the real size is -3, because it is BITwise, when 17-3=14 it means 16KB, (17-1)-3=13 it means 8KB.

char * Railgun_Sekireigan_Bari (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern)

{

        char * pbTargetMax = pbTarget + cbTarget;

        register uint32_t ulHashPattern;

        register uint32_t ulHashTarget;

        signed long count;

        signed long countSTATIC;



        unsigned char SINGLET;

        uint32_t Quadruplet2nd;

        uint32_t Quadruplet3rd;

        uint32_t Quadruplet4th;



        uint32_t AdvanceHopperGrass;



        long i; //BMH needed

        int a, j;

        unsigned char bm_Horspool_Order2[256*256]; // BMHSS(Elsiane) needed, 'char' limits patterns to 255, if 'long' then table becomes 256KB, grrr.

        uint32_t Gulliver; // or unsigned char or unsigned short



        unsigned char bm_Hasherezade_HASH[1<<(HashTableSizeSekirei-3)];

        uint32_t hash32;

        uint32_t hash32B;

        uint32_t hash32C;



        if (cbPattern > cbTarget) return(NULL);



        if ( cbPattern<4 ) { 



                pbTarget = pbTarget+cbPattern;

                ulHashPattern = ( (*(char *)(pbPattern))<<8 ) + *(pbPattern+(cbPattern-1));

                if ( cbPattern==3 ) {

                        for ( ;; ) {

                                if ( ulHashPattern == ( (*(char *)(pbTarget-3))<<8 ) + *(pbTarget-1) ) {

                                        if ( *(char *)(pbPattern+1) == *(char *)(pbTarget-2) ) return((pbTarget-3));

                                }

                                if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) { 

                                        pbTarget++;

                                        if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) pbTarget++;

                                }

                                pbTarget++;

                                if (pbTarget > pbTargetMax) return(NULL);

                        }

                } else {

                }

                for ( ;; ) {

                        if ( ulHashPattern == ( (*(char *)(pbTarget-2))<<8 ) + *(pbTarget-1) ) return((pbTarget-2));

                        if ( (char)(ulHashPattern>>8) != *(pbTarget-1) ) pbTarget++;

                        pbTarget++;

                        if (pbTarget > pbTargetMax) return(NULL);

                }



        } else {

                if (cbTarget<HaystackThresholdSekirei) { // This value is arbitrary (don't know how exactly), it ensures (at least must) better performance than 'Boyer_Moore_Horspool'.



                pbTarget = pbTarget+cbPattern;

                ulHashPattern = *(uint32_t *)(pbPattern);

                SINGLET = ulHashPattern & 0xFF;

                Quadruplet2nd = SINGLET<<8;

                Quadruplet3rd = SINGLET<<16;

                Quadruplet4th = SINGLET<<24;

                for ( ;; ) {

                        AdvanceHopperGrass = 0;

                        ulHashTarget = *(uint32_t *)(pbTarget-cbPattern);

                        if ( ulHashPattern == ulHashTarget ) { // Three unnecessary comparisons here, but 'AdvanceHopperGrass' must be calculated - it has a higher priority.

                                count = cbPattern-1;

                                while ( count && *(char *)(pbPattern+(cbPattern-count)) == *(char *)(pbTarget-count) ) {

                                        if ( cbPattern-1==AdvanceHopperGrass+count && SINGLET != *(char *)(pbTarget-count) ) AdvanceHopperGrass++;

                                        count--;

                                }

                                if ( count == 0) return((pbTarget-cbPattern));

                        } else { // The goal here: to avoid memory accesses by stressing the registers.

                                if ( Quadruplet2nd != (ulHashTarget & 0x0000FF00) ) {

                                        AdvanceHopperGrass++;

                                        if ( Quadruplet3rd != (ulHashTarget & 0x00FF0000) ) {

                                                AdvanceHopperGrass++;

                                                if ( Quadruplet4th != (ulHashTarget & 0xFF000000) ) AdvanceHopperGrass++;

                                        }

                                }

                        }

                        AdvanceHopperGrass++;

                        pbTarget = pbTarget + AdvanceHopperGrass;

                        if (pbTarget > pbTargetMax) return(NULL);

                }



                } else { //if (cbTarget<HaystackThresholdSekirei)



        if ( cbPattern<=NeedleThresholdBIGSekirei ) { 



                        countSTATIC = cbPattern-2-2;

                        ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes

                        //ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes

                        i=0;

                        //for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized

                        //for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=j; // Rightmost appearance/position is needed

                        for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]=0;}

                        for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=1;

                        while (i <= cbTarget-cbPattern) {

                                Gulliver = 1;

                                if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) {

                                        //if ( Gulliver == 1 ) { // Means the Building-Block order 2 is found somewhere i.e. NO MAXIMUM SKIP

                                                if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {

                                                        count = countSTATIC; // Last two chars already matched, to be fixed with -2

                                                        while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )

                                                                count--;

                                                        if ( count == 0) return(pbTarget+i);

                                                }

                                        //}

                                        // Trying to strengthen the Skip performance... here ONLY one additional lookup, for better/longer jumps more such lookups, unrolled to be added. 

                                        if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2;

                                } else Gulliver = cbPattern-(2-1);

                                i = i + Gulliver;

                                //GlobalI++; // Comment it, it is only for stats.

                        }

                        return(NULL);



        } else { // if ( cbPattern<=NeedleThresholdBIGSekirei )

// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() [

                        countSTATIC = cbPattern-2-2;



                        ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes

                        //ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes

                

                        i=0;



                        for (a=0; a < 1<<(HashTableSizeSekirei-3); a++) {bm_Hasherezade_HASH[a]= 0;} // to-do: 'memset' if not optimized

                        // cbPattern - Order + 1 i.e. number of BBs for 11 'fastest fox' 11-8+1=4: 'fastest ', 'astest f', 'stest fo', 'test fox'

                        for (j=0; j < cbPattern-12+1; j++) {

                                hash32 = (2166136261 ^ *(uint32_t *)(pbPattern+j+0)) * 709607;        

                                hash32B = (2166136261 ^ *(uint32_t *)(pbPattern+j+4)) * 709607;        

                                hash32C = (2166136261 ^ *(uint32_t *)(pbPattern+j+8)) * 709607;        

                                hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;

                                hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;

                                hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekirei))-1 );

                                bm_Hasherezade_HASH[hash32>>3]= bm_Hasherezade_HASH[hash32>>3] | (1<<(hash32&0x7));

                        }



                        while (i <= cbTarget-cbPattern) {

                                Gulliver = 1; // Assume minimal jump as initial value.

                                // The goal: to jump when the rightmost 8bytes (Order 8 Horspool) of window do not look like any of Needle prefixes i.e. are not to be found. This maximum jump equals cbPattern-(Order-1) or 11-(8-1)=4 for 'fastest fox' - a small one but for Needle 31 bytes the jump equals 31-(8-1)=24

                                //GlobalHashSectionExecution++; // Comment it, it is only for stats.

                                        hash32 = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+0)) * 709607;        

                                        hash32B = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+4)) * 709607;        

                                        hash32C = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+8)) * 709607;        

                                        hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;

                                        hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;

                                        hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekirei))-1 );

                                        if ( (bm_Hasherezade_HASH[hash32>>3] & (1<<(hash32&0x7))) ==0 )

                                                Gulliver = cbPattern-(12-1);



                                if ( Gulliver == 1 ) { // Means the Building-Block order 8/12 is found somewhere i.e. NO MAXIMUM SKIP

                                        if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {

                                                count = countSTATIC;

                                                while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )

                                                        count--;

                                                if ( count == 0) return(pbTarget+i);

                                        }

                                }

                                        i = i + Gulliver;

                                //GlobalI++; // Comment it, it is only for stats.

                        } // while (i <= cbTarget-cbPattern)

                        return(NULL);

// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() ]

        } // if ( cbPattern<=NeedleThresholdBIGSekirei )

                } //if (cbTarget<HaystackThresholdSekirei)

        } //if ( cbPattern<4 )

}

My wish is with help of more experienced coders 'Railgun' to become *nix gun of choice.
Also I will be pleased if someone benchmark it against (current MEMMEMs), I have no opportunity, neither modern computer nor knowledge of *nix environment.

Lucky I am to come up with a new even faster function - 'Railgun_Sekireigan_Piri'.

The graph showing the superiority of 'Piri':
http://www.linuxquestions.org/questi...1&d=1383501798

The fastest memmem(), known to me, comes here.

The source/benchmark is downloadable at my INTERNET drive:
https://1drv.ms/u/s!AmWWFXGMzDmEglwjlUtnMJrfhosK

It also is zipped (the 4 testfiles are not) and attached to the post.

The speed showdown has three facets:
- compares the 64bit code generated from GCC 5.10 versus Intel 15.0 compilers;
- compares two datasets - search speed through English texts versus Genome ACGT-type data;
- compares the tweaked Two-Way algorithm (implemented by Eric Blake) and adopted by GLIBC as memmem() versus my Railgun_Swampshine.

Note1: The GLIBC memmem() was taken from latest (2016-08-05) glibc 2.24 tar:
https://www.gnu.org/software/libc/
Note2: Eric Blake says that he enhanced the linearity of Two-Way by adding some sublinear paths, well, Railgun is all about sublinearity, so feel free to experiment with your own testfiles (worst-case-scenarios), just make such a file feed the compressor with it, then we will see how the LINEAR Two-Way behaves versus Railgun_Swampshine.
Note3: Just copy-and-paste 'Railgun_Swampshine' or 'Railgun_Ennearch' from the benchmark's source.

So the result on Core 2 Q9550s @2.83GHz:

Code:

---------------------------------------------------------------------------------------------

| testfile\Searcher                        | GNU/GLIBC memmem()    | Railgun_Swampshine    | 

|-------------------------------------------------------------------------------------------|

| Compiler                                  | Intel 15.0 | GCC 5.10 | Intel 15.0 | GCC 5.10 |

|-------------------------------------------------------------------------------------------|

| The_Project_Gutenberg_EBook_of_Don        |        190 |      226 |      1654 |    1729 |

| _Quixote_996_(ANSI).txt                  |            |          |            |          |

| 2,347,772 bytes                          |            |          |            |          |

|-------------------------------------------------------------------------------------------|

| The_Project_Gutenberg_EBook_of_Dokoe      |        582 |      760 |      3094 |    2898 |

| _by_Hakucho_Masamune_(Japanese_UTF-8).txt |            |          |            |          |

| 899,425 bytes                            |            |          |            |          |

|-------------------------------------------------------------------------------------------|

| Dragonfly_genome_shotgun_sequence        |        104 |      109 |        445 |      458 |

| _(ACGT_alphabet).fasta                    |            |          |            |          |

| 4,487,433 bytes                          |            |          |            |          |

|-------------------------------------------------------------------------------------------|

| LAOTZU_Wu_Wei_(BINARY).pdf                |        99 |      144 |        629 |      580 |

| 954,035 bytes                            |            |          |            |          |

|-------------------------------------------------------------------------------------------|

Note: The numbers represent the rate (bytes/s) at which patterns/needles 4,5,6,8,9,10,12,13,14,16,17,18,24 bytes long are memmemed into 4KB, 256KB, 1MB, 256MB long haystacks.
in fact, these numbers are the compression speed using LZSS and memmem() as matchfinder.

Two-Way is significantly slower than BMH Order 2, the speed-down is in range:
- for TEXTUAL ANSI alphabets: 1729/226= 7.6x
- for TEXTUAL UTF8 alphabets: 2898/760= 3.8x
- for TEXTUAL ACGT alphabets: 458/109= 4.2x
- for BINARY-esque alphabets: 580/144= 4.0x

For faster RAM, than mine @666MHz, and for haystacks multimegabytes long, the speedup goes beyond 8x.

The benchmark shows the real behavior (both latency and raw speed) of the memmem variants, I added also the Thierry Lecroq's Two-Way implementation:
http://www-igm.univ-mlv.fr/~lecroq/s...l#SECTION00260
However, Eric Blake's one is faster, so it was chosen for the speed showdown.

Once I measured the total length of traversed haystacks, and for files 100+MB long, it went ... quintillion of bytes i.e. petabytes - good torture it is.

Enfun!

The fastest memmem() is already the tuned Railgun_Swampshine_BailOut - avoiding second pattern comparison in BMH2 and pseudo-BMH4. It is called 'Trolldom', to signify its secondary purpose - to traverse fast via Trolllandia a.k.a. ugly/deformed haystacks causing quadratic behavior of most Boyer-Moore-Horspool variants. GNU/GLIBC memmem() is linearIC and sublinearIC at times while Railgun_Trolldom is mostly sublinearIC.

Railgun_Trolldom is slightly faster (both with Intel & GCC) than Railgun_Swampshine, this is mostly due to adding a bitwise BMH order 2 (8KB table overhead instead of 64KB) path - for haystacks <77777 bytes long - the warm-up is faster.

Code:

So the result on Core 2 Q9550s @2.83GHz DDR2 @666MHz / i5-2430M @3.00GHz DDR3 @666MHz:

--------------------------------------------------------------------------------------------------------------------------------

| Searcher                                  | GNU/GLIBC memmem()        | Railgun_Swampshine      | Railgun_Trolldom          | 

|--------------------------------------------------------------------------------------------------|---------------------------|

| Testfile\Compiler                        | Intel 15.0 | GCC 5.10    | Intel 15.0 | GCC 5.10    | Intel 15.0  | GCC 5.10    |

|------------------------------------------------------------------------------------------------------------------------------|

| Size: 27,703 bytes                        |    4506/- |  5330/14725 |    13198/- | 11581/15171 | 19105/22449 | 15493/21642 |

| Name: An_Interview_with_Carlos_Castaneda.TXT                          |            |            |            |            |

| LATENCY-WISE: Number of 'memmem()' Invocations: 308,062              |            |            |            |            |

| THROUGHPUT-WISE: Number of Total bytes Traversed: 3,242,492,648      |            |            |            |            |

|------------------------------------------------------------------------------------------------------------------------------|

| Size: 2,347,772 bytes                    |      190/- |      226/244 |    1654/- |  1729/1806 |  1794/1822 |  1743/1809 |

| Name: Gutenberg_EBook_Don_Quixote_996_(ANSI).txt                      |            |            |            |            |

| LATENCY-WISE: Number of 'memmem()' Invocations: 14,316,954            |            |            |            |            |

| THROUGHPUT-WISE: Number of Total bytes Traversed: 6,663,594,719,173  |            |            |            |            |

|------------------------------------------------------------------------------------------------------------------------------|

| Size: 899,425 bytes                      |      582/- |      760/816 |    3094/- |  2898/3088 |  3255/3289 |  2915/3322 |

| Name: Gutenberg_EBook_Dokoe_by_Hakucho_Masamune_(Japanese_UTF8).txt  |            |            |            |            |

| LATENCY-WISE: Number of 'memmem()' Invocations: 3,465,806            |            |            |            |            |

| THROUGHPUT-WISE: Number of Total bytes Traversed: 848,276,034,315    |            |            |            |            |

|------------------------------------------------------------------------------------------------------------------------------|

| Size: 4,487,433 bytes                    |      104/- |      109/116 |      445/- |    458/417 |    450/411 |    467/425 |

| Name: Dragonfly_genome_shotgun_sequence_(ACGT_alphabet).fasta        |            |            |            |            |

| LATENCY-WISE: Number of 'memmem()' Invocations: 20,540,375            |            |            |            |            |

| THROUGHPUT-WISE: Number of Total bytes Traversed: 13,592,530,857,131  |            |            |            |            |

|------------------------------------------------------------------------------------------------------------------------------|

| Size: 954,035 bytes                      |      99/- |      144/144 |      629/- |    580/682 |    634/807 |    585/725 |

| Name: LAOTZU_Wu_Wei_(BINARY).pdf                                      |            |            |            |            |

| LATENCY-WISE: Number of 'memmem()' Invocations: 27,594,933            |            |            |            |            |

| THROUGHPUT-WISE: Number of Total bytes Traversed: 8,702,455,122,519  |            |            |            |            |

|------------------------------------------------------------------------------------------------------------------------------|

| Size: 15,583,440 bytes                    |        -/- |          -/- |        -/- |    663/771 |    675/778 |    663/757 |

| Name: Arabian_Nights_complete.html                                    |            |            |            |            |

| LATENCY-WISE: Number of 'memmem()' Invocations: 72,313,262            |            |            |            |            |

| THROUGHPUT-WISE: Number of Total bytes Traversed: 105,631,163,854,099 |            |            |            |            |

--------------------------------------------------------------------------------------------------------------------------------

The benchmark is downloadable at my Internet drive and my site:

https://1drv.ms/u/s!AmWWFXGMzDmEgl3izdwhSnIq0Lv5
www.sanmayce.com/Downloads/Nakamichi_Kintaro++_source_executables_64bit_(GCC510-vs-Intel150)_(TW-vs-RG)_BENCHMARK.zip

Since it is pure C, no intrinsics used, perhaps it will work unproblematically on most *nix systems.

To the attached archive I included the C source along with its listing in 28 pages in PDF format - to be printed and digested more easily from search techniques aficionados. ENFUN!