LinuxQuestions.org

LinuxQuestions.org (/questions/)
-   Programming (https://www.linuxquestions.org/questions/programming-9/)
-   -   Fastest MEMMEM!? (https://www.linuxquestions.org/questions/programming-9/fastest-memmem-4175481626/)

Sanmayce 10-21-2013 12:41 PM

Fastest MEMMEM!?
 
Hi *nix guys,
here I want to ask for help in finding/sharing the fastest search function when a block is needed to be found in another block, since I am a Windows (don't ask) user and have little experience in *nix console, yes the MEMMEM which is missing in C libraries under Windows (who knows why).

Simply put, do you know a better function than mine, 'Railgun_Sekireigan_Bari', I wrote an article at:
http://www.codeproject.com/Articles/...-function-in-C

The source is this:
Code:

// Version: This is Railgun_Sekireigan revision 2, copyleft 2013-Oct-16, Kaze
// Purpose: This is optimized strstr-like (memmem in fact) function for short needles up to 255 bytes ... and beyond.
// Caution: For better speed the case 'if (cbPattern==1)' was removed, so Pattern must be longer than 1 char.
#define _rotl_KAZE(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define HaystackThresholdSekirei 961 // Quadruplet works up to this value, if bigger then BMH2 takes over.
#define NeedleThresholdBIGSekirei 12+40 // Should be bigger than 'HasherezadeOrder'. BMH2 works up to this value (inclusive).
#define HashTableSizeSekirei 17-1 // In fact the real size is -3, because it is BITwise, when 17-3=14 it means 16KB, (17-1)-3=13 it means 8KB.
char * Railgun_Sekireigan_Bari (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern)
{
        char * pbTargetMax = pbTarget + cbTarget;
        register uint32_t ulHashPattern;
        register uint32_t ulHashTarget;
        signed long count;
        signed long countSTATIC;

        unsigned char SINGLET;
        uint32_t Quadruplet2nd;
        uint32_t Quadruplet3rd;
        uint32_t Quadruplet4th;

        uint32_t AdvanceHopperGrass;

        long i; //BMH needed
        int a, j;
        unsigned char bm_Horspool_Order2[256*256]; // BMHSS(Elsiane) needed, 'char' limits patterns to 255, if 'long' then table becomes 256KB, grrr.
        uint32_t Gulliver; // or unsigned char or unsigned short

        unsigned char bm_Hasherezade_HASH[1<<(HashTableSizeSekirei-3)];
        uint32_t hash32;
        uint32_t hash32B;
        uint32_t hash32C;

        if (cbPattern > cbTarget) return(NULL);

        if ( cbPattern<4 ) {

                pbTarget = pbTarget+cbPattern;
                ulHashPattern = ( (*(char *)(pbPattern))<<8 ) + *(pbPattern+(cbPattern-1));
                if ( cbPattern==3 ) {
                        for ( ;; ) {
                                if ( ulHashPattern == ( (*(char *)(pbTarget-3))<<8 ) + *(pbTarget-1) ) {
                                        if ( *(char *)(pbPattern+1) == *(char *)(pbTarget-2) ) return((pbTarget-3));
                                }
                                if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) {
                                        pbTarget++;
                                        if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) pbTarget++;
                                }
                                pbTarget++;
                                if (pbTarget > pbTargetMax) return(NULL);
                        }
                } else {
                }
                for ( ;; ) {
                        if ( ulHashPattern == ( (*(char *)(pbTarget-2))<<8 ) + *(pbTarget-1) ) return((pbTarget-2));
                        if ( (char)(ulHashPattern>>8) != *(pbTarget-1) ) pbTarget++;
                        pbTarget++;
                        if (pbTarget > pbTargetMax) return(NULL);
                }

        } else {
                if (cbTarget<HaystackThresholdSekirei) { // This value is arbitrary (don't know how exactly), it ensures (at least must) better performance than 'Boyer_Moore_Horspool'.

                pbTarget = pbTarget+cbPattern;
                ulHashPattern = *(uint32_t *)(pbPattern);
                SINGLET = ulHashPattern & 0xFF;
                Quadruplet2nd = SINGLET<<8;
                Quadruplet3rd = SINGLET<<16;
                Quadruplet4th = SINGLET<<24;
                for ( ;; ) {
                        AdvanceHopperGrass = 0;
                        ulHashTarget = *(uint32_t *)(pbTarget-cbPattern);
                        if ( ulHashPattern == ulHashTarget ) { // Three unnecessary comparisons here, but 'AdvanceHopperGrass' must be calculated - it has a higher priority.
                                count = cbPattern-1;
                                while ( count && *(char *)(pbPattern+(cbPattern-count)) == *(char *)(pbTarget-count) ) {
                                        if ( cbPattern-1==AdvanceHopperGrass+count && SINGLET != *(char *)(pbTarget-count) ) AdvanceHopperGrass++;
                                        count--;
                                }
                                if ( count == 0) return((pbTarget-cbPattern));
                        } else { // The goal here: to avoid memory accesses by stressing the registers.
                                if ( Quadruplet2nd != (ulHashTarget & 0x0000FF00) ) {
                                        AdvanceHopperGrass++;
                                        if ( Quadruplet3rd != (ulHashTarget & 0x00FF0000) ) {
                                                AdvanceHopperGrass++;
                                                if ( Quadruplet4th != (ulHashTarget & 0xFF000000) ) AdvanceHopperGrass++;
                                        }
                                }
                        }
                        AdvanceHopperGrass++;
                        pbTarget = pbTarget + AdvanceHopperGrass;
                        if (pbTarget > pbTargetMax) return(NULL);
                }

                } else { //if (cbTarget<HaystackThresholdSekirei)

        if ( cbPattern<=NeedleThresholdBIGSekirei ) {

                        countSTATIC = cbPattern-2-2;
                        ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
                        //ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
                        i=0;
                        //for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized
                        //for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=j; // Rightmost appearance/position is needed
                        for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]=0;}
                        for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=1;
                        while (i <= cbTarget-cbPattern) {
                                Gulliver = 1;
                                if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) {
                                        //if ( Gulliver == 1 ) { // Means the Building-Block order 2 is found somewhere i.e. NO MAXIMUM SKIP
                                                if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
                                                        count = countSTATIC; // Last two chars already matched, to be fixed with -2
                                                        while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
                                                                count--;
                                                        if ( count == 0) return(pbTarget+i);
                                                }
                                        //}
                                        // Trying to strengthen the Skip performance... here ONLY one additional lookup, for better/longer jumps more such lookups, unrolled to be added.
                                        if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2;
                                } else Gulliver = cbPattern-(2-1);
                                i = i + Gulliver;
                                //GlobalI++; // Comment it, it is only for stats.
                        }
                        return(NULL);

        } else { // if ( cbPattern<=NeedleThresholdBIGSekirei )
// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() [
                        countSTATIC = cbPattern-2-2;

                        ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
                        //ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
               
                        i=0;

                        for (a=0; a < 1<<(HashTableSizeSekirei-3); a++) {bm_Hasherezade_HASH[a]= 0;} // to-do: 'memset' if not optimized
                        // cbPattern - Order + 1 i.e. number of BBs for 11 'fastest fox' 11-8+1=4: 'fastest ', 'astest f', 'stest fo', 'test fox'
                        for (j=0; j < cbPattern-12+1; j++) {
                                hash32 = (2166136261 ^ *(uint32_t *)(pbPattern+j+0)) * 709607;       
                                hash32B = (2166136261 ^ *(uint32_t *)(pbPattern+j+4)) * 709607;       
                                hash32C = (2166136261 ^ *(uint32_t *)(pbPattern+j+8)) * 709607;       
                                hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;
                                hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;
                                hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekirei))-1 );
                                bm_Hasherezade_HASH[hash32>>3]= bm_Hasherezade_HASH[hash32>>3] | (1<<(hash32&0x7));
                        }

                        while (i <= cbTarget-cbPattern) {
                                Gulliver = 1; // Assume minimal jump as initial value.
                                // The goal: to jump when the rightmost 8bytes (Order 8 Horspool) of window do not look like any of Needle prefixes i.e. are not to be found. This maximum jump equals cbPattern-(Order-1) or 11-(8-1)=4 for 'fastest fox' - a small one but for Needle 31 bytes the jump equals 31-(8-1)=24
                                //GlobalHashSectionExecution++; // Comment it, it is only for stats.
                                        hash32 = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+0)) * 709607;       
                                        hash32B = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+4)) * 709607;       
                                        hash32C = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+8)) * 709607;       
                                        hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;
                                        hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;
                                        hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekirei))-1 );
                                        if ( (bm_Hasherezade_HASH[hash32>>3] & (1<<(hash32&0x7))) ==0 )
                                                Gulliver = cbPattern-(12-1);

                                if ( Gulliver == 1 ) { // Means the Building-Block order 8/12 is found somewhere i.e. NO MAXIMUM SKIP
                                        if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
                                                count = countSTATIC;
                                                while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
                                                        count--;
                                                if ( count == 0) return(pbTarget+i);
                                        }
                                }
                                        i = i + Gulliver;
                                //GlobalI++; // Comment it, it is only for stats.
                        } // while (i <= cbTarget-cbPattern)
                        return(NULL);
// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() ]
        } // if ( cbPattern<=NeedleThresholdBIGSekirei )
                } //if (cbTarget<HaystackThresholdSekirei)
        } //if ( cbPattern<4 )
}

My wish is with help of more experienced coders 'Railgun' to become *nix gun of choice.
Also I will be pleased if someone benchmark it against (current MEMMEMs), I have no opportunity, neither modern computer nor knowledge of *nix environment.

Sanmayce 11-03-2013 12:08 PM

1 Attachment(s)
Lucky I am to come up with a new even faster function - 'Railgun_Sekireigan_Piri'.

The graph showing the superiority of 'Piri':
http://www.linuxquestions.org/questi...1&d=1383501798

Sanmayce 08-13-2016 05:02 AM

1 Attachment(s)
The fastest memmem(), known to me, comes here.

The source/benchmark is downloadable at my INTERNET drive:
https://1drv.ms/u/s!AmWWFXGMzDmEglwjlUtnMJrfhosK

It also is zipped (the 4 testfiles are not) and attached to the post.

The speed showdown has three facets:
- compares the 64bit code generated from GCC 5.10 versus Intel 15.0 compilers;
- compares two datasets - search speed through English texts versus Genome ACGT-type data;
- compares the tweaked Two-Way algorithm (implemented by Eric Blake) and adopted by GLIBC as memmem() versus my Railgun_Swampshine.

Note1: The GLIBC memmem() was taken from latest (2016-08-05) glibc 2.24 tar:
https://www.gnu.org/software/libc/
Note2: Eric Blake says that he enhanced the linearity of Two-Way by adding some sublinear paths, well, Railgun is all about sublinearity, so feel free to experiment with your own testfiles (worst-case-scenarios), just make such a file feed the compressor with it, then we will see how the LINEAR Two-Way behaves versus Railgun_Swampshine.
Note3: Just copy-and-paste 'Railgun_Swampshine' or 'Railgun_Ennearch' from the benchmark's source.

So the result on Core 2 Q9550s @2.83GHz:

Code:

---------------------------------------------------------------------------------------------
| testfile\Searcher                        | GNU/GLIBC memmem()    | Railgun_Swampshine    |
|-------------------------------------------------------------------------------------------|
| Compiler                                  | Intel 15.0 | GCC 5.10 | Intel 15.0 | GCC 5.10 |
|-------------------------------------------------------------------------------------------|
| The_Project_Gutenberg_EBook_of_Don        |        190 |      226 |      1654 |    1729 |
| _Quixote_996_(ANSI).txt                  |            |          |            |          |
| 2,347,772 bytes                          |            |          |            |          |
|-------------------------------------------------------------------------------------------|
| The_Project_Gutenberg_EBook_of_Dokoe      |        582 |      760 |      3094 |    2898 |
| _by_Hakucho_Masamune_(Japanese_UTF-8).txt |            |          |            |          |
| 899,425 bytes                            |            |          |            |          |
|-------------------------------------------------------------------------------------------|
| Dragonfly_genome_shotgun_sequence        |        104 |      109 |        445 |      458 |
| _(ACGT_alphabet).fasta                    |            |          |            |          |
| 4,487,433 bytes                          |            |          |            |          |
|-------------------------------------------------------------------------------------------|
| LAOTZU_Wu_Wei_(BINARY).pdf                |        99 |      144 |        629 |      580 |
| 954,035 bytes                            |            |          |            |          |
|-------------------------------------------------------------------------------------------|

Note: The numbers represent the rate (bytes/s) at which patterns/needles 4,5,6,8,9,10,12,13,14,16,17,18,24 bytes long are memmemed into 4KB, 256KB, 1MB, 256MB long haystacks.
in fact, these numbers are the compression speed using LZSS and memmem() as matchfinder.

Two-Way is significantly slower than BMH Order 2, the speed-down is in range:
- for TEXTUAL ANSI alphabets: 1729/226= 7.6x
- for TEXTUAL UTF8 alphabets: 2898/760= 3.8x
- for TEXTUAL ACGT alphabets: 458/109= 4.2x
- for BINARY-esque alphabets: 580/144= 4.0x

For faster RAM, than mine @666MHz, and for haystacks multimegabytes long, the speedup goes beyond 8x.

The benchmark shows the real behavior (both latency and raw speed) of the memmem variants, I added also the Thierry Lecroq's Two-Way implementation:
http://www-igm.univ-mlv.fr/~lecroq/s...l#SECTION00260
However, Eric Blake's one is faster, so it was chosen for the speed showdown.

Once I measured the total length of traversed haystacks, and for files 100+MB long, it went ... quintillion of bytes i.e. petabytes - good torture it is.

Enfun!

Sanmayce 08-22-2016 11:22 AM

1 Attachment(s)
The fastest memmem() is already the tuned Railgun_Swampshine_BailOut - avoiding second pattern comparison in BMH2 and pseudo-BMH4. It is called 'Trolldom', to signify its secondary purpose - to traverse fast via Trolllandia a.k.a. ugly/deformed haystacks causing quadratic behavior of most Boyer-Moore-Horspool variants. GNU/GLIBC memmem() is linearIC and sublinearIC at times while Railgun_Trolldom is mostly sublinearIC.

Railgun_Trolldom is slightly faster (both with Intel & GCC) than Railgun_Swampshine, this is mostly due to adding a bitwise BMH order 2 (8KB table overhead instead of 64KB) path - for haystacks <77777 bytes long - the warm-up is faster.

Code:

So the result on Core 2 Q9550s @2.83GHz DDR2 @666MHz / i5-2430M @3.00GHz DDR3 @666MHz:
--------------------------------------------------------------------------------------------------------------------------------
| Searcher                                  | GNU/GLIBC memmem()        | Railgun_Swampshine      | Railgun_Trolldom          |
|--------------------------------------------------------------------------------------------------|---------------------------|
| Testfile\Compiler                        | Intel 15.0 | GCC 5.10    | Intel 15.0 | GCC 5.10    | Intel 15.0  | GCC 5.10    |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 27,703 bytes                        |    4506/- |  5330/14725 |    13198/- | 11581/15171 | 19105/22449 | 15493/21642 |
| Name: An_Interview_with_Carlos_Castaneda.TXT                          |            |            |            |            |
| LATENCY-WISE: Number of 'memmem()' Invocations: 308,062              |            |            |            |            |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 3,242,492,648      |            |            |            |            |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 2,347,772 bytes                    |      190/- |      226/244 |    1654/- |  1729/1806 |  1794/1822 |  1743/1809 |
| Name: Gutenberg_EBook_Don_Quixote_996_(ANSI).txt                      |            |            |            |            |
| LATENCY-WISE: Number of 'memmem()' Invocations: 14,316,954            |            |            |            |            |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 6,663,594,719,173  |            |            |            |            |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 899,425 bytes                      |      582/- |      760/816 |    3094/- |  2898/3088 |  3255/3289 |  2915/3322 |
| Name: Gutenberg_EBook_Dokoe_by_Hakucho_Masamune_(Japanese_UTF8).txt  |            |            |            |            |
| LATENCY-WISE: Number of 'memmem()' Invocations: 3,465,806            |            |            |            |            |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 848,276,034,315    |            |            |            |            |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 4,487,433 bytes                    |      104/- |      109/116 |      445/- |    458/417 |    450/411 |    467/425 |
| Name: Dragonfly_genome_shotgun_sequence_(ACGT_alphabet).fasta        |            |            |            |            |
| LATENCY-WISE: Number of 'memmem()' Invocations: 20,540,375            |            |            |            |            |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 13,592,530,857,131  |            |            |            |            |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 954,035 bytes                      |      99/- |      144/144 |      629/- |    580/682 |    634/807 |    585/725 |
| Name: LAOTZU_Wu_Wei_(BINARY).pdf                                      |            |            |            |            |
| LATENCY-WISE: Number of 'memmem()' Invocations: 27,594,933            |            |            |            |            |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 8,702,455,122,519  |            |            |            |            |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 15,583,440 bytes                    |        -/- |          -/- |        -/- |    663/771 |    675/778 |    663/757 |
| Name: Arabian_Nights_complete.html                                    |            |            |            |            |
| LATENCY-WISE: Number of 'memmem()' Invocations: 72,313,262            |            |            |            |            |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 105,631,163,854,099 |            |            |            |            |
--------------------------------------------------------------------------------------------------------------------------------

The benchmark is downloadable at my Internet drive and my site:

https://1drv.ms/u/s!AmWWFXGMzDmEgl3izdwhSnIq0Lv5
www.sanmayce.com/Downloads/Nakamichi_Kintaro++_source_executables_64bit_(GCC510-vs-Intel150)_(TW-vs-RG)_BENCHMARK.zip

Since it is pure C, no intrinsics used, perhaps it will work unproblematically on most *nix systems.

To the attached archive I included the C source along with its listing in 28 pages in PDF format - to be printed and digested more easily from search techniques aficionados. ENFUN!


All times are GMT -5. The time now is 06:46 AM.