Fastest MEMMEM!?

Sanmayce · 10-21-2013, 12:41 PM

Hi *nix guys,
here I want to ask for help in finding/sharing the fastest search function when a block is needed to be found in another block, since I am a Windows (don't ask) user and have little experience in *nix console, yes the MEMMEM which is missing in C libraries under Windows (who knows why).

Simply put, do you know a better function than mine, 'Railgun_Sekireigan_Bari', I wrote an article at:
http://www.codeproject.com/Articles/...-function-in-C

The source is this:

Code:

// Version: This is Railgun_Sekireigan revision 2, copyleft 2013-Oct-16, Kaze
// Purpose: This is optimized strstr-like (memmem in fact) function for short needles up to 255 bytes ... and beyond.
// Caution: For better speed the case 'if (cbPattern==1)' was removed, so Pattern must be longer than 1 char.
#define _rotl_KAZE(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define HaystackThresholdSekirei 961 // Quadruplet works up to this value, if bigger then BMH2 takes over.
#define NeedleThresholdBIGSekirei 12+40 // Should be bigger than 'HasherezadeOrder'. BMH2 works up to this value (inclusive).
#define HashTableSizeSekirei 17-1 // In fact the real size is -3, because it is BITwise, when 17-3=14 it means 16KB, (17-1)-3=13 it means 8KB.
char * Railgun_Sekireigan_Bari (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern)
{
	char * pbTargetMax = pbTarget + cbTarget;
	register uint32_t ulHashPattern;
	register uint32_t ulHashTarget;
	signed long count;
	signed long countSTATIC;

	unsigned char SINGLET;
	uint32_t Quadruplet2nd;
	uint32_t Quadruplet3rd;
	uint32_t Quadruplet4th;

	uint32_t AdvanceHopperGrass;

	long i; //BMH needed
	int a, j;
	unsigned char bm_Horspool_Order2[256*256]; // BMHSS(Elsiane) needed, 'char' limits patterns to 255, if 'long' then table becomes 256KB, grrr.
	uint32_t Gulliver; // or unsigned char or unsigned short

	unsigned char bm_Hasherezade_HASH[1<<(HashTableSizeSekirei-3)];
	uint32_t hash32;
	uint32_t hash32B;
	uint32_t hash32C;

	if (cbPattern > cbTarget) return(NULL);

	if ( cbPattern<4 ) { 

        	pbTarget = pbTarget+cbPattern;
		ulHashPattern = ( (*(char *)(pbPattern))<<8 ) + *(pbPattern+(cbPattern-1));
		if ( cbPattern==3 ) {
			for ( ;; ) {
				if ( ulHashPattern == ( (*(char *)(pbTarget-3))<<8 ) + *(pbTarget-1) ) {
					if ( *(char *)(pbPattern+1) == *(char *)(pbTarget-2) ) return((pbTarget-3));
				}
				if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) { 
					pbTarget++;
					if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) pbTarget++;
				}
				pbTarget++;
				if (pbTarget > pbTargetMax) return(NULL);
			}
		} else {
		}
		for ( ;; ) {
			if ( ulHashPattern == ( (*(char *)(pbTarget-2))<<8 ) + *(pbTarget-1) ) return((pbTarget-2));
			if ( (char)(ulHashPattern>>8) != *(pbTarget-1) ) pbTarget++;
			pbTarget++;
			if (pbTarget > pbTargetMax) return(NULL);
		}

	} else {
		if (cbTarget<HaystackThresholdSekirei) { // This value is arbitrary (don't know how exactly), it ensures (at least must) better performance than 'Boyer_Moore_Horspool'.

		pbTarget = pbTarget+cbPattern;
		ulHashPattern = *(uint32_t *)(pbPattern);
		SINGLET = ulHashPattern & 0xFF;
		Quadruplet2nd = SINGLET<<8;
		Quadruplet3rd = SINGLET<<16;
		Quadruplet4th = SINGLET<<24;
		for ( ;; ) {
			AdvanceHopperGrass = 0;
			ulHashTarget = *(uint32_t *)(pbTarget-cbPattern);
			if ( ulHashPattern == ulHashTarget ) { // Three unnecessary comparisons here, but 'AdvanceHopperGrass' must be calculated - it has a higher priority.
				count = cbPattern-1;
				while ( count && *(char *)(pbPattern+(cbPattern-count)) == *(char *)(pbTarget-count) ) {
					if ( cbPattern-1==AdvanceHopperGrass+count && SINGLET != *(char *)(pbTarget-count) ) AdvanceHopperGrass++;
					count--;
				}
				if ( count == 0) return((pbTarget-cbPattern));
			} else { // The goal here: to avoid memory accesses by stressing the registers.
				if ( Quadruplet2nd != (ulHashTarget & 0x0000FF00) ) {
					AdvanceHopperGrass++;
					if ( Quadruplet3rd != (ulHashTarget & 0x00FF0000) ) {
						AdvanceHopperGrass++;
						if ( Quadruplet4th != (ulHashTarget & 0xFF000000) ) AdvanceHopperGrass++;
					}
				}
			}
			AdvanceHopperGrass++;
			pbTarget = pbTarget + AdvanceHopperGrass;
			if (pbTarget > pbTargetMax) return(NULL);
		}

		} else { //if (cbTarget<HaystackThresholdSekirei)

	if ( cbPattern<=NeedleThresholdBIGSekirei ) { 

			countSTATIC = cbPattern-2-2;
			ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
			//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
			i=0;
			//for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized
			//for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=j; // Rightmost appearance/position is needed
			for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]=0;}
			for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=1;
			while (i <= cbTarget-cbPattern) {
				Gulliver = 1;
				if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) {
					//if ( Gulliver == 1 ) { // Means the Building-Block order 2 is found somewhere i.e. NO MAXIMUM SKIP
						if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
							count = countSTATIC; // Last two chars already matched, to be fixed with -2
							while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
								count--;
							if ( count == 0) return(pbTarget+i);
						}
					//}
					// Trying to strengthen the Skip performance... here ONLY one additional lookup, for better/longer jumps more such lookups, unrolled to be added. 
					if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2;
				} else Gulliver = cbPattern-(2-1);
				i = i + Gulliver;
				//GlobalI++; // Comment it, it is only for stats.
			}
			return(NULL);

	} else { // if ( cbPattern<=NeedleThresholdBIGSekirei )
// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() [
			countSTATIC = cbPattern-2-2;

			ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
			//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
		
			i=0;

			for (a=0; a < 1<<(HashTableSizeSekirei-3); a++) {bm_Hasherezade_HASH[a]= 0;} // to-do: 'memset' if not optimized
			// cbPattern - Order + 1 i.e. number of BBs for 11 'fastest fox' 11-8+1=4: 'fastest ', 'astest f', 'stest fo', 'test fox'
			for (j=0; j < cbPattern-12+1; j++) {
				hash32 = (2166136261 ^ *(uint32_t *)(pbPattern+j+0)) * 709607;        
				hash32B = (2166136261 ^ *(uint32_t *)(pbPattern+j+4)) * 709607;        
				hash32C = (2166136261 ^ *(uint32_t *)(pbPattern+j+8)) * 709607;        
				hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;
				hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;
				hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekirei))-1 );
				bm_Hasherezade_HASH[hash32>>3]= bm_Hasherezade_HASH[hash32>>3] | (1<<(hash32&0x7));
			}

			while (i <= cbTarget-cbPattern) {
				Gulliver = 1; // Assume minimal jump as initial value.
				// The goal: to jump when the rightmost 8bytes (Order 8 Horspool) of window do not look like any of Needle prefixes i.e. are not to be found. This maximum jump equals cbPattern-(Order-1) or 11-(8-1)=4 for 'fastest fox' - a small one but for Needle 31 bytes the jump equals 31-(8-1)=24
				//GlobalHashSectionExecution++; // Comment it, it is only for stats.
					hash32 = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+0)) * 709607;        
					hash32B = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+4)) * 709607;        
					hash32C = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+8)) * 709607;        
					hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;
					hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;
					hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekirei))-1 );
					if ( (bm_Hasherezade_HASH[hash32>>3] & (1<<(hash32&0x7))) ==0 )
						Gulliver = cbPattern-(12-1);

				if ( Gulliver == 1 ) { // Means the Building-Block order 8/12 is found somewhere i.e. NO MAXIMUM SKIP
					if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
						count = countSTATIC;
						while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
							count--;
						if ( count == 0) return(pbTarget+i);
					}
				}
					i = i + Gulliver;
				//GlobalI++; // Comment it, it is only for stats.
			} // while (i <= cbTarget-cbPattern)
			return(NULL);
// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() ]
	} // if ( cbPattern<=NeedleThresholdBIGSekirei )
		} //if (cbTarget<HaystackThresholdSekirei)
	} //if ( cbPattern<4 )
}

My wish is with help of more experienced coders 'Railgun' to become *nix gun of choice.
Also I will be pleased if someone benchmark it against (current MEMMEMs), I have no opportunity, neither modern computer nor knowledge of *nix environment.

Sanmayce · 11-03-2013, 12:08 PM

Lucky I am to come up with a new even faster function - 'Railgun_Sekireigan_Piri'.

The graph showing the superiority of 'Piri':
http://www.linuxquestions.org/questi...1&d=1383501798

Sanmayce · 08-13-2016, 05:02 AM

The fastest memmem(), known to me, comes here.

The source/benchmark is downloadable at my INTERNET drive:
https://1drv.ms/u/s!AmWWFXGMzDmEglwjlUtnMJrfhosK

It also is zipped (the 4 testfiles are not) and attached to the post.

The speed showdown has three facets:
- compares the 64bit code generated from GCC 5.10 versus Intel 15.0 compilers;
- compares two datasets - search speed through English texts versus Genome ACGT-type data;
- compares the tweaked Two-Way algorithm (implemented by Eric Blake) and adopted by GLIBC as memmem() versus my Railgun_Swampshine.

Note1: The GLIBC memmem() was taken from latest (2016-08-05) glibc 2.24 tar:
https://www.gnu.org/software/libc/
Note2: Eric Blake says that he enhanced the linearity of Two-Way by adding some sublinear paths, well, Railgun is all about sublinearity, so feel free to experiment with your own testfiles (worst-case-scenarios), just make such a file feed the compressor with it, then we will see how the LINEAR Two-Way behaves versus Railgun_Swampshine.
Note3: Just copy-and-paste 'Railgun_Swampshine' or 'Railgun_Ennearch' from the benchmark's source.

So the result on Core 2 Q9550s @2.83GHz:

Code:

---------------------------------------------------------------------------------------------
| testfile\Searcher                         | GNU/GLIBC memmem()    | Railgun_Swampshine    | 
|-------------------------------------------------------------------------------------------|
| Compiler                                  | Intel 15.0 | GCC 5.10 | Intel 15.0 | GCC 5.10 |
|-------------------------------------------------------------------------------------------|
| The_Project_Gutenberg_EBook_of_Don        |        190 |      226 |       1654 |     1729 |
| _Quixote_996_(ANSI).txt                   |            |          |            |          |
| 2,347,772 bytes                           |            |          |            |          |
|-------------------------------------------------------------------------------------------|
| The_Project_Gutenberg_EBook_of_Dokoe      |        582 |      760 |       3094 |     2898 |
| _by_Hakucho_Masamune_(Japanese_UTF-8).txt |            |          |            |          |
| 899,425 bytes                             |            |          |            |          |
|-------------------------------------------------------------------------------------------|
| Dragonfly_genome_shotgun_sequence         |        104 |      109 |        445 |      458 |
| _(ACGT_alphabet).fasta                    |            |          |            |          |
| 4,487,433 bytes                           |            |          |            |          |
|-------------------------------------------------------------------------------------------|
| LAOTZU_Wu_Wei_(BINARY).pdf                |         99 |      144 |        629 |      580 |
| 954,035 bytes                             |            |          |            |          |
|-------------------------------------------------------------------------------------------|

Note: The numbers represent the rate (bytes/s) at which patterns/needles 4,5,6,8,9,10,12,13,14,16,17,18,24 bytes long are memmemed into 4KB, 256KB, 1MB, 256MB long haystacks.
in fact, these numbers are the compression speed using LZSS and memmem() as matchfinder.

Two-Way is significantly slower than BMH Order 2, the speed-down is in range:
- for TEXTUAL ANSI alphabets: 1729/226= 7.6x
- for TEXTUAL UTF8 alphabets: 2898/760= 3.8x
- for TEXTUAL ACGT alphabets: 458/109= 4.2x
- for BINARY-esque alphabets: 580/144= 4.0x

For faster RAM, than mine @666MHz, and for haystacks multimegabytes long, the speedup goes beyond 8x.

The benchmark shows the real behavior (both latency and raw speed) of the memmem variants, I added also the Thierry Lecroq's Two-Way implementation:
http://www-igm.univ-mlv.fr/~lecroq/s...l#SECTION00260
However, Eric Blake's one is faster, so it was chosen for the speed showdown.

Once I measured the total length of traversed haystacks, and for files 100+MB long, it went ... quintillion of bytes i.e. petabytes - good torture it is.

Enfun!

Sanmayce · 08-22-2016, 11:22 AM

The fastest memmem() is already the tuned Railgun_Swampshine_BailOut - avoiding second pattern comparison in BMH2 and pseudo-BMH4. It is called 'Trolldom', to signify its secondary purpose - to traverse fast via Trolllandia a.k.a. ugly/deformed haystacks causing quadratic behavior of most Boyer-Moore-Horspool variants. GNU/GLIBC memmem() is linearIC and sublinearIC at times while Railgun_Trolldom is mostly sublinearIC.

Railgun_Trolldom is slightly faster (both with Intel & GCC) than Railgun_Swampshine, this is mostly due to adding a bitwise BMH order 2 (8KB table overhead instead of 64KB) path - for haystacks <77777 bytes long - the warm-up is faster.

Code:

So the result on Core 2 Q9550s @2.83GHz DDR2 @666MHz / i5-2430M @3.00GHz DDR3 @666MHz:
--------------------------------------------------------------------------------------------------------------------------------
| Searcher                                  | GNU/GLIBC memmem()        | Railgun_Swampshine       | Railgun_Trolldom          | 
|--------------------------------------------------------------------------------------------------|---------------------------|
| Testfile\Compiler                         | Intel 15.0 | GCC 5.10     | Intel 15.0 | GCC 5.10    | Intel 15.0  | GCC 5.10    |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 27,703 bytes                        |     4506/- |   5330/14725 |    13198/- | 11581/15171 | 19105/22449 | 15493/21642 |
| Name: An_Interview_with_Carlos_Castaneda.TXT                          |            |             |             |             |
| LATENCY-WISE: Number of 'memmem()' Invocations: 308,062               |            |             |             |             |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 3,242,492,648       |            |             |             |             |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 2,347,772 bytes                     |      190/- |      226/244 |     1654/- |   1729/1806 |   1794/1822 |   1743/1809 |
| Name: Gutenberg_EBook_Don_Quixote_996_(ANSI).txt                      |            |             |             |             |
| LATENCY-WISE: Number of 'memmem()' Invocations: 14,316,954            |            |             |             |             |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 6,663,594,719,173   |            |             |             |             |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 899,425 bytes                       |      582/- |      760/816 |     3094/- |   2898/3088 |   3255/3289 |   2915/3322 |
| Name: Gutenberg_EBook_Dokoe_by_Hakucho_Masamune_(Japanese_UTF8).txt   |            |             |             |             |
| LATENCY-WISE: Number of 'memmem()' Invocations: 3,465,806             |            |             |             |             |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 848,276,034,315     |            |             |             |             |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 4,487,433 bytes                     |      104/- |      109/116 |      445/- |     458/417 |     450/411 |     467/425 |
| Name: Dragonfly_genome_shotgun_sequence_(ACGT_alphabet).fasta         |            |             |             |             |
| LATENCY-WISE: Number of 'memmem()' Invocations: 20,540,375            |            |             |             |             |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 13,592,530,857,131  |            |             |             |             |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 954,035 bytes                       |       99/- |      144/144 |      629/- |     580/682 |     634/807 |     585/725 |
| Name: LAOTZU_Wu_Wei_(BINARY).pdf                                      |            |             |             |             |
| LATENCY-WISE: Number of 'memmem()' Invocations: 27,594,933            |            |             |             |             |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 8,702,455,122,519   |            |             |             |             |
|------------------------------------------------------------------------------------------------------------------------------|
| Size: 15,583,440 bytes                    |        -/- |          -/- |        -/- |     663/771 |     675/778 |     663/757 |
| Name: Arabian_Nights_complete.html                                    |            |             |             |             |
| LATENCY-WISE: Number of 'memmem()' Invocations: 72,313,262            |            |             |             |             |
| THROUGHPUT-WISE: Number of Total bytes Traversed: 105,631,163,854,099 |            |             |             |             |
--------------------------------------------------------------------------------------------------------------------------------

The benchmark is downloadable at my Internet drive and my site:

https://1drv.ms/u/s!AmWWFXGMzDmEgl3izdwhSnIq0Lv5
www.sanmayce.com/Downloads/Nakamichi_Kintaro++_source_executables_64bit_(GCC510-vs-Intel150)_(TW-vs-RG)_BENCHMARK.zip

Since it is pure C, no intrinsics used, perhaps it will work unproblematically on most *nix systems.

To the attached archive I included the C source along with its listing in 28 pages in PDF format - to be printed and digested more easily from search techniques aficionados. ENFUN!