More simple C and color mixing tests
Posted 06-11-2013 at 12:30 AM by rainbowsally
Today's Feature
Back a couple of blog pages
http://www.linuxquestions.org/questi...lamping-35557/
we were playing with color mixing and data clamping and all kinds of junk including comparing routines to mix colors by a floating point ratio.
This is mixcolors.c - named the same as the other version we were playing with and including the same code for comparison purposes.
This one adds mixRgba() (mixRgbalong() and mixRgbashort() in the listing below), which uses 8-bit fixed-point math and does two pixels at a time. A quarter of the machine code is building the stack frame and messing with the FPU. The code is predictably about half the size of the other two systems we've been comparing.
Although branching wouldn't cost much relative to the size of the code, we have eliminated branching by clamping by other means. GCC is a knockout! It knows how to do it at the assembler level.
Enable the timing loop to see how many nanoseconds it takes your computer to compute a new pixel from two others and a ratio between 0.0 and 1.0. Don't forget the -O2 switch if you're timing it.
file: src/mixcolors.c
purpose: source file
- Here's another pixel (color) mixing toy. The best yet.
Back a couple of blog pages
http://www.linuxquestions.org/questi...lamping-35557/
we were playing with color mixing and data clamping and all kinds of junk including comparing routines to mix colors by a floating point ratio.
This is mixcolors.c - named the same as the other version we were playing with and including the same code for comparison purposes.
This one adds mixRgba() (mixRgbalong() and mixRgbashort() in the listing below), which uses 8-bit fixed-point math and does two pixels at a time. A quarter of the machine code is building the stack frame and messing with the FPU. The code is predictably about half the size of the other two systems we've been comparing.
Although branching wouldn't cost much relative to the size of the code, we have eliminated branching by clamping by other means. GCC is a knockout! It knows how to do it at the assembler level.
Enable the timing loop to see how many nanoseconds it takes your computer to compute a new pixel from two others and a ratio between 0.0 and 1.0. Don't forget the -O2 switch if you're timing it.
file: src/mixcolors.c
purpose: source file
Code:
// mixcolors.c - another test of pixel mixing.  This one uses 8 bit
// fixed point and does two pixels at a time.  A quarter of the machine
// code is building the stack frame and messing with the fpu.  The code
// is predictably about half the size of the other two systems we've been
// comparing.  Although branching wouldn't cost much relative to the size
// of the code, we have eliminated branching that could be involved for
// clamping ins and outs to within reasonable limits.

/**
   Two pairs of pixels are mixed at a time.

   Total 58 opcodes
     Stack, floating point and return = 18
     Byte wide (8-bit) math for 4 pixels = 40 lines, clamped, no branching.
     About 10 bytes per color (and alpha).

   NOTE(review): the listing below is the author's original build and has
   not been regenerated; in particular the fraction clamp now lives in
   fracToFix8().  The s / f / r line prefixes appear to mark the stack,
   floating point and return instructions counted above.

08048800 <mixRgba>:
s 80488c0:  55                      push   %ebp
s 80488c1:  89 e5                   mov    %esp,%ebp
s 80488c3:  83 ec 14                sub    $0x14,%esp
f 80488c6:  d9 7d f2                fnstcw -0xe(%ebp)
f 80488c9:  d9 05 a8 8c 04 08       flds   0x8048ca8
f 80488cf:  dc 4d 10                fmull  0x10(%ebp)
f 80488d2:  89 5d f4                mov    %ebx,-0xc(%ebp)
f 80488d5:  8b 5d 08                mov    0x8(%ebp),%ebx
f 80488d8:  0f b7 45 f2             movzwl -0xe(%ebp),%eax
s 80488dc:  89 75 f8                mov    %esi,-0x8(%ebp)
  80488df:  8b 75 0c                mov    0xc(%ebp),%esi
s 80488e2:  89 7d fc                mov    %edi,-0x4(%ebp)
  80488e5:  89 d9                   mov    %ebx,%ecx
f 80488e7:  b4 0c                   mov    $0xc,%ah
f 80488e9:  66 89 45 f0             mov    %ax,-0x10(%ebp)
f 80488ed:  d9 6d f0                fldcw  -0x10(%ebp)
f 80488f0:  db 5d ec                fistpl -0x14(%ebp)
f 80488f3:  d9 6d f2                fldcw  -0xe(%ebp)
  80488f6:  8b 45 ec                mov    -0x14(%ebp),%eax
  80488f9:  89 c2                   mov    %eax,%edx
  80488fb:  c1 fa 1f                sar    $0x1f,%edx
  80488fe:  f7 d2                   not    %edx
  8048900:  21 c2                   and    %eax,%edx
  8048902:  31 c0                   xor    %eax,%eax
  8048904:  81 ea 00 01 00 00       sub    $0x100,%edx
  804890a:  85 d2                   test   %edx,%edx
  804890c:  0f 9f c0                setg   %al
  804890f:  81 e1 00 ff 00 ff       and    $0xff00ff00,%ecx
  8048915:  83 e8 01                sub    $0x1,%eax
  8048918:  81 e3 ff 00 ff 00       and    $0xff00ff,%ebx
  804891e:  21 d0                   and    %edx,%eax
  8048920:  89 f2                   mov    %esi,%edx
  8048922:  8d b8 00 01 00 00       lea    0x100(%eax),%edi
  8048928:  81 e2 ff 00 ff 00       and    $0xff00ff,%edx
  804892e:  f7 d8                   neg    %eax
  8048930:  0f af d7                imul   %edi,%edx
  8048933:  c1 e9 08                shr    $0x8,%ecx
  8048936:  0f af c8                imul   %eax,%ecx
  8048939:  0f af c3                imul   %ebx,%eax
  804893c:  8b 5d f4                mov    -0xc(%ebp),%ebx
  804893f:  81 e2 00 ff 00 ff       and    $0xff00ff00,%edx
  8048945:  c1 ea 08                shr    $0x8,%edx
  8048948:  81 e1 00 ff 00 ff       and    $0xff00ff00,%ecx
  804894e:  01 ca                   add    %ecx,%edx
  8048950:  89 f1                   mov    %esi,%ecx
  8048952:  8b 75 f8                mov    -0x8(%ebp),%esi
  8048955:  81 e1 00 ff 00 ff       and    $0xff00ff00,%ecx
  804895b:  25 00 ff 00 ff          and    $0xff00ff00,%eax
  8048960:  c1 e9 08                shr    $0x8,%ecx
  8048963:  0f af cf                imul   %edi,%ecx
  8048966:  8b 7d fc                mov    -0x4(%ebp),%edi
  8048969:  89 ec                   mov    %ebp,%esp
  804896b:  c1 e8 08                shr    $0x8,%eax
s 804896e:  5d                      pop    %ebp
  804896f:  81 e1 00 ff 00 ff       and    $0xff00ff00,%ecx
  8048975:  01 ca                   add    %ecx,%edx
  8048977:  8d 04 02                lea    (%edx,%eax,1),%eax
r 804897a:  c3                      ret

   mixColorsI (fixed point) is 130 bytes long
   mixColorsF (floating point) is 159
*/

#include <stdio.h>
#include <malloc.h>
#include <string.h>
#include <stdlib.h>

// Empty function that gives the debugger a non-moving breakpoint.
void dbg(void) {}

// Pixels are plain 'unsigned int' rather than the non-standard glibc 'uint'
// typedef, which is not visible when compiling in strict ISO C mode.
unsigned int mixColorsF(unsigned int xnum1, unsigned int xnum2, double frac);
unsigned int mixColorsI(unsigned int xnum1, unsigned int xnum2, double frac);
unsigned int mixRgbalong(unsigned int pixel1, unsigned int pixel2, double frac);
unsigned int mixRgbashort(unsigned int pixel1, unsigned int pixel2, double frac);

/**
 * Command line driver: parses two hex pixels and a fraction, then prints
 * the blend computed by each of the four mixers, one result per line.
 *
 * Returns 0 after printing the results or the usage text, and EXIT_FAILURE
 * when an argument cannot be parsed.
 */
int main(int argc, char **argv)
{
    dbg();
    if (argc != 4) {
        printf(
            "\n"
            "Experment with clamped color mixing (float and fixed point).\n\n"
            "Usage: mixcolors AARRGGBB AARRGGBB Frac\n"
            " Where AARRGGBB are two ARGB format pixels and Frac.\n"
            " is the distance along the gradient from the first to\n"
            " the second.\n\n"
        );
        return 0;
    }

    unsigned int xnum1 = 0, xnum2 = 0;
    double frac = 0;

    // Reject unparsable input instead of silently mixing zeros.
    if (sscanf(argv[1], "%x", &xnum1) != 1
        || sscanf(argv[2], "%x", &xnum2) != 1
        || sscanf(argv[3], "%lf", &frac) != 1) {
        fprintf(stderr,
                "mixcolors: expected two hex pixels and a decimal fraction\n");
        return EXIT_FAILURE;
    }

#if 0 // testing nanoseconds run 'time mixcolors'
    // 10^9 calls, so the run time in seconds reads as nanoseconds per call.
    unsigned int res = 0; // must start at zero: the loop only ORs into it
    {
        int i, j, h;
        for (h = 0; h < 1000; h++) {
            for (i = 0; i < 1000; i++) {
                for (j = 0; j < 1000; j++)
                    res |= mixRgbashort(xnum1, xnum2, frac);
            }
        }
    }
    printf("%x\n", res);
#endif

    printf("%0x\n", mixColorsI(xnum1, xnum2, frac));
    printf("%0x\n", mixColorsF(xnum1, xnum2, frac));
    printf("%0x\n", mixRgbalong(xnum1, xnum2, frac));
    printf("%0x\n", mixRgbashort(xnum1, xnum2, frac));
    return 0;
}

// Stand-in for a would-be 'bool opaque' parameter of the mixRgba functions:
// when set to 1 the result's alpha byte is forced to 0xFF.
static int opaque = 0;

/**
 * Converts a blend fraction to an 8 bit fixed point weight in 0..256.
 *
 * The value is saturated while it is still a double, because converting an
 * out-of-range or NaN double to an integer is undefined behavior.  NaN and
 * negative fractions give 0; fractions of 1.0 or more give 256.
 */
static unsigned int fracToFix8(double frac)
{
    double scaled = 256 * frac;

    scaled = (scaled > 0.0) ? scaled : 0.0;     // false for NaN too
    scaled = (scaled < 256.0) ? scaled : 256.0;
    return (unsigned int)scaled;                // truncates toward zero
}

/**
 * Blends two packed 4x8 bit pixels, written out one step per statement so
 * it can be followed in a debugger.  Computes the same value as
 * mixRgbashort().
 *
 * pixel1, pixel2  the pixels to mix (AARRGGBB per the usage text)
 * frac            position between pixel1 (0.0) and pixel2 (1.0); values
 *                 outside that range are clamped
 *
 * Returns the blended pixel, with alpha forced to 0xFF when 'opaque' is 1.
 */
unsigned int mixRgbalong(unsigned int pixel1, unsigned int pixel2, double frac)
{
    unsigned int a, b;
    unsigned int tmp1, tmp2, res1, res2, res;

    // 8 bit fixed point multipliers; a + b is always 256
    b = fracToFix8(frac);
    a = 256 - b;

    // lo bits: mask, multiply, mask, shift
    // Bytes 0 and 2 sit in separate 16 bit lanes, so one multiply scales
    // both: 255 * 256 still fits in 16 bits.
    tmp1 = pixel1;
    tmp1 = tmp1 & 0x00FF00FF;
    tmp1 = tmp1 * a;
    tmp1 = tmp1 & 0xFF00FF00;
    res1 = tmp1 >> 8;

    tmp2 = pixel2;
    tmp2 = tmp2 & 0x00FF00FF;
    tmp2 = tmp2 * b;
    tmp2 = tmp2 & 0xFF00FF00;
    res1 += tmp2 >> 8;

    // hi bits: mask, shift, multiply, mask
    // Bytes 1 and 3 are shifted down first so the product cannot overflow,
    // and they land back in their own positions after the multiply.
    tmp1 = pixel1;
    tmp1 = tmp1 & 0xFF00FF00;
    tmp1 = tmp1 >> 8;
    tmp1 = tmp1 * a;
    res2 = tmp1 & 0xFF00FF00;

    tmp2 = pixel2;
    tmp2 = tmp2 & 0xFF00FF00;
    tmp2 = tmp2 >> 8;
    tmp2 = tmp2 * b;
    res2 += tmp2 & 0xFF00FF00;

    res = (res1 + res2) | ((-opaque) & 0xFF000000); // set alpha = 255 if opaque
    return res;
}

/**
 * Compact form of mixRgbalong(): blends two packed 4x8 bit pixels with
 * 8 bit fixed point weights, two bytes per multiply.
 *
 * pixel1, pixel2  the pixels to mix (AARRGGBB per the usage text)
 * frac            position between pixel1 (0.0) and pixel2 (1.0); values
 *                 outside that range are clamped
 *
 * Returns the blended pixel, with alpha forced to 0xFF when 'opaque' is 1.
 */
unsigned int mixRgbashort(unsigned int pixel1, unsigned int pixel2, double frac)
{
    // 8 bit fixed point multipliers; a + b is always 256
    const unsigned int b = fracToFix8(frac);
    const unsigned int a = 256 - b;
    unsigned int res1, res2;

    // lo bits: mask, multiply, mask, shift
    res1  = (((pixel1 & 0x00FF00FF) * a) & 0xFF00FF00) >> 8;
    res1 += (((pixel2 & 0x00FF00FF) * b) & 0xFF00FF00) >> 8;

    // hi bits: mask, shift, multiply, mask
    res2  = (((pixel1 & 0xFF00FF00) >> 8) * a) & 0xFF00FF00;
    res2 += (((pixel2 & 0xFF00FF00) >> 8) * b) & 0xFF00FF00;

    return (res1 + res2) | ((-opaque) & 0xFF000000); // set alpha = 255 if opaque
}

/**
 * Floating point reference mixer: blends each of the four bytes on its own.
 * This is the usual implementation and it's the biggest (the author
 * measured 160 bytes with -O2 for the original version).
 *
 * xnum1, xnum2  the pixels to mix
 * frac          position between xnum1 (0.0) and xnum2 (1.0).  Values
 *               outside that range, and NaN, are clamped so that no
 *               negative or NaN double is ever converted to unsigned,
 *               matching the clamping done by the mixRgba functions.
 *
 * Returns the blended pixel; each byte is truncated toward zero.
 */
unsigned int mixColorsF(unsigned int xnum1, unsigned int xnum2, double frac)
{
    if (!(frac > 0.0))          // 0, negative or NaN: all of the first pixel
        return xnum1;
    if (frac >= 1.0)            // 1 or beyond: all of the second pixel
        return xnum2;

    const double frac1 = 1 - frac;
    const double frac2 = frac;
    unsigned int res = 0;

    // With both weights inside (0,1) the truncated blend of two bytes is at
    // most 255, so no further clamp is needed before repacking.
    for (int shift = 0; shift < 32; shift += 8) {
        const unsigned int c1 = (xnum1 >> shift) & 0xff;
        const unsigned int c2 = (xnum2 >> shift) & 0xff;
        const unsigned int mixed = (unsigned int)((c1 * frac1) + (c2 * frac2));

        res |= (mixed & 0xff) << shift;
    }
    return res;
}

#define FIXPT 10

// This one is faster and shorter but misses the boat by two bits. ;-)  Why not
// 8 bit fixed point?  After all the result will be within 9 bits itself.
#ifndef FIXPT
#define FIXPT 10 // fractional bits of the weights; same value as the file-level definition
#endif

/**
 * Fixed point mixer: blends each of the four bytes of two packed pixels
 * using integer weights with FIXPT fractional bits.
 *
 * xnum1, xnum2  the pixels to mix
 * frac          position between xnum1 (0.0) and xnum2 (1.0).  Zero,
 *               negative and NaN values return xnum1 unchanged; values of
 *               1.0 or more return xnum2 unchanged.  This keeps the
 *               double-to-unsigned conversion below well defined.
 *
 * Returns the blended pixel; each byte is truncated toward zero.
 */
unsigned int mixColorsI(unsigned int xnum1, unsigned int xnum2, double frac)
{
    const unsigned int one = 1u << FIXPT;

    if (!(frac > 0.0))          // also true for NaN
        return xnum1;
    if (frac >= 1.0)
        return xnum2;

    // Derive the first weight from the second so the pair always sums to
    // exactly 'one'.  Truncating both independently can lose a unit, which
    // darkens a color blended with itself (0xFF with 0xFF gave 0xFE).
    const unsigned int frac2 = (unsigned int)(frac * one);
    const unsigned int frac1 = one - frac2;

    unsigned int res = 0;

    // Each byte is a weighted average of two values <= 255, so the result
    // fits in 8 bits and needs no clamp before repacking.
    for (int shift = 0; shift < 32; shift += 8) {
        const unsigned int c1 = (xnum1 >> shift) & 0xff;
        const unsigned int c2 = (xnum2 >> shift) & 0xff;

        res |= (((c1 * frac1) + (c2 * frac2)) >> FIXPT) << shift;
    }
    return res;
}

// The fastest and shortest code is for the mixRgb code. The test runs all the
// versions and gets the same results.
Total Comments 0