More simple C and color mixing tests

Posted 06-11-2013 at 12:30 AM by rainbowsally

Tags assembler, computer mad science, simple c

Today's Feature

Here's another pixel (color) mixing toy. The best yet.

Back a couple of blog pages
http://www.linuxquestions.org/questi...lamping-35557/
we were playing with color mixing and data clamping and all kinds of junk including comparing routines to mix colors by a floating point ratio.

This is mixcolors.c - named the same as the other version we were playing with and including the same code for comparison purposes.

This one adds mixRgba() (in a long and a short form), which uses 8-bit fixed-point math and processes two color channels per multiply. A quarter of the machine code is building the stack frame and messing with the FPU. The code is predictably about half the size of the other two systems we've been comparing.

Although branching wouldn't cost much relative to the size of the code, we have eliminated branching by clamping by other means. GCC is a knockout! It knows how to do it at the assembler level.

Enable the timing loop to see how many nanoseconds it takes your computer to compute a new pixel from two others and a ratio between 0.0 and 1.0. Don't forget the -O2 switch if you're timing it.

file: src/mixcolors.c
purpose: source file

Code:

// mixcolors.c - another test of pixel mixing.  This one uses 8 bit 
// fixed point and does two pixels at a time.  A quarter of the machine 
// code is building the stack frame and messing with the fpu.  The code
// is predictably about half the size of the other two systems we've been
// comparing.  Although branching wouldn't cost much relative to the size
// of the code, we have eliminated branching that could be involved for 
// clamping ins and outs to within reasonable limits.

/**

Two pairs of pixels are mixed at a time.

Total 58 opcodes
Stack, floating point and return = 18
Byte wide (8-bit) math for 4 pixels = 40 lines, clamped, no branching.
About 10 bytes per color (and alpha).

08048800 <mixRgba>:
s 80488c0: 55                    push   %ebp
s 80488c1: 89 e5                 mov    %esp,%ebp
s 80488c3: 83 ec 14              sub    $0x14,%esp
f 80488c6: d9 7d f2              fnstcw -0xe(%ebp)
f 80488c9: d9 05 a8 8c 04 08     flds   0x8048ca8
f 80488cf: dc 4d 10              fmull  0x10(%ebp)
f 80488d2: 89 5d f4              mov    %ebx,-0xc(%ebp)
f 80488d5: 8b 5d 08              mov    0x8(%ebp),%ebx
f 80488d8: 0f b7 45 f2           movzwl -0xe(%ebp),%eax
s 80488dc: 89 75 f8              mov    %esi,-0x8(%ebp)
  80488df: 8b 75 0c              mov    0xc(%ebp),%esi
s 80488e2: 89 7d fc              mov    %edi,-0x4(%ebp)
  80488e5: 89 d9                 mov    %ebx,%ecx
f 80488e7: b4 0c                 mov    $0xc,%ah
f 80488e9: 66 89 45 f0           mov    %ax,-0x10(%ebp)
f 80488ed: d9 6d f0              fldcw  -0x10(%ebp)
f 80488f0: db 5d ec              fistpl -0x14(%ebp)
f 80488f3: d9 6d f2              fldcw  -0xe(%ebp)
  80488f6: 8b 45 ec              mov    -0x14(%ebp),%eax
  80488f9: 89 c2                 mov    %eax,%edx
  80488fb: c1 fa 1f              sar    $0x1f,%edx
  80488fe: f7 d2                 not    %edx
  8048900: 21 c2                 and    %eax,%edx
  8048902: 31 c0                 xor    %eax,%eax
  8048904: 81 ea 00 01 00 00     sub    $0x100,%edx
  804890a: 85 d2                 test   %edx,%edx
  804890c: 0f 9f c0              setg   %al
  804890f: 81 e1 00 ff 00 ff     and    $0xff00ff00,%ecx
  8048915: 83 e8 01              sub    $0x1,%eax
  8048918: 81 e3 ff 00 ff 00     and    $0xff00ff,%ebx
  804891e: 21 d0                 and    %edx,%eax
  8048920: 89 f2                 mov    %esi,%edx
  8048922: 8d b8 00 01 00 00     lea    0x100(%eax),%edi
  8048928: 81 e2 ff 00 ff 00     and    $0xff00ff,%edx
  804892e: f7 d8                 neg    %eax
  8048930: 0f af d7              imul   %edi,%edx
  8048933: c1 e9 08              shr    $0x8,%ecx
  8048936: 0f af c8              imul   %eax,%ecx
  8048939: 0f af c3              imul   %ebx,%eax
  804893c: 8b 5d f4              mov    -0xc(%ebp),%ebx
  804893f: 81 e2 00 ff 00 ff     and    $0xff00ff00,%edx
  8048945: c1 ea 08              shr    $0x8,%edx
  8048948: 81 e1 00 ff 00 ff     and    $0xff00ff00,%ecx
  804894e: 01 ca                 add    %ecx,%edx
  8048950: 89 f1                 mov    %esi,%ecx
  8048952: 8b 75 f8              mov    -0x8(%ebp),%esi
  8048955: 81 e1 00 ff 00 ff     and    $0xff00ff00,%ecx
  804895b: 25 00 ff 00 ff        and    $0xff00ff00,%eax
  8048960: c1 e9 08              shr    $0x8,%ecx
  8048963: 0f af cf              imul   %edi,%ecx
  8048966: 8b 7d fc              mov    -0x4(%ebp),%edi
  8048969: 89 ec                 mov    %ebp,%esp
  804896b: c1 e8 08              shr    $0x8,%eax
s 804896e: 5d                    pop    %ebp
  804896f: 81 e1 00 ff 00 ff     and    $0xff00ff00,%ecx
  8048975: 01 ca                 add    %ecx,%edx
  8048977: 8d 04 02              lea    (%edx,%eax,1),%eax
r 804897a: c3                    ret    

mixColorsI (fixed point) is 130 bytes long
mixColorsF (floating point) is 159

 */


#include <stdio.h>
#include <malloc.h>
#include <string.h>
#include <stdlib.h>


// Deliberately empty: a fixed place to park a debugger breakpoint.
void dbg()
{
}

// Forward declarations of the mixing routines defined after main().
// Each takes two packed AARRGGBB pixels plus a blend ratio and returns the
// blended pixel.  The type is spelled `unsigned int` because `uint` is a
// non-standard typedef that strict ISO C modes (e.g. -std=c11) do not expose.
unsigned int mixColorsF(unsigned int xnum1, unsigned int xnum2, double frac);
unsigned int mixColorsI(unsigned int xnum1, unsigned int xnum2, double frac);
unsigned int mixRgbalong(unsigned int pixel1, unsigned int pixel2, double frac);
unsigned int mixRgbashort(unsigned int pixel1, unsigned int pixel2, double frac);

// Entry point.  Parses two hexadecimal AARRGGBB pixels and a blend ratio from
// the command line, then prints the blended pixel computed by each of the
// four mixing routines, one result per line, in hexadecimal.
//
// Returns 0 after printing the results or the usage text, and EXIT_FAILURE
// when one of the three arguments cannot be parsed.
int main(int argc, char** argv)
{
  dbg();
  
  if(argc != 4)
  {
    printf( "\n"
            "Experment with clamped color mixing (float and fixed point).\n\n"
            "Usage: mixcolors AARRGGBB AARRGGBB Frac\n"
            "  Where AARRGGBB are two ARGB format pixels and Frac.\n"
            "  is the distance along the gradient from the first to\n"
            "  the second.\n\n"
          );
    return 0;
  }
  
  unsigned int xnum1 = 0, xnum2 = 0;
  double frac = 0;

  // sscanf() returns the number of successful conversions; anything other
  // than 1 means the argument was not a number, so refuse to mix garbage.
  if(sscanf(argv[1], "%x", &xnum1) != 1 ||
     sscanf(argv[2], "%x", &xnum2) != 1)
  {
    fprintf(stderr, "mixcolors: pixels must be hexadecimal AARRGGBB values\n");
    return EXIT_FAILURE;
  }
  if(sscanf(argv[3], "%lf", &frac) != 1)
  {
    fprintf(stderr, "mixcolors: Frac must be a number\n");
    return EXIT_FAILURE;
  }

#if 0 // testing nanoseconds run 'time mixcolors'
  // res starts at 0 because the loop only ever ORs into it; the OR keeps the
  // calls from being optimized away.
  unsigned int res = 0;
  {
    int i, j, h;
    for(h = 0; h < 1000; h++)
    {
      for(i = 0; i < 1000; i++)
      {
        for(j = 0; j < 1000; j++)
          res |= mixRgbashort(xnum1, xnum2, frac);
      }
    }
  }
  printf("%x\n", res);
#endif
  
  printf("%0x\n", mixColorsI(xnum1, xnum2, frac));
  printf("%0x\n", mixColorsF(xnum1, xnum2, frac));
  printf("%0x\n", mixRgbalong(xnum1, xnum2, frac));
  printf("%0x\n", mixRgbashort(xnum1, xnum2, frac));
  
  return 0;
}

// Set to 1 to make the mixRgba routines force the alpha byte of their result
// to 0xFF; with 0 the alpha channel is blended like the color channels.
static int opaque = 0;

// Blend two packed 4x8-bit pixels using 8 bit fixed point weights, written
// out one step at a time.
//
//   pixel1, pixel2  the two pixels to mix
//   frac            position between them: 0.0 gives pixel1, 1.0 gives
//                   pixel2; values outside 0.0..1.0 (and NaN) are clamped
//
// Returns the blended pixel.  Two channels are handled per multiply by
// masking alternate bytes, so each product has a spare byte to grow into.
// (An 'opaque' parameter was once considered; the file-level flag is used
// instead.)
unsigned int mixRgbalong(unsigned int pixel1, unsigned int pixel2, double frac)
{
  unsigned int a, b;
  unsigned int tmp1, tmp2, res1, res2, res;

  // Clamp while the ratio is still a double: converting an out-of-range
  // double to an integer is undefined, so clamping after the conversion is
  // too late.  The first test is false for NaN, which therefore becomes 0.
  frac = (frac > 0.0) ? frac : 0.0;
  frac = (frac < 1.0) ? frac : 1.0;

  // 8 bit fixed point multipliers: b is 0..256 and a + b is always 256
  b = (unsigned int)(256 * frac);
  a = 256 - b;
  
  // lo bits: mask, multiply, add, mask, shift
  tmp1 = pixel1;
  tmp1 = tmp1 & 0x00FF00FF;
  tmp1 = tmp1 * a;

  tmp2 = pixel2;
  tmp2 = tmp2 & 0x00FF00FF;
  tmp2 = tmp2 * b;

  // Add the products BEFORE dropping the fraction bits.  Each 16 bit lane
  // holds at most 255 * 256, so lanes cannot carry into each other, and
  // truncating once instead of twice keeps mix(c, c, f) equal to c.
  res1 = tmp1 + tmp2;
  res1 = res1 & 0xFF00FF00;
  res1 = res1 >> 8;

  // hi bits: mask, shift, multiply, add, mask
  tmp1 = pixel1;
  tmp1 = tmp1 & 0xFF00FF00;
  tmp1 = tmp1 >> 8;
  tmp1 = tmp1 * a;

  tmp2 = pixel2;
  tmp2 = tmp2 & 0xFF00FF00;
  tmp2 = tmp2 >> 8;
  tmp2 = tmp2 * b;

  res2 = tmp1 + tmp2;
  res2 = res2 & 0xFF00FF00;

  res = (res1 + res2) | ((-opaque) & 0xFF000000); // set alpha = 255 if opaque
  return res;
}

uint mixRgbashort(uint pixel1, uint pixel2, double frac) // , bool opaque /* true */)
{
  int a, b, c;
  uint res1, res2;
   
  // 8 bit fixed point multipliers
  b = 256 * frac;
  
  // these compare to 0 things where 0 and N are the possible returns run fast
  b = (b < 0 ? 0 : b); 
  
  // and add and sub are faster than a branch.
  c = b - 256;
  c = (c > 0) ? 0 : c;
  b = c + 256;
  
  a = 256 - b;
  
  // lo bits: mask, multiply, mask, shift
  res1 = (((pixel1 & 0x00FF00FF) * a) & 0xFF00FF00) >> 8;
  res1 += ((((pixel2 & 0x00FF00FF) * b) & 0xFF00FF00)) >> 8;

  // hi bits: mask, shift, multiply, mask
  res2 = (((pixel1 & 0xFF00FF00) >> 8) * a) & 0xFF00FF00; 
  res2 += (((pixel2 & 0xFF00FF00) >> 8) * b) & 0xFF00FF00; 

  return (res1 + res2) | ((-opaque) & 0xFF000000); // set alpha = 255 if opaque
}

// This is the usual implementation and it's the biggest.  160 bytes with -O2
//
// Blend two packed AARRGGBB pixels channel by channel in floating point.
//
//   xnum1, xnum2  the two pixels to mix
//   frac          position between them: 0.0 gives xnum1, 1.0 gives xnum2;
//                 values outside 0.0..1.0 (and NaN) are clamped
//
// Returns the blended pixel; each channel is truncated toward zero.
unsigned int mixColorsF(unsigned int xnum1, unsigned int xnum2, double frac)
{
  unsigned int r1, g1, b1, a1;  
  unsigned int r2, g2, b2, a2;
  double frac1, frac2;

  // Clamp the ratio by returning the nearer endpoint.  Without this a ratio
  // outside 0..1 makes a weight negative, and converting a negative double
  // to unsigned is undefined.  !(frac > 0) is also true for NaN.
  if(!(frac > 0.0))
    return xnum1;
  
  if(frac >= 1.0)
    return xnum2;
  
  frac1 = 1 - frac;
  frac2 = frac;
  
  b1 = xnum1 & 0xff;
  g1 = (xnum1 >> 8) & 0xff;
  r1 = (xnum1 >> 16) & 0xff;
  a1 = (xnum1 >> 24) & 0xff;
  
  b2 = xnum2 & 0xff;
  g2 = (xnum2 >> 8) & 0xff;
  r2 = (xnum2 >> 16) & 0xff;
  a2 = (xnum2 >> 24) & 0xff;

  // With 0 < frac < 1 every weighted sum lies in 0..255 after truncation,
  // so no overshoot clamp is needed before packing.
  b1 = (unsigned int)((b1 * frac1) + (b2 * frac2));
  g1 = (unsigned int)((g1 * frac1) + (g2 * frac2));
  r1 = (unsigned int)((r1 * frac1) + (r2 * frac2));
  a1 = (unsigned int)((a1 * frac1) + (a2 * frac2));
  
  return b1 | g1 << 8 | r1 << 16 | a1 << 24;
}

#define FIXPT 10

// This one is faster and shorter but misses the boat by two bits. ;-)  Why not
// 8 bit fixed point?  After all the result will be within 9 bits itself.
//
// Blend two packed AARRGGBB pixels channel by channel using FIXPT-bit fixed
// point weights.
//
//   xnum1, xnum2  the two pixels to mix
//   frac          position between them: 0.0 gives xnum1, 1.0 gives xnum2;
//                 values outside 0.0..1.0 (and NaN) are clamped
//
// Returns the blended pixel.
unsigned int mixColorsI(unsigned int xnum1, unsigned int xnum2, double frac)
{
  unsigned int r1, g1, b1, a1;  
  unsigned int r2, g2, b2, a2;
  unsigned int frac1, frac2;

  // Clamp the ratio by returning the nearer endpoint.  Without this a ratio
  // outside 0..1 makes a weight negative, and converting a negative double
  // to unsigned is undefined.  !(frac > 0) is also true for NaN.
  if(!(frac > 0.0))
    return xnum1;
  
  if(frac >= 1.0)
    return xnum2;
  
  // Derive frac1 from frac2 instead of truncating both independently:
  // two separate truncations can sum to one less than 1 << FIXPT, which
  // darkens the result even when both inputs are the same color.
  frac2 = (unsigned int)(frac * (1 << FIXPT));
  frac1 = (1u << FIXPT) - frac2;
  
  b1 = xnum1 & 0xff;
  g1 = (xnum1 >> 8) & 0xff;
  r1 = (xnum1 >> 16) & 0xff;
  a1 = (xnum1 >> 24) & 0xff;
  
  b2 = xnum2 & 0xff;
  g2 = (xnum2 >> 8) & 0xff;
  r2 = (xnum2 >> 16) & 0xff;
  a2 = (xnum2 >> 24) & 0xff;

  // The weights sum to exactly 1 << FIXPT, so every channel stays in
  // 0..255 and no overshoot clamp is needed before packing.
  b1 = ((b1 * frac1) + (b2 * frac2)) >> FIXPT;
  g1 = ((g1 * frac1) + (g2 * frac2)) >> FIXPT;
  r1 = ((r1 * frac1) + (r2 * frac2)) >> FIXPT;
  a1 = ((a1 * frac1) + (a2 * frac2)) >> FIXPT;

  return b1 | g1 << 8 | r1 << 16 | a1 << 24;
}

// The fastest and shortest code is the mixRgba code (mixRgbalong and
// mixRgbashort).  The test in main() runs all the versions and gets matching
// results, give or take fixed point rounding.

More simple C and color mixing tests

Comments