Nominal Animal
06-25-2012 09:41 PM

Quote:
Originally Posted by ntubski (Post 4711314)
I was under the impression that the compiler can be trusted to do this kind of optimization, but I just checked with gcc 4.5.2 (32bit), and it only avoids the division. The version with multiplication does use fewer instructions, perhaps gcc decided it would be faster?
In normal userspace application code I never "optimize" multiplications into shifts and additions/subtractions, since profiling never points to that kind of code as a bottleneck. The compilers do a good enough job, and readability is much more important there.
In kernel and microcontroller code this tends to matter, and there we can afford more complicated expressions, at the cost of requiring more skill from the developers.
On x86-64 (Athlon II X4 640), using gcc -W -Wall -O3 -fomit-frame-pointer, the two functions
Code:
#include <stdint.h>

/* avg := (255*avg + val) / 256, written with an explicit shift and subtraction */
uint32_t running_average_1(const uint32_t avg, const uint32_t val)
{
    return (uint32_t)( ( (((uint64_t)avg) << 8U) - (uint64_t)avg + (uint64_t)val ) >> 8U );
}

/* avg := (255*avg + val) / 256, written with an explicit multiplication and division */
uint32_t running_average_2(const uint32_t avg, const uint32_t val)
{
    return (uint32_t)( ( ((uint64_t)avg * (uint64_t)255U) + (uint64_t)val ) / (uint64_t)256U );
}
compile to the exact same code:
Code:
movl %edi, %edi
movl %esi, %eax
movq %rdi, %rdx
salq $8, %rdx
subq %rdi, %rdx
addq %rdx, %rax
shrq $8, %rax
ret
Adding -m32 makes gcc-4.6.3 stupid: it generates the equivalent code only for the multiplication case. gcc-4.4 with -m32 -W -Wall -O3 -fomit-frame-pointer compiles the shift case to an explicit shift:
Code:
running_average_1:
pushl %ebx
movl 8(%esp), %ecx
xorl %ebx, %ebx
movl %ebx, %edx
movl %ecx, %eax
shldl $8, %eax, %edx
sall $8, %eax
subl %ecx, %eax
movl 12(%esp), %ecx
sbbl %ebx, %edx
xorl %ebx, %ebx
addl %ecx, %eax
adcl %ebx, %edx
shrdl $8, %edx, %eax
popl %ebx
ret
and the explicit multiplication to a multiplication:
Code:
running_average_2:
subl $20, %esp
movl $255, %ecx
movl 24(%esp), %eax
movl %ebx, 12(%esp)
movl 28(%esp), %ebx
movl %esi, 16(%esp)
xorl %esi, %esi
mull %ecx
addl %eax, %ebx
adcl %edx, %esi
shrdl $8, %esi, %ebx
movl 16(%esp), %esi
movl %ebx, %eax
movl 12(%esp), %ebx
addl $20, %esp
ret
with both running in the exact same time (67 cycles, including the function call overhead) for random 32-bit unsigned inputs. (Note that gcc-4.6.3 consistently generates code that is one or two clock cycles slower!)
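For reference, here is a minimal sketch of one way such per-call cycle counts can be measured on x86-64. This is an illustration only, not the harness actually used for the numbers above; the rdtsc helper, the loop count, and the feedback through a volatile sink are my own assumptions, and a careful measurement would also serialize the pipeline (cpuid/rdtscp) and subtract the overhead of an empty function:
Code:
#include <stdint.h>
#include <stdio.h>

/* Hypothetical harness, for illustration only: read the time stamp
 * counter before and after a loop of calls, and report cycles/call. */
static inline uint64_t rdtsc_cycles(void)
{
    uint32_t lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}

extern uint32_t running_average_1(uint32_t avg, uint32_t val);

int main(void)
{
    enum { LOOPS = 1000000 };
    volatile uint32_t sink = 0;   /* keeps the calls from being optimized away */
    uint64_t start, end;
    int i;

    start = rdtsc_cycles();
    for (i = 0; i < LOOPS; i++)
        sink = running_average_1(sink, (uint32_t)i);
    end = rdtsc_cycles();

    printf("%.1f cycles per call (including call overhead)\n",
           (double)(end - start) / (double)LOOPS);
    return 0;
}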
On the other hand, compiling it using
Code:
avr-gcc-4.5.3 -mmcu=avr2 avg.c -O3 -S
yields the following assembly:
Code:
.file "avg.c"
__SREG__ = 0x3f
__SP_H__ = 0x3e
__SP_L__ = 0x3d
__CCP__ = 0x34
__tmp_reg__ = 0
__zero_reg__ = 1
.text
.global overhead
.type overhead, @function
overhead:
/* prologue: function */
/* frame size = 0 */
/* stack size = 0 */
.L__stack_usage = 0
ldi r22,lo8(0)
ldi r23,hi8(0)
ldi r24,hlo8(0)
ldi r25,hhi8(0)
/* epilogue start */
ret
.size overhead, .-overhead
.global running_average_1
.type running_average_1, @function
running_average_1:
push r4
push r5
push r6
push r7
push r8
push r9
push r10
push r11
push r12
push r13
push r14
push r15
push r16
push r17
/* prologue: function */
/* frame size = 0 */
/* stack size = 14 */
.L__stack_usage = 14
mov r4,r18
mov r5,r19
mov r6,r20
mov r7,r21
mov r8,r22
mov r9,r23
mov r10,r24
mov r11,r25
mov r18,r22
mov r19,r9
mov r20,r10
mov r21,r11
ldi r22,lo8(0)
ldi r23,lo8(0)
ldi r24,lo8(0)
ldi r25,lo8(0)
ldi r16,lo8(8)
rcall __ashldi3
mov r30,r18
sub r30,r8
ldi r26,lo8(1)
cp r18,r30
brlo .L3
ldi r26,lo8(0)
.L3:
mov r31,r19
sub r31,r9
ldi r18,lo8(1)
cp r19,r31
brlo .L4
ldi r18,lo8(0)
.L4:
mov r19,r31
sub r19,r26
mov r26,r19
ldi r19,lo8(1)
cp r31,r26
brlo .L5
ldi r19,lo8(0)
.L5:
or r18,r19
mov r27,r20
sub r27,r10
ldi r19,lo8(1)
cp r20,r27
brlo .L6
ldi r19,lo8(0)
.L6:
mov r31,r27
sub r31,r18
ldi r18,lo8(1)
cp r27,r31
brlo .L7
ldi r18,lo8(0)
.L7:
or r19,r18
mov r20,r21
sub r20,r11
ldi r18,lo8(1)
cp r21,r20
brlo .L8
ldi r18,lo8(0)
.L8:
mov r15,r20
sub r15,r19
ldi r19,lo8(1)
cp r20,r15
brlo .L9
ldi r19,lo8(0)
.L9:
or r18,r19
mov r16,r22
sub r16,r18
ldi r18,lo8(1)
cp r22,r16
brlo .L11
ldi r18,lo8(0)
.L11:
mov r17,r23
sub r17,r18
ldi r18,lo8(1)
cp r23,r17
brlo .L13
ldi r18,lo8(0)
.L13:
mov r27,r24
sub r27,r18
ldi r18,lo8(1)
cp r24,r27
brlo .L15
ldi r18,lo8(0)
.L15:
sub r25,r18
mov r18,r30
add r18,r4
ldi r19,lo8(1)
cp r18,r30
brlo .L16
ldi r19,lo8(0)
.L16:
mov r24,r26
add r24,r5
ldi r20,lo8(1)
cp r24,r26
brlo .L17
ldi r20,lo8(0)
.L17:
add r19,r24
ldi r21,lo8(1)
cp r19,r24
brlo .L18
ldi r21,lo8(0)
.L18:
or r20,r21
mov r24,r31
add r24,r6
ldi r21,lo8(1)
cp r24,r31
brlo .L19
ldi r21,lo8(0)
.L19:
add r20,r24
ldi r22,lo8(1)
cp r20,r24
brlo .L20
ldi r22,lo8(0)
.L20:
or r21,r22
mov r24,r15
add r24,r7
ldi r23,lo8(1)
cp r24,r15
brlo .L21
ldi r23,lo8(0)
.L21:
add r21,r24
ldi r22,lo8(1)
cp r21,r24
brlo .L22
ldi r22,lo8(0)
.L22:
or r22,r23
add r22,r16
ldi r23,lo8(1)
cp r22,r16
brlo .L24
ldi r23,lo8(0)
.L24:
add r23,r17
ldi r24,lo8(1)
cp r23,r17
brlo .L26
ldi r24,lo8(0)
.L26:
add r24,r27
ldi r30,lo8(1)
cp r24,r27
brlo .L28
ldi r30,lo8(0)
.L28:
add r25,r30
ldi r16,lo8(8)
rcall __lshrdi3
mov r25,r21
mov r22,r18
mov r23,r19
mov r24,r20
/* epilogue start */
pop r17
pop r16
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop r8
pop r7
pop r6
pop r5
pop r4
ret
.size running_average_1, .-running_average_1
.global running_average_2
.type running_average_2, @function
running_average_2:
push r6
push r7
push r8
push r9
push r10
push r11
push r12
push r13
push r14
push r15
push r16
push r17
push r29
push r28
in r28,__SP_L__
in r29,__SP_H__
sbiw r28,8
in __tmp_reg__,__SREG__
cli
out __SP_H__,r29
out __SREG__,__tmp_reg__
out __SP_L__,r28
/* prologue: function */
/* frame size = 8 */
/* stack size = 22 */
.L__stack_usage = 22
mov r10,r18
mov r11,r19
mov r12,r20
mov r13,r21
clr r14
clr r15
clr r16
clr r17
std Y+1,r18
std Y+2,r11
std Y+3,r12
std Y+4,r13
std Y+5,r14
std Y+6,r15
std Y+7,r16
std Y+8,r17
mov r6,r22
mov r7,r23
mov r8,r24
mov r9,r25
mov r18,r22
mov r19,r7
mov r20,r8
mov r21,r9
ldi r22,lo8(0)
ldi r23,lo8(0)
ldi r24,lo8(0)
ldi r25,lo8(0)
ldi r16,lo8(4)
rcall __ashldi3
mov r15,r18
sub r15,r6
ldi r31,lo8(1)
cp r18,r15
brlo .L31
ldi r31,lo8(0)
.L31:
mov r30,r19
sub r30,r7
ldi r18,lo8(1)
cp r19,r30
brlo .L32
ldi r18,lo8(0)
.L32:
mov r17,r30
sub r17,r31
ldi r19,lo8(1)
cp r30,r17
brlo .L33
ldi r19,lo8(0)
.L33:
or r18,r19
mov r30,r20
sub r30,r8
ldi r19,lo8(1)
cp r20,r30
brlo .L34
ldi r19,lo8(0)
.L34:
mov r14,r30
sub r14,r18
ldi r18,lo8(1)
cp r30,r14
brlo .L35
ldi r18,lo8(0)
.L35:
or r19,r18
mov r20,r21
sub r20,r9
ldi r18,lo8(1)
cp r21,r20
brlo .L36
ldi r18,lo8(0)
.L36:
mov r13,r20
sub r13,r19
ldi r19,lo8(1)
cp r20,r13
brlo .L37
ldi r19,lo8(0)
.L37:
or r18,r19
mov r12,r22
sub r12,r18
ldi r18,lo8(1)
cp r22,r12
brlo .L39
ldi r18,lo8(0)
.L39:
mov r11,r23
sub r11,r18
ldi r18,lo8(1)
cp r23,r11
brlo .L41
ldi r18,lo8(0)
.L41:
mov r10,r24
sub r10,r18
ldi r18,lo8(1)
cp r24,r10
brlo .L43
ldi r18,lo8(0)
.L43:
mov r9,r25
sub r9,r18
mov r18,r15
mov r19,r17
mov r20,r14
mov r21,r13
mov r22,r12
mov r23,r11
mov r24,r10
mov r25,r9
ldi r16,lo8(4)
rcall __ashldi3
add r18,r15
ldi r31,lo8(1)
cp r18,r15
brlo .L44
ldi r31,lo8(0)
.L44:
add r19,r17
ldi r30,lo8(1)
cp r19,r17
brlo .L45
ldi r30,lo8(0)
.L45:
mov r15,r31
add r15,r19
ldi r31,lo8(1)
cp r15,r19
brlo .L46
ldi r31,lo8(0)
.L46:
or r30,r31
add r20,r14
ldi r27,lo8(1)
cp r20,r14
brlo .L47
ldi r27,lo8(0)
.L47:
mov r17,r30
add r17,r20
ldi r19,lo8(1)
cp r17,r20
brlo .L48
ldi r19,lo8(0)
.L48:
or r27,r19
add r21,r13
ldi r26,lo8(1)
cp r21,r13
brlo .L49
ldi r26,lo8(0)
.L49:
mov r13,r27
add r13,r21
ldi r19,lo8(1)
cp r13,r21
brlo .L50
ldi r19,lo8(0)
.L50:
or r26,r19
add r22,r12
ldi r30,lo8(1)
cp r22,r12
brlo .L51
ldi r30,lo8(0)
.L51:
mov r14,r26
add r14,r22
ldi r19,lo8(1)
cp r14,r22
brlo .L52
ldi r19,lo8(0)
.L52:
or r30,r19
add r23,r11
ldi r31,lo8(1)
cp r23,r11
brlo .L53
ldi r31,lo8(0)
.L53:
add r30,r23
ldi r19,lo8(1)
cp r30,r23
brlo .L54
ldi r19,lo8(0)
.L54:
or r31,r19
add r24,r10
ldi r19,lo8(1)
cp r24,r10
brlo .L55
ldi r19,lo8(0)
.L55:
add r31,r24
ldi r20,lo8(1)
cp r31,r24
brlo .L56
ldi r20,lo8(0)
.L56:
or r19,r20
add r25,r9
add r25,r19
ldd r11,Y+1
add r18,r11
ldi r19,lo8(1)
cp r18,r11
brlo .L57
ldi r19,lo8(0)
.L57:
ldd r16,Y+2
add r15,r16
ldi r20,lo8(1)
cp r15,r16
brlo .L58
ldi r20,lo8(0)
.L58:
add r19,r15
ldi r24,lo8(1)
cp r19,r15
brlo .L59
ldi r24,lo8(0)
.L59:
or r20,r24
ldd r26,Y+3
add r17,r26
ldi r21,lo8(1)
cp r17,r26
brlo .L60
ldi r21,lo8(0)
.L60:
add r20,r17
ldi r24,lo8(1)
cp r20,r17
brlo .L61
ldi r24,lo8(0)
.L61:
or r21,r24
ldd r23,Y+4
add r23,r13
ldi r22,lo8(1)
ldd r10,Y+4
cp r23,r10
brlo .L62
ldi r22,lo8(0)
.L62:
add r21,r23
ldi r24,lo8(1)
cp r21,r23
brlo .L63
ldi r24,lo8(0)
.L63:
or r22,r24
ldi r23,lo8(1)
ldd r11,Y+5
cp r14,r11
brlo .L64
ldi r23,lo8(0)
.L64:
add r22,r14
ldi r24,lo8(1)
cp r22,r14
brlo .L65
ldi r24,lo8(0)
.L65:
or r23,r24
ldi r24,lo8(1)
ldd r12,Y+6
cp r30,r12
brlo .L66
ldi r24,lo8(0)
.L66:
add r23,r30
ldi r26,lo8(1)
cp r23,r30
brlo .L67
ldi r26,lo8(0)
.L67:
or r24,r26
ldi r30,lo8(1)
ldd r13,Y+7
cp r31,r13
brlo .L68
ldi r30,lo8(0)
.L68:
add r24,r31
ldi r26,lo8(1)
cp r24,r31
brlo .L69
ldi r26,lo8(0)
.L69:
or r30,r26
add r25,r30
ldi r16,lo8(8)
rcall __lshrdi3
mov r25,r21
mov r22,r18
mov r23,r19
mov r24,r20
/* epilogue start */
adiw r28,8
in __tmp_reg__,__SREG__
cli
out __SP_H__,r29
out __SREG__,__tmp_reg__
out __SP_L__,r28
pop r28
pop r29
pop r17
pop r16
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop r8
pop r7
pop r6
ret
.size running_average_2, .-running_average_2
Now, avr2 is an 8-bit instruction set, which should explain why the assembly is so long. The multi-register shifts are done by calling the __ashldi3()/__lshrdi3() (and __ashrdi3()) gcc helper functions. Even without going into the details, the code with the explicit shifts (running_average_1) is shorter and simpler: just 20 conditional jumps. In comparison, the code with the explicit multiplication (running_average_2) is a lot longer and more complex, with something like 36 conditional jumps.
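To give a sense of what those shift helpers have to do, here is a rough C sketch of a multi-register logical right shift, written as a 64-bit value held in two 32-bit halves. This is my own illustration (the function name is invented), not the actual libgcc source; on avr2 the same idea has to be carried out across eight 8-bit registers, which is why each helper call is so expensive:
Code:
#include <stdint.h>

/* Rough sketch of a multi-register logical right shift by 0..63 bits,
 * built only from 32-bit operations on the two halves. */
void lshr64_sketch(uint32_t *hi, uint32_t *lo, unsigned shift)
{
    if (shift == 0)
        return;
    if (shift < 32U) {
        *lo = (*lo >> shift) | (*hi << (32U - shift));
        *hi >>= shift;
    } else {
        *lo = *hi >> (shift - 32U);
        *hi = 0;
    }
}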
Using hand assembly on an 8-bit architecture, all you really need is the logical equivalent of
Code:
# avg[0] is the low byte of avg, avg[3] is the highest byte
# val[0] is the low byte of val, val[3] is the highest byte
# res[0] is the low byte of result, res[3] is the highest byte
set a0 = 0
set a1 = avg[0]
set a2 = avg[1]
set a3 = avg[2]
set a4 = avg[3]
add val[0] to a0
add val[1] and carry to a1
add val[2] and carry to a2
add val[3] and carry to a3
add 0 and carry to a4
subtract avg[0] from a0
subtract avg[1] and carry from a1
subtract avg[2] and carry from a2
subtract avg[3] and carry from a3
subtract 0 and carry from a4
set res[0] = a1
set res[1] = a2
set res[2] = a3
set res[3] = a4
where carry is the carry flag. Practically all CPUs and microcontrollers have a carry flag; addition and subtraction with and without carry are usually separate mnemonics. On an avr2 MCU, the above takes about twenty instructions, and is at least an order of magnitude faster than the C code.
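Here is a portable C sketch of that byte-wise carry chain, written for this discussion (the function name, the a[] accumulator and the explicit carry/borrow variables are my own inventions; a real avr2 version would simply use add/adc and sub/sbc). It computes (255*avg + val) / 256 without any 64-bit arithmetic, exactly like the pseudocode: a[] holds avg << 8 in five bytes, val is added with carries, avg is subtracted with borrows, and the final shift right by 8 just drops a[0]:
Code:
#include <stdint.h>

uint32_t running_average_bytes(const uint32_t avg, const uint32_t val)
{
    uint8_t  a[5];                /* a[0] low byte .. a[4] highest byte */
    unsigned carry = 0, borrow = 0;
    int      i;

    /* a = avg << 8 */
    a[0] = 0;
    for (i = 0; i < 4; i++)
        a[i + 1] = (uint8_t)(avg >> (8 * i));

    /* add val[0..3], propagating the carry up to a[4] */
    for (i = 0; i < 4; i++) {
        const unsigned sum = a[i] + (uint8_t)(val >> (8 * i)) + carry;
        a[i] = (uint8_t)sum;
        carry = sum >> 8;
    }
    a[4] = (uint8_t)(a[4] + carry);

    /* subtract avg[0..3], propagating the borrow up to a[4] */
    for (i = 0; i < 4; i++) {
        const unsigned diff = a[i] - (uint8_t)(avg >> (8 * i)) - borrow;
        a[i] = (uint8_t)diff;
        borrow = (diff >> 8) & 1U;
    }
    a[4] = (uint8_t)(a[4] - borrow);

    /* >> 8: the result is bytes a[1..4] */
    return  (uint32_t)a[1]
         | ((uint32_t)a[2] << 8)
         | ((uint32_t)a[3] << 16)
         | ((uint32_t)a[4] << 24);
}
Compiled as C this still will not match hand-written add/adc code, but it shows the data flow the twenty-instruction assembly version follows.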
It is such a drastic improvement over the C code that I'm now wondering whether I should have just used the explicit multiplication after all: on such limited architectures the code is better written in hand assembly anyway (perhaps inlined in the C source files).
To be honest, I've only ever seen the division case handled stupidly by the compiler, never the multiplication case. (If you ever run into a reference to __udivdi3(), you'll know what I mean.) That is why I used the explicit shift. Now I think I probably should not have, since the multiplication code is much more readable, and just as good in the generic case. On difficult architectures like avr2, writing it in inline assembly gives an order of magnitude simpler and faster code, so tweaking the little bits is insignificant in comparison.
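As a concrete illustration of the division case (a minimal example written for this post, not code from the thread): compiling the following with gcc -m32 -O2 -S produces a call to __udivdi3, because the 32-bit target has no 64-bit divide instruction:
Code:
#include <stdint.h>

/* With gcc -m32 -O2 -S, this division becomes a call to __udivdi3. */
uint64_t div64(uint64_t a, uint64_t b)
{
    return a / b;
}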
In other words, a very good point, ntubski. I guess it was an ill-advised premature optimization on my part.