alitokur dot com
should you use inline?
Do you think about the overhead of a function call on some long nights, or are you just normal? This night will be different, my friends. Tonight we are going to crack open some C++ code, then we are going to talk about some things that people don’t usually want to discuss. And then we’ll decide: should we use inline or not?
Here is a quote from Optimizing Software in C++:
The function call makes the microprocessor jump to a different code address and back again. This may take up to 4 clock cycles. In most cases, the microprocessor is able to overlap the call and return operations with other calculations to save time.
Here is a simple example of a function compiled with the flags: -O2 -masm=intel
// Returns a + b.
// noinline keeps this a real out-of-line call at -O2 so the call/ret
// overhead stays visible in the generated assembly.
__attribute__((noinline))
int sum(int a, int b) {
    const int total = a + b;
    return total;
}
// Benchmark driver: calls sum(i, i + 1) for i in [0, 9999999) and
// accumulates the results into foo. Always returns 0.
int main (int argc, char *argv[]) {
    // volatile forces every update of foo to go through memory, so the
    // optimizer cannot fold the whole loop into a constant.
    // unsigned because the running total (9999999^2) does not fit in 32
    // bits: unsigned arithmetic wraps, whereas overflowing a signed int is
    // undefined behavior. The generated adds are still 32-bit.
    volatile unsigned foo = 0;
    for (int i = 0; i < 9999999; i++) {
        foo += static_cast<unsigned>(sum(i, i + 1));
    }
    return 0;
}
I know, I could use -O0 to disable inlining, but I’d like to keep things closer to a production setup. So instead of turning off optimization, I will use __attribute__((noinline)). And don’t worry about the ugly volatile here; it is just there to prevent constant folding.
# int sum(int a, int b), out-of-line copy.
# In:  edi = a, esi = b (first two integer argument registers, System V AMD64)
# Out: eax = a + b
sum(int, int):
lea eax, [rdi + rsi]          # eax = a + b; lea does the add without touching flags
ret
# main, built with sum() marked noinline: one real call per loop iteration.
# Register roles: edi = i (also first argument), ebx = i + 1 (kept across the call),
# [rsp + 12] = foo.
main:
push rbx                      # rbx is callee-saved; it carries i + 1 across the call
sub rsp, 16                   # stack space for foo; push + sub leave rsp 16-byte aligned at the call
mov dword ptr [rsp + 12], 0   # foo = 0
xor edi, edi                  # i = 0
.LBB1_1:
lea ebx, [rdi + 1]            # ebx = i + 1
mov esi, ebx                  # second argument = i + 1
call sum(int, int)            # eax = sum(i, i + 1)
add dword ptr [rsp + 12], eax # foo += eax, done directly on the stack slot
mov edi, ebx                  # i = i + 1
cmp ebx, 9999999              # loop until i reaches 9999999
jne .LBB1_1
xor eax, eax                  # return 0
add rsp, 16
pop rbx
ret
# NOTE(review): this listing repeats the sum(int, int) body already shown before main.
# In: edi = a, esi = b. Out: eax = a + b.
sum(int, int):
lea eax, [rdi + rsi]          # eax = a + b
ret
Without noinline (that is, once the compiler is free to inline sum), it should look something like this:
# int sum(int a, int b): the standalone copy is still emitted,
# but main in this listing contains no call to it.
# In: edi = a, esi = b. Out: eax = a + b.
sum(int, int):
lea eax, [rdi + rsi]          # eax = a + b
ret
# main with sum() inlined: no call instruction remains, and the loop body is
# unrolled to handle 9 consecutive values of i per pass.
# Register roles: eax = 2*i + 17 for the first i of the pass, ecx = scratch copy of foo,
# [rsp - 4] = foo.
main:
mov dword ptr [rsp - 4], 0    # foo = 0; kept below rsp (red zone), possible because main makes no calls here
mov eax, 17                   # first pass: i = 0, so 2*i + 17 = 17
.LBB1_1:
mov ecx, dword ptr [rsp - 4]  # load foo
add ecx, eax
add ecx, -16                  # + (eax - 16) = 2*i + 1 = sum(i, i + 1)
mov dword ptr [rsp - 4], ecx  # store foo; each step reloads and stores foo separately
mov ecx, dword ptr [rsp - 4]
lea ecx, [rax + rcx - 14]     # foo + sum(i + 1, i + 2)
mov dword ptr [rsp - 4], ecx
mov ecx, dword ptr [rsp - 4]
lea ecx, [rax + rcx - 12]     # foo + sum(i + 2, i + 3)
mov dword ptr [rsp - 4], ecx
mov ecx, dword ptr [rsp - 4]
lea ecx, [rax + rcx - 10]     # foo + sum(i + 3, i + 4)
mov dword ptr [rsp - 4], ecx
mov ecx, dword ptr [rsp - 4]
lea ecx, [rax + rcx - 8]      # foo + sum(i + 4, i + 5)
mov dword ptr [rsp - 4], ecx
mov ecx, dword ptr [rsp - 4]
lea ecx, [rax + rcx - 6]      # foo + sum(i + 5, i + 6)
mov dword ptr [rsp - 4], ecx
mov ecx, dword ptr [rsp - 4]
lea ecx, [rax + rcx - 4]      # foo + sum(i + 6, i + 7)
mov dword ptr [rsp - 4], ecx
mov ecx, dword ptr [rsp - 4]
add ecx, eax
add ecx, -2                   # foo + sum(i + 7, i + 8)
mov dword ptr [rsp - 4], ecx
add dword ptr [rsp - 4], eax  # foo += sum(i + 8, i + 9), as a single memory add
add eax, 18                   # advance i by 9 (2 * 9 = 18)
cmp eax, 20000015             # 17 + 18 * 1111111: exit after 1111111 passes = 9999999 additions
jne .LBB1_1
xor eax, eax                  # return 0
ret
And then let's check the perf results. This is for the noinline version:
54.270.085 cycles
93.856.882 instructions # 1,73 insn per cycle
30.751.057 branches
62.489 branch-misses # 0,20% of all branches
0,012193552 seconds time elapsed
0,012213000 seconds user
0,000000000 seconds sys
And this is for inline:
14.141.612 cycles
38.742.668 instructions # 2,74 insn per cycle
1.977.227 branches
60.926 branch-misses # 3,08% of all branches
0,003771016 seconds time elapsed
0,003803000 seconds user
0,000000000 seconds sys