I don't get what the problem is. Testing on Linux/64 bit, GCC 4.6, -O3, -mtune=native, -msse4.1 (i.e. a very old compiler/system), this code
/* Element-wise sum of two int arrays: c[i] = a[i] + b[i] for i in [0, n).
 * (Despite the name it is a vector add, not a dot product.)
 * Does nothing when n <= 0. */
void dot_int(int *a, int *b, int *c, int n) {
    int i = 0;
    while (i < n) {
        c[i] = a[i] + b[i];
        ++i;
    }
}
compiles to this inner loop:
.L4:
movdqu (%rdi,%rax), %xmm1
addl $1, %r8d
movdqu (%rsi,%rax), %xmm0
paddd %xmm1, %xmm0
movdqu %xmm0, (%rdx,%rax)
addq $16, %rax
cmpl %r8d, %r10d
ja .L4
cmpl %r9d, %ecx
je .L1
While this code
/* Same element-wise add as dot_int, but the __restrict qualifiers
 * promise the compiler that a, b and c never alias, enabling better
 * vectorization (no runtime overlap checks, memory-operand paddd). */
void dot_int_restrict(int * __restrict a, int * __restrict b, int * __restrict c, int n) {
    int idx;
    for (idx = 0; idx < n; ++idx)
        c[idx] = a[idx] + b[idx];
}
compiles to this:
.L15:
movdqu (%rbx,%rax), %xmm0
addl $1, %r8d
paddd 0(%rbp,%rax), %xmm0
movdqu %xmm0, (%r11,%rax)
addq $16, %rax
cmpl %r10d, %r8d
jb .L15
addl %r12d, %r9d
cmpl %r12d, %r13d
je .L10
As you can clearly see, there is one load fewer: with `restrict`, GCC correctly determined that the store to `c` cannot overwrite `a` or `b`, so it can feed one operand to `paddd` straight from memory instead of explicitly loading it into a register first.
There's also room for many more optimizations — GCC doesn't know that the parameters are, e.g., 128-bit aligned, so it must generate a large preamble to check for alignment/overlap issues (YMMV), and a postamble to deal with the leftover elements (any tail narrower than 128 bits). This actually happens with both versions above. This is the complete code generated for dot_int:
dot_int:
.LFB626:
.cfi_startproc
# Early-out when n (in %ecx) <= 0.
testl %ecx, %ecx
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
jle .L1
# Preamble: %r10d = n/4 (number of full 128-bit chunks), %r9d = n rounded
# down to a multiple of 4, then a chain of cmp/seta tests checking that the
# three arrays do not overlap (and that n is large enough to be worth it).
# If vectorization is not provably safe, jump to .L6 -> scalar loop only.
leaq 16(%rdx), %r11
movl %ecx, %r10d
shrl $2, %r10d
leal 0(,%r10,4), %r9d
testl %r9d, %r9d
je .L6
leaq 16(%rdi), %rax
cmpl $6, %ecx
seta %r8b
cmpq %rax, %rdx
seta %al
cmpq %r11, %rdi
seta %bl
orl %ebx, %eax
andl %eax, %r8d
leaq 16(%rsi), %rax
cmpq %rax, %rdx
seta %al
cmpq %r11, %rsi
seta %r11b
orl %r11d, %eax
testb %al, %r8b
je .L6
xorl %eax, %eax
xorl %r8d, %r8d
.p2align 4,,10
.p2align 3
# Vectorized main loop: two unaligned 128-bit loads (4 ints each),
# packed 32-bit add, unaligned store; %rax advances 16 bytes per pass.
.L4:
movdqu (%rdi,%rax), %xmm1
addl $1, %r8d
movdqu (%rsi,%rax), %xmm0
paddd %xmm1, %xmm0
movdqu %xmm0, (%rdx,%rax)
addq $16, %rax
cmpl %r8d, %r10d
ja .L4
# If the vector loop covered all n elements, we are done.
cmpl %r9d, %ecx
je .L1
# Scalar postamble: advance the three pointers past the %r9d elements
# already done, then add the remaining (n mod 4) ints one at a time.
.L3:
movslq %r9d, %r8
xorl %eax, %eax
salq $2, %r8
addq %r8, %rdx
addq %r8, %rdi
addq %r8, %rsi
.p2align 4,,10
.p2align 3
.L5:
movl (%rdi,%rax,4), %r8d
addl (%rsi,%rax,4), %r8d
movl %r8d, (%rdx,%rax,4)
addq $1, %rax
leal (%r9,%rax), %r8d
cmpl %r8d, %ecx
jg .L5
.L1:
popq %rbx
.cfi_remember_state
.cfi_def_cfa_offset 8
ret
# Overlap/size check failed: process the whole array in the scalar loop.
.L6:
.cfi_restore_state
xorl %r9d, %r9d
jmp .L3
.cfi_endproc
Now, in your case the ints are effectively not aligned (as they live on the stack), but if you can make them aligned and tell GCC so, then you can improve the generated code:
/* A 16-byte (four-int) SIMD vector type; GCC/Clang vector extension. */
typedef int intvec __attribute__((vector_size(16)));

/* Element-wise add over n 16-byte vectors: c[i] = a[i] + b[i].
 * Because the parameters are intvec*, the compiler knows every access
 * is 16-byte aligned, so no alignment preamble/postamble is needed;
 * restrict additionally rules out aliasing. Does nothing when n == 0. */
void dot_int_restrict_alig(intvec * restrict a,
                           intvec * restrict b,
                           intvec * restrict c,
                           unsigned int n) {
    intvec * const end = c + n;
    while (c != end) {
        *c++ = *a++ + *b++;
    }
}
This generates this code, with no preamble:
dot_int_restrict_alig:
.LFB628:
.cfi_startproc
# n == 0 (in %ecx): return immediately.
testl %ecx, %ecx
je .L23
# End offset in bytes: %rcx = n * 16; %rax is the running byte offset.
subl $1, %ecx
xorl %eax, %eax
addq $1, %rcx
salq $4, %rcx
.p2align 4,,10
.p2align 3
# Main loop: aligned 128-bit load, packed 32-bit add with a memory
# operand, aligned store. No overlap or alignment checks anywhere.
.L25:
movdqa (%rdi,%rax), %xmm0
paddd (%rsi,%rax), %xmm0
movdqa %xmm0, (%rdx,%rax)
addq $16, %rax
cmpq %rcx, %rax
jne .L25
.L23:
rep
ret
.cfi_endproc
Note the use of the aligned 128-bit load instruction (movdqa — the "a" stands for aligned — versus movdqu, unaligned).