Hi,
I have a program which does 1-byte vector sum using SSE intrinsics.
When I compile it with Intel compiler (ICC 15.0.1, x86_64, Linux, SandyBridge CPU) it segfaults, it looks like the end condition of the loop is not checked correctly. Same code works with GCC.
My only optimization flag -s "-O3" (When compiling with -O1 the program works).
#include <stdio.h> #include <string.h> #include <stdint.h> #include <stdlib.h> #include <xmmintrin.h> #include <emmintrin.h> static inline void add_single(void *dst, void *src) { *((int8_t*)dst) += *((int8_t*)src); } __attribute__((noinline)) void vector_sum_char(void *dst, void *src, unsigned length) { const unsigned factor = sizeof(__m128i); __m128i s[1], d[1]; int i, j; i = 0; while(i < (int)(length - factor + 1)) { printf("src=%p dst=%p i=%d max=%d\n", src, dst, i, (int)(length - factor + 1)); j = 0; d[j] = (__m128i)_mm_loadu_si128(dst); s[j] = (__m128i)_mm_loadu_si128(src); src += sizeof(__m128i); j = 0; _mm_storeu_si128(dst, _mm_add_epi8(d[j], s[j])); dst += sizeof(__m128i); i += factor; } for (i = 0; i < (length%factor); ++i) { add_single(dst, src); ++dst; ++src; } } int main(int argc, char **argv) { int num_elems = 17; void *src = calloc(1, num_elems); void *dst = calloc(1, num_elems); vector_sum_char(dst, src, num_elems); free(src); free(dst); return 0; }
Output looks like this:
src=0x6122e0 dst=0x612300 i=0 max=2 src=0x6122f0 dst=0x612310 i=16 max=2 src=0x612300 dst=0x612320 i=32 max=2 src=0x612310 dst=0x612330 i=48 max=2 src=0x612320 dst=0x612340 i=64 max=2 src=0x612330 dst=0x612350 i=80 max=2 src=0x612340 dst=0x612360 i=96 max=2 src=0x612350 dst=0x612370 i=112 max=2 ... Segmentation fault (core dumped) gdb: Program received signal SIGSEGV, Segmentation fault. vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336) 42 _mm_storeu_si128(dst, _mm_add_epi8(d[j], s[j])); (gdb) bt #0 vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336) #1 0x00000000004015ff in main (argc=-10336, argv=0x0) (gdb) f 0 #0 vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336) 42 _mm_storeu_si128(dst, _mm_add_epi8(d[j], s[j])); 0x000000000040167f <+95>: mov %rbp,%rsi 0x0000000000401682 <+98>: mov %rbx,%rdx 0x0000000000401685 <+101>: mov %r12d,%ecx 0x0000000000401688 <+104>: mov %r15d,%r8d 0x000000000040168b <+107>: xor %eax,%eax 0x000000000040168d <+109>: callq 0x401338 <printf@plt> => 0x0000000000401692 <+114>: movdqu (%rbx),%xmm1 0x0000000000401696 <+118>: movdqu 0x0(%rbp),%xmm0 0x000000000040169b <+123>: paddb %xmm0,%xmm1 0x000000000040169f <+127>: inc %r14d 0x00000000004016a2 <+130>: movdqu %xmm1,(%rbx) 0x00000000004016a6 <+134>: add $0x10,%rbp 0x00000000004016aa <+138>: add $0x10,%rbx 0x00000000004016ae <+142>: add $0x10,%r12d 0x00000000004016b2 <+146>: cmp %r13d,%r14d 0x00000000004016b5 <+149>: jl 0x40167a <vector_sum_char+90>
please note here that in address <+146> r13d is compared, but on <+101> and <+142> r12d is being used as "i".