This compiler issue can be reproduced with the following three source files:
/* main.c */
#include <stdio.h>

/*
 * Number of test inputs.
 * Defining LEN as 1 gives the correct result; 2 or larger gives the wrong
 * result unless the macro "ICL_WORKAROUND" is defined in LreciprtL.c.
 */
#define LEN 2

/* Function to calculate x^(-0.5) */
int LreciprtL(int x);

/*
 * Fill LEN inputs with 0.760045 scaled by 2^31, print each input, then print
 * LreciprtL() of each input; both are divided by 2^31 for display.
 * Always returns 0.
 */
static int bench_reciprt(void)
{
    int Lsrc[LEN];
    int i;

    /* Every element gets the same fixed-point input value. */
    for (i = 0; i < LEN; i++)
        Lsrc[i] = (int) (0.760045 * 2147483648.0 + 0.5);

    for (i = 0; i < LEN; i++)
        printf("in[%d]: %lf\n", i, (double) Lsrc[i] / 2147483648.0);

    printf("-------------------\n");

    for (i = 0; i < LEN; i++)
        printf("out[%d]: %lf\n", i, (double) LreciprtL(Lsrc[i]) / 2147483648.0);

    return 0;
}

/* Program entry point: runs the benchmark and returns its status. */
int main()
{
    return bench_reciprt();
}
/* LreciprtL.c */
#include "int_math.h"

/*
 * Uncomment this to enable the workaround, so that the function gives the
 * right answer, e.g. 0.760045^(-0.5) / 2 = 0.573522
 * (the "/ 2" scales the result down to less than 1.0).
 */
//#define ICL_WORKAROUND

/* 2^30, i.e. 0.5 when values are scaled by 2^31. */
static const int L05 = 1073741824;
/*
 * Calculate x^(-0.5) for 0.25 < x < 1, result in 2Q30 (down scaled by 2).
 *
 * x is a fixed-point value scaled by 2^31. The returned value, divided by
 * 2^31, equals x^(-0.5) / 2.
 *
 * The seed is a second-order polynomial in x evaluated in Horner form,
 * followed by three refinement steps of the form
 *     iy <- iy + iy * (1 - x * iy * iy)
 * (Newton-Raphson for the inverse square root, written for the 2Q30 scale).
 *
 * The ICL_WORKAROUND branch (loop) and the default branch (unrolled) execute
 * the same three refinement steps; only the code shape differs.
 */
int LreciprtL(int x)
{
    /* 1.0 expressed in 2Q30. */
    const int   PLUSONE2Q30 = L05;
    /* Polynomial coefficients, pre-divided by 4 so they fit the format. */
    const int   a0  = (const int)   (-3.4982 / 4 * 2147483648.0 + 0.5);
    const short a1  = (const short) ( 1.8077 / 4 * 32768.0 + 0.5);
    const int   iy0 = (const int)   ( 2.7260 / 4 * 2147483648.0 + 0.5);
#ifdef ICL_WORKAROUND
    int i;
#endif

    /* Horner evaluation: iy = iy0 + x * (a0 + x * a1). */
    int a  = LmacLLS(a0, x, a1);
    int iy = LmacLLS(iy0, x, S_L(a));

    /* Coefficients were divided by 4; shifting by 1 leaves the /2 scale. */
    iy = LshlLU(iy, 1);

#ifdef ICL_WORKAROUND
    /* Workaround form: the three refinement steps written as a loop. */
    for (i = 0; i < 3; i++) {
        a  = LmpyLL(x, iy) ;
        a  = LsubLL(PLUSONE2Q30, LshlLU(LmpyLL(a, iy), 1)) ;
        iy = LmacLLL(iy, a, iy) ;
    }
#else
    /* Default form: the same three refinement steps, manually unrolled. */
    a  = LmpyLL(x, iy) ;
    a  = LsubLL(PLUSONE2Q30, LshlLU(LmpyLL(a, iy), 1)) ;
    iy = LmacLLL(iy, a, iy) ;

    a  = LmpyLL(x, iy) ;
    a  = LsubLL(PLUSONE2Q30, LshlLU(LmpyLL(a, iy), 1)) ;
    iy = LmacLLL(iy, a, iy) ;

    a  = LmpyLL(x, iy) ;
    a  = LsubLL(PLUSONE2Q30, LshlLU(LmpyLL(a, iy), 1)) ;
    iy = LmacLLL(iy, a, iy) ;
#endif

    return iy ;
}
/* int_math.h */
/* Define basic math operations */
#ifndef INT_MATH_H
#define INT_MATH_H

/*
 * __forceinline is an MSVC / Intel (icl) keyword. Provide a fallback so the
 * header also builds with other compilers; under MSVC-compatible compilers
 * nothing is redefined.
 */
#if !defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__forceinline)
#  if defined(__GNUC__)
#    define __forceinline inline __attribute__((always_inline))
#  else
#    define __forceinline inline
#  endif
#endif

/*
 * Arithmetic shift left written as a multiplication by 2^s.
 * NOTE(review): signed overflow is undefined in ISO C, so callers are
 * expected to keep the result within int range and s below 31.
 */
#define _asl32(a, s) ((a) * (1 << (unsigned)(s)))

/* Returns a + a (i.e. a doubled). */
static __forceinline int L_A (int a)
{
    return a + a;
}

/* Returns a shifted right by 16 bits, converted to short. */
static __forceinline short S_L (int a)
{
    return (short) (a >> 16);
}

/* Returns a multiplied by 2^s. */
static __forceinline int LshlLU (int a, unsigned s)
{
    return (int) _asl32(a, s);
}

/* Returns a - b. */
static __forceinline int LsubLL(int a, int b)
{
    return a - b;
}

/* Returns the 64-bit product a * c shifted right by 32 bits. */
static __forceinline int AmpyLL (int a, int c)
{
    return (int)(((long long)a * c) >> 32);
}

/* Returns twice AmpyLL(a, c). */
static __forceinline int LmpyLL (int a, int c)
{
    return L_A(AmpyLL(a, c));
}

/* Returns the 64-bit product a * c shifted right by 16 bits. */
static __forceinline int AmpyLS (int a, short c)
{
    return (int)(((long long)a * c) >> 16);
}

/* Multiply-accumulate: returns a + 2 * AmpyLS(x, y). */
static __forceinline int LmacLLS (int a, int x, short y)
{
    return a + L_A(AmpyLS(x, y));
}

/* Multiply-accumulate: returns a + LmpyLL(x, y). */
static __forceinline int LmacLLL(int a, int x, int y)
{
    return a + LmpyLL(x, y);
}

#endif /* INT_MATH_H */
The problem occurs with icl 13.1.x integrated into MSVS 2010 or 2012, on a 64-bit Windows 7 machine. The compiler is set to build Intel 64 targets, and multi-file interprocedural optimization is enabled (/Qipo).
Steps to reproduce the issue
Unzip the attached project.
Open ConsoleApplication1.sln with VS2012 and build the Release configuration.
Run the executable: x64\Release>ConsoleApplication1.exe
The output is:
in[0]: 0.760045
in[1]: 0.760045
-------------------
out[0]: -0.319917
out[1]: -0.319917
This is clearly wrong: x^(-0.5) must be positive, and the expected output is 0.760045^(-0.5) / 2 = 0.573522.
Ways to mitigate the issue:
1. Define ICL_WORKAROUND in LreciprtL.c.
2. Set LEN to 1 in main.c.
3. Turn off multi-file optimization in the IDE settings (set interprocedural optimization to Single-file, /Qip).
4. Use #pragma optimize("", off) and #pragma optimize("", on) around the function LreciprtL() in LreciprtL.c to turn off optimization for it.
Any one of the four mitigations above gives the correct result:
in[0]: 0.760045
in[1]: 0.760045
-------------------
out[0]: 0.573522
out[1]: 0.573522