Ok, so this is the code:
/// Integral processing. MNOALIAS MFORCEINLINE ValueType GetFrom01Cubic_Integral(MUINT32 posf) const { // Get the new wavetable index and interpolation coefficient. const int index = posf >> Integral_FloatBits; static const float Integral_Coef = float(1.0 / (1 << Integral_FloatBits)); const float x = float(int(posf & (MFloatINT32::Max >> PrecomputeCountLog)) * Integral_Coef); // Get output sample. const float y0 = PrecomputedPtr[index-1]; const float y1 = PrecomputedPtr[index+0]; const float y2 = PrecomputedPtr[index+1]; const float y3 = PrecomputedPtr[index+2]; return MInterpolation::GetCubic(y0, y1, y2, y3, x); // For some reason this doesn't get vectorized... //return MInterpolation::GetCubic(PrecomputedPtr[index-1], PrecomputedPtr[index+0], PrecomputedPtr[index+1], PrecomputedPtr[index+2], x); }; /// Retrieves multiple samples of the generator at specified positions from interval 0..1 (even excluding the range). /// Returns new position after the interval. template<class type> MNOALIAS double GetFrom01Cubic(MNOALIAS type* dst, double pos, double posinc, int cnt) const { MFloatINT32 posf = pos, posincf = posinc; MVECTORALWAYS for (int i=0; i<cnt; i++) { dst[i] = GetFrom01Cubic_Integral(posf); posf += posincf; }; // Get new position. pos = posf.GetFloat(); return pos; };
And this is where it crashed:
00007FFEF307E151 vpaddd xmm3,xmm3,xmm1
00007FFEF307E155 vmovd r15,xmm13
00007FFEF307E15A mov edi,r11d
00007FFEF307E15D shr r11,20h
00007FFEF307E161 mov r12d,r15d
00007FFEF307E164 shr r15,20h
00007FFEF307E168 vmovd xmm15,dword ptr [rbx+rdi*4+0Ch] <<<<<<<<<<<<<<<<
00007FFEF307E16E vmovd xmm11,dword ptr [rbx+r11*4+0Ch]
00007FFEF307E175 vmovd xmm6,dword ptr [rbx+r12*4+0Ch]
00007FFEF307E17C vmovd xmm14,dword ptr [rbx+r15*4+0Ch]
00007FFEF307E183 vmovd xmm10,dword ptr [rbx+r15*4+8]
00007FFEF307E18A vinsertf128 ymm12,ymm7,xmm8,1
00007FFEF307E190 vcvtdq2ps ymm5,ymm12
00007FFEF307E195 vpunpcklqdq xmm8,xmm15,xmm11
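For a simple reason: rdi = 0xFFFFFFFF, which is indeed -1 as a 32-bit value, but not in a 64-bit environment. The `mov edi,r11d` zero-extends the 32-bit index into rdi instead of sign-extending it, so the load addresses an element about 4 billion entries past the table instead of the one just before it... It worked fine with 2015 Update 3 (or whichever version it was).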
Really, ICC is like a minefield... If MSVC had a good dispatchable vectorizer, there would be no need for ICC at all...