Dear all, I wrote some code to test FMA instructions. When I compiled it in the Visual Studio 2010 IDE (using the Intel C++ toolset, of course), no FMA instructions were generated. Then I compiled the same code in VS 2015, using the same Intel toolset and the same project configuration, and it generated FMA instructions successfully. I am using Intel Parallel Studio XE 2016 Cluster Edition.
Is there any difference between VS 2010 and VS 2015 that would explain this?
Below is the code:
#include "stdafx.h"
#include <Windows.h>
#include <immintrin.h>
#include <stdio.h>   // printf
#include <stdlib.h>  // system

// Checks that a 256-bit fused multiply-add yields the expected value.
//
// Fills three 8-element float arrays with 0..7, computes a * b + c per lane
// with _mm256_fmadd_ps, and returns true when lane 7 equals 7 * 7 + 7 = 56.
bool fma_test()
{
    float a[8], b[8], c[8];
    for (int i = 0; i < 8; ++i)
    {
        const float v = static_cast<float>(i);
        a[i] = v;
        b[i] = v;
        c[i] = v;
    }

    // Ordinary float arrays are not guaranteed to be 32-byte aligned, which
    // _mm256_load_ps requires; the unaligned loads are valid for any address.
    const __m256 mma = _mm256_loadu_ps(a);
    const __m256 mmb = _mm256_loadu_ps(b);
    const __m256 mmc = _mm256_loadu_ps(c);

    const __m256 ret = _mm256_fmadd_ps(mma, mmb, mmc);

    // Read the lanes back through a store instead of the MSVC-specific
    // m256_f32 member so the check does not depend on one vendor's headers.
    float lanes[8];
    _mm256_storeu_ps(lanes, ret);

    return lanes[7] == 56.0f;
}

// Prints "true" or "false" according to fma_test(), then runs the shell's
// "pause" command before exiting.
int main()
{
    if (fma_test())
    {
        printf("true");
    }
    else
    {
        printf("false");
    }
    system("pause");
    return 0;
}
Below is the disassembly from VS 2010 and VS 2015:
VS2010:
000000013FD30FFC add byte ptr [rax],al
000000013FD30FFE add byte ptr [rax],al
--- D:\sl\XR\MoFangG\xfma\xfma.cpp ---------------------------------------------
return true;
return false;
}
int main()
{
000000013FD31000 sub rsp,78h
000000013FD31004 mov edx,9D9FFEh
000000013FD31009 mov qword ptr [rsp+60h],r13
000000013FD3100E lea r13,[rsp+3Fh]
000000013FD31013 mov ecx,3
000000013FD31018 and r13,0FFFFFFFFFFFFFFE0h
000000013FD3101C mov rax,qword ptr [__security_cookie (13FD36000h)]
000000013FD31023 xor rax,rsp
000000013FD31026 mov qword ptr [rsp+70h],rax
000000013FD3102B call __intel_new_feature_proc_init (13FD318F0h)
000000013FD31030 vstmxcsr dword ptr [rsp+68h]
000000013FD31036 or dword ptr [rsp+68h],8040h
000000013FD3103E vldmxcsr dword ptr [rsp+68h]
// xfma.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <Windows.h>
#include <immintrin.h>
bool fma_test()
{
__m256 mma, mmb, mmc;
float a[8], b[8], c[8];
for (int i = 0; i < 8; ++i)
{
a[i] = i;
000000013FD31044 vcvtdq2ps ymm0,ymmword ptr [__xi_z+30h (13FD33220h)]
b[i] = i;
c[i] = i;
}
mma = _mm256_load_ps(a);
mmb = _mm256_load_ps(b);
mmc = _mm256_load_ps(c);
__m256 ret = _mm256_fmadd_ps(mma, mmb, mmc);
000000013FD3104C db c4h
000000013FD3104D loop main+0CCh (13FD310CCh)
000000013FD3104F test al,0C0h
000000013FD31051 vmovups ymmword ptr [rbp],ymm0
if(ret.m256_f32[7] == 56.0)
000000013FD31057 vmovss xmm1,dword ptr [rbp+1Ch]
000000013FD3105D vucomiss xmm1,dword ptr [__xi_z+50h (13FD33240h)]
000000013FD31065 jp main+69h (13FD31069h)
000000013FD31067 je main+9Fh (13FD3109Fh)
}
else
printf("false");
000000013FD31069 lea rcx,[__xi_z+54h (13FD33244h)]
000000013FD31070 vzeroupper
000000013FD31073 call qword ptr [__imp_printf (13FD33180h)]
system("pause");
000000013FD31079 lea rcx,[__xi_z+5Ah (13FD3324Ah)]
000000013FD31080 call qword ptr [__imp_system (13FD330E0h)]
return 0;
000000013FD31086 mov rcx,qword ptr [rsp+70h]
000000013FD3108B xor rcx,rsp
000000013FD3108E call __security_check_cookie (13FD310D0h)
000000013FD31093 mov r13,qword ptr [rsp+60h]
000000013FD31098 xor eax,eax
000000013FD3109A add rsp,78h
000000013FD3109E ret
if (fma_test())
{
printf("true");
000000013FD3109F lea rcx,[__xi_z+60h (13FD33250h)]
000000013FD310A6 vzeroupper
000000013FD310A9 call qword ptr [__imp_printf (13FD33180h)]
000000013FD310AF jmp main+79h (13FD31079h)
000000013FD310B1 nop dword ptr [rax+rax]
000000013FD310B9 nop dword ptr [rax]
--- No source file -------------------------------------------------------------
000000013FD310C0 int 3
000000013FD310C1 int 3
000000013FD310C2 int 3
000000013FD310C3 int 3
000000013FD310C4 int 3
000000013FD310C5 int 3
000000013FD310C6 nop word ptr [rax+rax]
__security_check_cookie:
000000013FD310D0 cmp rcx,qword ptr [__security_cookie (13FD36000h)]
000000013FD310D7 jne ReportFailure (13FD310EAh)
000000013FD310D9 rol rcx,10h
000000013FD310DD test cx,0FFFFh
000000013FD310E2 jne RestoreRcx (13FD310E6h)
000000013FD310E4 rep ret
RestoreRcx:
000000013FD310E6 ror rcx,10h
ReportFailure:
000000013FD310EA jmp __report_gsfailure (13FD31440h)
000000013FD310EF int 3
__GSHandlerCheckCommon:
000000013FD310F0 push rbx
000000013FD310F2 sub rsp,20h
000000013FD310F6 mov r11d,dword ptr [r8]
000000013FD310F9 mov rbx,rdx
000000013FD310FC mov r9,rcx
000000013FD310FF and r11d,0FFFFFFF8h
000000013FD31103 test byte ptr [r8],4
000000013FD31107 mov r10,rcx
000000013FD3110A je __GSHandlerCheckCommon+2Fh (13FD3111Fh)
000000013FD3110C mov eax,dword ptr [r8+8]
000000013FD31110 movsxd r10,dword ptr [r8+4]
000000013FD31114 neg eax
000000013FD31116 add r10,rcx
000000013FD31119 movsxd rcx,eax
000000013FD3111C and r10,rcx
000000013FD3111F movsxd rax,r11d
000000013FD31122 mov rdx,qword ptr [rax+r10]
000000013FD31126 mov rax,qword ptr [rbx+10h]
000000013FD3112A mov ecx,dword ptr [rax+8]
000000013FD3112D add rcx,qword ptr [rbx+8]
000000013FD31131 test byte ptr [rcx+3],0Fh
000000013FD31135 je __GSHandlerCheckCommon+53h (13FD31143h)
000000013FD31137 movzx eax,byte ptr [rcx+3]
000000013FD3113B and eax,0FFFFFFF0h
000000013FD3113E cdqe
000000013FD31140 add r9,rax
000000013FD31143 xor r9,rdx
000000013FD31146 mov rcx,r9
000000013FD31149 add rsp,20h
000000013FD3114D pop rbx
000000013FD3114E jmp __security_check_cookie (13FD310D0h)
000000013FD31153 int 3
__GSHandlerCheck:
000000013FD31154 sub rsp,28h
000000013FD31158 mov r8,qword ptr [r9+38h]
000000013FD3115C mov rcx,rdx
000000013FD3115F mov rdx,r9
000000013FD31162 call __GSHandlerCheckCommon (13FD310F0h)
000000013FD31167 mov eax,1
000000013FD3116C add rsp,28h
000000013FD31170 ret
000000013FD31171 int 3
000000013FD31172 int 3
000000013FD31173 int 3
--- f:\dd\vctools\crt_bld\self_64_amd64\crt\src\crtexe.c -----------------------
VS2015:
c[i] = i;
000000013F0B1098 vmovaps xmmword ptr [rsp+0B0h],xmm0
}
mma = _mm256_load_ps(a);
000000013F0B10A1 vmovaps ymm1,ymmword ptr [rsp+70h]
mmb = _mm256_load_ps(b);
000000013F0B10A7 vmovaps ymm0,ymmword ptr [rsp+90h]
c[i] = i;
000000013F0B10B0 vmovaps xmmword ptr [rsp+0C0h],xmm5
mmc = _mm256_load_ps(c);
__m256 ret = _mm256_fmadd_ps(mma, mmb, mmc);
000000013F0B10B9 vfmadd213ps ymm1,ymm0,ymmword ptr [rsp+0B0h]
000000013F0B10C3 vmovaps ymmword ptr [r13],ymm1
if(ret.m256_f32[7] == 56.0)
000000013F0B10C9 vmovss xmm2,dword ptr [r13+1Ch]
if (fma_test())
{
printf("true");
}
else
{
printf("false");
000000013F0B10CF vucomiss xmm2,dword ptr [__xt_z+18h (013F0B4290h)]
if (fma_test())