Hello,
I am running a simple program to test the vectorization optimization of Intel compilers.
I am comparing C++ and Fortran for this.
C++ Code (test.cpp):
#include <iostream>
#include <ctime>

// Vectorization micro-benchmark.
//
// Reads two floats (iniA, iniB) from standard input, fills two Nx-by-Ny
// arrays A and B with them, then evaluates five element-wise linear
// combinations of A and B 2000 times. Prints the CPU time of the timed
// region ("T: ") and element [0][0] of each result array ("Res: ").
//
// Returns 0 on success, 1 if the two input values cannot be read.
int main()
{
    const int Nx = 1500, Ny = 800;  // array extents; the second index (j) is contiguous in memory
    const int steps = 2000;         // repetitions of the timed kernel

    // Static storage instead of automatic storage: the seven arrays total
    // 7 * 1500 * 800 * sizeof(float) ~= 33.6 MB, which overflows the stack
    // under common default limits (e.g. 8 MB) when declared as locals.
    static float Q[Nx][Ny], Q0[Nx][Ny], Q1[Nx][Ny], Q2[Nx][Ny], Q3[Nx][Ny];
    static float A[Nx][Ny];
    static float B[Nx][Ny];

    // The inputs come from stdin so the compiler cannot constant-fold the kernel.
    float iniA, iniB;
    if (!(std::cin >> iniA >> iniB)) {
        std::cerr << "error: expected two floating-point values (iniA, iniB) on standard input\n";
        return 1;
    }

    // Initialise every array before the timed region so that all of the
    // memory has already been written once when timing starts.
    for (int i = 0; i < Nx; i++) {
        for (int j = 0; j < Ny; j++) {
            Q[i][j]  = 0.0f;
            Q0[i][j] = 0.0f;
            Q1[i][j] = 0.0f;
            Q2[i][j] = 0.0f;
            Q3[i][j] = 0.0f;
            A[i][j]  = iniA;
            B[i][j]  = iniB;
        }
    }

    const clock_t t1 = clock();
    for (int t = 0; t < steps; t++) {
        for (int i = 0; i < Nx; i++) {
            // Innermost loop runs over the contiguous index j.
            for (int j = 0; j < Ny; j++) {
                Q[i][j]  = 2.0f*A[i][j]  + 3.0f*B[i][j];
                Q0[i][j] = 2.0f*A[i][j]  - 3.0f*B[i][j];
                Q1[i][j] = 4.0f*A[i][j]  - 3.0f*B[i][j];
                Q2[i][j] = 8.0f*A[i][j]  + 3.0f*B[i][j];
                Q3[i][j] = 26.0f*A[i][j] - 3.0f*B[i][j];
            }
        }
    }
    const clock_t t2 = clock();

    // clock() measures processor time, so this is CPU seconds, not wall time.
    std::cout << "T: " << 1.0f*(t2 - t1)/CLOCKS_PER_SEC << '\n';

    // Print one element of each result so the kernel's stores are observable.
    // Values are separated by a space; with empty separators the five numbers
    // ran together into a single unreadable token.
    std::cout << "Res: " << Q[0][0] << " " << Q0[0][0] << " " << Q1[0][0]
              << " " << Q2[0][0] << " " << Q3[0][0] << '\n';

    return 0;
}
Fortran Code (test.f90):
! Vectorization micro-benchmark.
!
! Reads two reals (iniA, iniB) from standard input, fills two Nx-by-Ny
! arrays A and B with them, then evaluates five element-wise linear
! combinations of A and B 2000 times. Prints the CPU time of the timed
! region ("T: ") and element (1,1) of each result array ("Res: ").
PROGRAM test
  IMPLICIT NONE

  ! Array extents are compile-time constants.
  INTEGER, PARAMETER :: Nx = 1500, Ny = 800
  INTEGER :: i, j, t
  REAL, DIMENSION(:,:), ALLOCATABLE :: Q, Q0, Q1, Q2, Q3, A, B
  REAL :: T1, T2, iniA, iniB

  ! The inputs come from stdin so the compiler cannot constant-fold the kernel.
  READ(*,*) iniA
  READ(*,*) iniB

  ALLOCATE(Q(Nx,Ny), Q0(Nx,Ny), Q1(Nx,Ny), Q2(Nx,Ny), Q3(Nx,Ny))
  ALLOCATE(A(Nx,Ny), B(Nx,Ny))

  ! Initialise every array before the timed region.
  ! The first index (i) is the contiguous one, so it is the innermost loop.
  DO j = 1, Ny
    DO i = 1, Nx
      Q(i,j)  = 0.0
      Q0(i,j) = 0.0
      Q1(i,j) = 0.0
      Q2(i,j) = 0.0
      Q3(i,j) = 0.0
      A(i,j)  = iniA
      B(i,j)  = iniB
    ENDDO
  ENDDO

  CALL CPU_TIME(T1)
  DO t = 1, 2000
    DO j = 1, Ny
      DO i = 1, Nx
        Q(i,j)  = 2.0*A(i,j)  + 3.0*B(i,j)
        Q0(i,j) = 2.0*A(i,j)  - 3.0*B(i,j)
        Q1(i,j) = 4.0*A(i,j)  - 3.0*B(i,j)
        Q2(i,j) = 8.0*A(i,j)  + 3.0*B(i,j)
        Q3(i,j) = 26.0*A(i,j) - 3.0*B(i,j)
      ENDDO
    ENDDO
  ENDDO
  CALL CPU_TIME(T2)

  ! CPU_TIME reports processor time in seconds.
  WRITE(*,*) "T: ", 1.0*(T2-T1)
  ! Print one element of each result so the kernel's stores are observable.
  WRITE(*,*) "Res: ", Q(1,1), Q0(1,1), Q1(1,1), Q2(1,1), Q3(1,1)
END PROGRAM test
These two programs are compiled with and without vectorization, using -O3 optimization.
icpc -O3 -vec-report2 test.cpp ; icpc -O3 -no-vec test.cpp ; ifort -O3 -vec-report2 test.f90 ; ifort -O3 -no-vec test.f90
These 4 programs were run on an Intel X7560, and the results are:
Fortran No Vectorization : 9.5s
Fortran Vectorized : 5.8s
C++ No Vectorization : 7.1s
C++ Vectorized : 40.8s
Vectorization in C++ increases the computation time by more than 400% (from 7.1 s to 40.8 s). If I look at the vectorization report, I see that the inner loop (l.34) was not vectorized but the outer loop (l.33) was.
test.cpp(18): (col. 5) remark: LOOP WAS VECTORIZED
test.cpp(33): (col. 2) remark: LOOP WAS VECTORIZED
test.cpp(31): (col. 5) remark: loop was not vectorized: not inner loop
I don't understand this automatic choice: the i-loop is not contiguous in memory! I tried to force the vectorization using the SIMD pragma (#pragma simd), but the outer loop is always the one that gets vectorized... The problem does not occur with Fortran (the innermost loop was vectorized).
I did not find any example combining multidimensional arrays and vectorization on the internet or on the Intel website. I don't even know if it is possible in C++ (at least it works in Fortran...). Do you have any solution to this problem?
Thank you
P.S.: I am working on cloud computing (fluid mechanical engineering) and I am trying to optimize my code.