I'm currently experiencing some performance problems with the icpc compiler from the composer_xe_2015 suite.
I attach the very simple driver I have used below. It was compiled with the -O3 flag; the operating system is Suse 2011 SP1
and the computer is a single node of an Intel cluster with two-socket Xeon X5675 processors.
The driver takes nearly 33 seconds to run. If I remove the last two lines
std::set<double> vect;
vect.insert(2.0);
then I get 16 seconds. That seems strange. I have also built the code with gcc (g++ version 4.3.4) and I get 16 seconds as well.
What am I missing? If I reduce the optimization level I do not get any improvement.
Thanks.
Andrea
#include <cmath>
#include <iostream>
#include <time.h>
#include <vector>
#include <set>
using namespace std;
// Benchmark driver. Fills two N x N matrices, times 300 rounds of a naive
// triple-loop matrix product (A <- A * B), prints the elapsed time and the sum
// of all elements of the final A, and finally builds a one-element std::set.
// argc and argv are accepted but never read.
int main(int argc, char *argv[])
{
//Matmul---------------------------------------------------------------------
// Matrix dimension. Each matrix is stored as a vector of N row vectors.
static int N = 300;
vector< vector<double> > A(N), B(N), C(N);
time_t start, end;
// Size every row of A, B and C to N columns.
for(int i=0; i < N; ++i)
{
A[i].resize(N);
B[i].resize(N);
C[i].resize(N);
}
// A and B start out identical: element (i, j) is i*j / N^3.
// The products i*j and N*N*N are computed in int before the conversion/division.
for(int i=0; i < N; ++i)
{
for(int j=0; j < N; ++j)
{
A[i][j] = double(i*j) / (N*N*N);
B[i][j] = double(i*j) / (N*N*N);
}
}
// The start timestamp is taken after "Start " is written and before the line is ended.
cout << "Start "; time(&start); cout << endl;
// Timed section: 300 rounds of C = A * B followed by A = C.
// The round count is a separate literal and is not tied to N.
for(int z=0; z < 300; ++z)
{
for(int i=0; i < N; ++i)
{
for(int j=0; j < N; ++j)
{
C[i][j] = 0.0;
// Dot product of row i of A with column j of B. B[k][j] touches a
// different row vector of B on every step of this innermost loop.
for(int k=0; k < N; ++k)
{ C[i][j] += A[i][k] * B[k][j]; }
}
}
// Copy the product back into A element by element so the next round uses it.
for(int i=0; i < N; ++i)
{
for(int j=0; j < N; ++j)
{ A[i][j] = C[i][j]; }
}
}
// Elapsed time is reported via difftime(end, start), in seconds.
// NOTE(review): time() commonly has one-second granularity, so this figure is
// coarse -- confirm for the target platform.
time(&end); cout << "done ("<< difftime(end, start) << " s)"<< endl << endl;
// Sum every element of the final A and print it, so the result of the timed
// loops is actually consumed by the program's output.
double tot = 0.0;
for(int i=0; i < N; ++i)
{
for(int j=0; j < N; ++j)
{ tot += A[i][j]; }
}
cout << "tot "<< tot << endl;
//Finder
// A std::set<double> is created and a single value inserted; it is never read.
// The report accompanying this driver states that removing these two lines
// changes the measured run time, so they are part of the reproducer and are
// intentionally left in place.
std::set<double> vect;
vect.insert(2.0);
}