I'm running the benchmark below with the Intel C++ compiler 2016, compiled as "icc -std=c++11 -xHost -O3 main.cpp":
#include <chrono>
#include <random>
#include <iostream>
#include <algorithm>
#include <vector>

// Prefer high_resolution_clock, falling back to steady_clock if it isn't steady.
using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
                               std::chrono::high_resolution_clock,
                               std::chrono::steady_clock>::type;
using Scale = std::chrono::nanoseconds;

using Scalar = double;
auto rng = std::mt19937{0};
std::uniform_real_distribution<Scalar> dist(-1, 1);
const int N = 1000000;

// Arrays passed as vectors.
void kernel1(const std::vector<Scalar>& x, std::vector<Scalar>& y) {
#pragma simd
    for (int i = 0; i < N; ++i)
        y[i] = x[i] * x[i];
}

// Arrays passed as raw pointers qualified with __restrict.
void kernel2(const Scalar* __restrict x, Scalar* __restrict y) {
#pragma simd
    for (int i = 0; i < N; ++i)
        y[i] = x[i] * x[i];
}

// Arrays passed as plain raw pointers.
void kernel3(const Scalar* x, Scalar* y) {
#pragma simd
    for (int i = 0; i < N; ++i)
        y[i] = x[i] * x[i];
}

void bench() {
    std::vector<Scalar> x(N), y(N), z(N);
    std::generate_n(x.begin(), N, [] { return dist(rng); });

    auto t0 = Clock::now();
    kernel1(x, z);
    auto t1 = Clock::now();
    kernel2(x.data(), y.data());
    auto t2 = Clock::now();
    kernel3(x.data(), y.data());
    auto t3 = Clock::now();

    auto dur1 = std::chrono::duration_cast<Scale>(t1 - t0).count();
    auto dur2 = std::chrono::duration_cast<Scale>(t2 - t1).count();
    auto dur3 = std::chrono::duration_cast<Scale>(t3 - t2).count();

    std::cout << "kernel1: " << dur1 << "\n";
    std::cout << "kernel2: " << dur2 << "\n";
    std::cout << "kernel3: " << dur3 << "\n";
}

int main() {
    bench();
    return 0;
}
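(For reference, the assembly I mention below can be dumped with "icc -std=c++11 -xHost -O3 -S main.cpp" and the vectorized loop bodies of the three kernels compared.)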
The results I'm getting (times in nanoseconds) are:
kernel1: 532663
kernel2: 486459
kernel3: 967921
So kernel3 runs roughly twice as slow as the other two. Inspecting the assembly, the reason appears to be that kernel3 does not use the non-temporal move instruction "movntpd" to store the results back to memory.
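My suspicion is aliasing: without "__restrict", the compiler has to allow for calls where the input and output ranges overlap, so each store to y[i] could feed a later read of x. A minimal sketch of the kind of call I believe the compiler must assume is legal (buf is my own hypothetical name):

std::vector<Scalar> buf(N + 1);
// x and y overlap shifted by one element: the store to y[i]
// (= buf[i + 1]) is read back as x[i + 1] on the next iteration,
// creating a loop-carried dependency.
kernel3(buf.data(), buf.data() + 1);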
Can anyone explain why the compiler does this? Why does adding "__restrict" to the pointers cause it to use movntpd? And are there any other ways to tell the compiler to use non-temporal moves when working with pointers?
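One thing I came across in the Intel documentation is "#pragma vector nontemporal", which is supposed to request streaming stores. A sketch of how I'd expect it to apply here (untested, so I may be misreading the docs):

void kernel4(const Scalar* x, Scalar* y) {
// Hint to icc to use non-temporal (streaming) stores for this loop.
#pragma vector nontemporal
    for (int i = 0; i < N; ++i)
        y[i] = x[i] * x[i];
}

Is this the intended way to get movntpd without "__restrict", or does the aliasing assumption still block it?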