Quantcast
Channel: Intel® C++ Compiler
Viewing all articles
Browse latest Browse all 1616

Why doesn't the c++ compiler generate non-temporal moves in this example?

$
0
0

I'm running this benchmark with the Intel C++ Compiler 2016, using the command line "icc -std=c++11 -xHost -O3 main.cpp":

#include <algorithm>
#include <chrono>
#include <iostream>
#include <random>
#include <type_traits>
#include <vector>

// Clock used for the timings: high_resolution_clock when it is steady,
// otherwise steady_clock, so the selected clock is always a steady one.
using Clock = std::conditional<!std::chrono::high_resolution_clock::is_steady,
                               std::chrono::steady_clock,
                               std::chrono::high_resolution_clock>::type;

// Unit in which elapsed times are reported.
using Scale = std::chrono::nanoseconds;

// Element type processed by the kernels.
using Scalar = double;

// Number of elements in every benchmark array.
const int N = 1000000;

// Mersenne Twister engine with a fixed seed of 0.
std::mt19937 rng{0};

// Uniform real distribution constructed with bounds -1 and 1.
std::uniform_real_distribution<Scalar> dist(Scalar(-1), Scalar(1));

void kernel1(const std::vector<Scalar>& x, std::vector<Scalar>& y) {
#pragma simd
  for (int i = 0; i < N; ++i) y[i] = x[i] * x[i];
}

void kernel2(const Scalar* __restrict x, Scalar* __restrict y) {
#pragma simd
  for (int i = 0; i < N; ++i) y[i] = x[i] * x[i];
}

void kernel3(const Scalar* x, Scalar* y) {
#pragma simd
  for (int i = 0; i < N; ++i) y[i] = x[i] * x[i];
}

void bench() {
  std::vector<Scalar> x(N), y(N), z(N);
  std::generate_n(x.begin(), N, [] { return dist(rng); });

  auto t0 = Clock::now();
  kernel1(x, z);
  auto t1 = Clock::now();
  kernel2(x.data(), y.data());
  auto t2 = Clock::now();
  kernel3(x.data(), y.data());
  auto t3 = Clock::now();

  auto dur1 = std::chrono::duration_cast<Scale>(t1 - t0).count();
  auto dur2 = std::chrono::duration_cast<Scale>(t2 - t1).count();
  auto dur3 = std::chrono::duration_cast<Scale>(t3 - t2).count();

  std::cout << "kernel1: "<< dur1 << "\n";
  std::cout << "kernel2: "<< dur2 << "\n";
  std::cout << "kernel3: "<< dur3 << "\n";
}

// Program entry point: runs the benchmark once and exits with status 0.
int main() {
  bench();
}

 

The results I'm getting (in nanoseconds) are:

kernel1: 532663

kernel2: 486459

kernel3: 967921

 

So kernel3 runs approximately twice as slowly as the other two. Inspecting the assembly, the reason appears to be that kernel3 doesn't use the non-temporal move instruction "movntpd" to store the results back to memory.

 

Can anyone explain why the compiler does this, and why adding "__restrict" to the pointers causes it to use movntpd? Are there any other ways to tell the compiler to use non-temporal moves when working with pointers?


Viewing all articles
Browse latest Browse all 1616

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>