Branchless Programming in C++

Profiling

Google Benchmark https://github.com/google/benchmark
Perf https://perf.wiki.kernel.org/index.php/Main_Page
- perf stat

Summary

For best performance use hardware efficiently
Use all of the hardware all the time (ideal goal)
Processors can do many computations at once every cycle
Limiting factor is usually availability of data
Workaround is pipelining - running multiple instructions streams at once
Limiting factor is conditional code - next instruction is data dependent
Workaround is branch prediction - guess the next instruction and go on
Limiting factor is the ability to guess the future
Workaround is writing unconditional code with data dependencies

References

Branchless Programming in C++ - Fedor Pikus - CppCon 2021
https://perfbench.com/ https://quick-bench.com/q/e8QPdYta3Y2BAN0HUnlezkIOJpk

Code spippets

Group 1
- Sample 1

static void BM_branch_predicted(benchmark::State& state) {
  srand(1);
  const unsigned int N = state.range(0);
  std::vector<unsigned long> v1(N), v2(N);
  std::vector<int> c1(N);
  for(size_t i = 0; i < N; i++) {
    v1[i] = rand();
    v2[i] = rand();
    c1[i] = rand()>=0;
  }
  unsigned long* p1 = v1.data();
  unsigned long* p2 = v2.data();
  int* b1 = c1.data();
  for(auto _ : state) {
    unsigned long a1 = 0, a2 = 0;
    for(size_t i = 0; i < N; i++) {
      if(b1[i]) {
        a1 += p1[i];
      } else {
        a2 *= p2[i];
      }
    }
    benchmark::DoNotOptimize(a1);
    benchmark::DoNotOptimize(a2);
    benchmark::ClobberMemory();
  }
  state.SetItemsProcessed(N*state.iterations());
}
// Register the function as a benchmark
BENCHMARK(BM_branch_predicted)->Arg(1<<22);

Sample 2

static void BM_branch_not_predicted(benchmark::State& state) {
  srand(1);
  const unsigned int N = state.range(0);
  std::vector<unsigned long> v1(N), v2(N);
  std::vector<int> c1(N);
  for(size_t i = 0; i < N; i++) {
    v1[i] = rand();
    v2[i] = rand();
    c1[i] = rand()&0x1;
  }
  unsigned long* p1 = v1.data();
  unsigned long* p2 = v2.data();
  int* b1 = c1.data();
  for(auto _ : state) {
    unsigned long a1 = 0, a2 = 0;
    for(size_t i = 0; i < N; i++) {
 if(b1[i]) {
   a1 += p1[i];
 } else {
   a2 *= p2[i];
 }
    }
    benchmark::DoNotOptimize(a1);
    benchmark::DoNotOptimize(a2);
    benchmark::ClobberMemory();
  }
  state.SetItemsProcessed(N*state.iterations());
}
// Register the function as a benchmark
BENCHMARK(BM_branch_not_predicted)->Arg(1<<22);

Sample 3

static void BM_branch_predict(benchmark::State& state) {
  srand(1);
  const unsigned int N = state.range(0);
  std::vector<unsigned long> v1(N), v2(N);
  std::vector<int> c1(N);
  for(size_t i = 0; i < N; i++) {
    v1[i] = rand();
    v2[i] = rand();
    if (i==0) c1[i] = rand()>=0; else c1[i] = !c1[i-1];
  }
  unsigned long* p1 = v1.data();
  unsigned long* p2 = v2.data();
  int* b1 = c1.data();
  for(auto _ : state) {
    unsigned long a1 = 0, a2 = 0;
    for(size_t i = 0; i < N; i++) {
 if(b1[i]) {
   a1 += p1[i];
 } else {
   a2 *= p2[i];
 }
    }
    benchmark::DoNotOptimize(a1);
    benchmark::DoNotOptimize(a2);
    benchmark::ClobberMemory();
  }
  state.SetItemsProcessed(N*state.iterations());
}
// Register the function as a benchmark
BENCHMARK(BM_branch_predict)->Arg(1<<22);

Group 2

Sample 1

static void BM_false_branch(benchmark::State& state) {
srand(1);
const unsigned int N = state.range(0);
std::vector<unsigned long> v1(N), v2(N);
std::vector<int> c1(N), c2(N);
for(size_t i = 0; i < N; i++) {
  v1[i] = rand();
  v2[i] = rand();
  c1[i] = rand()&0x1;
  c2[i] = !c1[i];
}
unsigned long* p1 = v1.data();
unsigned long* p2 = v2.data();
int* b1 = c1.data();
int* b2 = c2.data();
for(auto _ : state) {
  unsigned long a1 = 0, a2 = 0;
  for(size_t i = 0; i < N; i++) {
 if(b1[i] || b2[i]) {
   a1 += p1[i];
 } else {
   a2 *= p2[i];
 }
  }
  benchmark::DoNotOptimize(a1);
  benchmark::DoNotOptimize(a2);
  benchmark::ClobberMemory();
}
state.SetItemsProcessed(N*state.iterations());
}
// Register the function as a benchmark
BENCHMARK(BM_false_branch)->Arg(1<<22);

Sample 2

static void BM_false_branch2(benchmark::State& state) {
srand(1);
const unsigned int N = state.range(0);
std::vector<unsigned long> v1(N), v2(N);
std::vector<int> c1(N), c2(N);
for(size_t i = 0; i < N; i++) {
  v1[i] = rand();
  v2[i] = rand();
  c1[i] = rand()&0x1;
  c2[i] = !c1[i];
}
unsigned long* p1 = v1.data();
unsigned long* p2 = v2.data();
int* b1 = c1.data();
int* b2 = c2.data();
for(auto _ : state) {
  unsigned long a1 = 0, a2 = 0;
  for(size_t i = 0; i < N; i++) {
 if(bool(b1[i]) + bool(b2[i])) {
   a1 += p1[i];
 } else {
   a2 *= p2[i];
 }
  }
  benchmark::DoNotOptimize(a1);
  benchmark::DoNotOptimize(a2);
  benchmark::ClobberMemory();
}
state.SetItemsProcessed(N*state.iterations());
}
// Register the function as a benchmark
BENCHMARK(BM_false_branch2)->Arg(1<<22);

Group 3

Sample 1

static void BM_branched(benchmark::State& state) {
srand(1);
const unsigned int N = state.range(0);
std::vector<unsigned long> v1(N), v2(N);
std::vector<int> c1(N);
for(size_t i = 0; i < N; i++) {
  v1[i] = rand();
  v2[i] = rand();
  c1[i] = rand()&0x1;
}
unsigned long* p1 = v1.data();
unsigned long* p2 = v2.data();
int* b1 = c1.data();
for(auto _ : state) {
  unsigned long a1 = 0, a2 = 0;
  for(size_t i = 0; i < N; i++) {
 if(b1[i]) {
   a1 += p1[i];
 } else {
   a2 *= p2[i];
 }
  }
  benchmark::DoNotOptimize(a1);
  benchmark::DoNotOptimize(a2);
  benchmark::ClobberMemory();
}
state.SetItemsProcessed(N*state.iterations());
}
// Register the function as a benchmark
BENCHMARK(BM_branched)->Arg(1<<22);

Sample 2

static void BM_branchless(benchmark::State& state) {
srand(1);
const unsigned int N = state.range(0);
std::vector<unsigned long> v1(N), v2(N);
std::vector<int> c1(N);
for(size_t i = 0; i < N; i++) {
  v1[i] = rand();
  v2[i] = rand();
  c1[i] = rand()&0x1;
}
unsigned long* p1 = v1.data();
unsigned long* p2 = v2.data();
int* b1 = c1.data();
for(auto _ : state) {
  unsigned long a1 = 0, a2 = 0;
  for(size_t i = 0; i < N; i++) {
 unsigned long s1[2] = {0, p1[i] - p2[i]};
 unsigned long s2[2] = {p1[i] * p2[i], 0};
 a1 += s1[bool(b1[i])];
 a2 += s2[bool(b1[i])];
  }
  benchmark::DoNotOptimize(a1);
  benchmark::DoNotOptimize(a2);
  benchmark::ClobberMemory();
}
state.SetItemsProcessed(N*state.iterations());
}
// Register the function as a benchmark
BENCHMARK(BM_branchless)->Arg(1<<22);