Function API profiling

29 Nov 2024

[ c++  performance  design  ]


Compiler flags

Return std::unique_ptr

#include <memory>

// Returns an empty std::unique_ptr<int> by value.
std::unique_ptr<int> value_ptr() {
    return nullptr;
}

// Writes the result through an output parameter: `dst` is reset to empty.
void output_ptr(std::unique_ptr<int>& dst) {
    dst = nullptr;
}

Return std::unique_ptr : call site

#include <memory>

std::unique_ptr<int> value_ptr();
void output_ptr(std::unique_ptr<int>& dst);

// Call site of the by-value API: takes the pointer returned by value_ptr()
// and dereferences it. There is no null check, so value_ptr() must hand
// back a non-empty pointer for this to be well-defined.
int value_ptr_call() {
    auto ptr = value_ptr();
    return *ptr;
}

int output_ptr_call() {
    std::unique_ptr<int> ptr;
    return *ptr;

Return by value vs output parameter

#include <memory>

// Returns an empty std::unique_ptr<int> by value.
std::unique_ptr<int> value_ptr() {
    return nullptr;
}

// Writes the result through an output parameter: `dst` is reset to empty.
void output_ptr(std::unique_ptr<int>& dst) {
    dst = nullptr;
}

// Calls value_ptr() and returns the raw pointer it holds.
// The owning unique_ptr is a local and is destroyed on return, so the
// returned pointer does not own anything and dangles if it was non-null.
int* value_ptr_call() {
    auto ptr = value_ptr();
    return ptr.get();
}

int* output_ptr_call() {
    std::unique_ptr<int> ptr;
    return ptr.get();

static void return_by_value(benchmark::State& state) {
  for (auto _ : state) {

static void output_parameter(benchmark::State& state) {
  for (auto _ : state) {

Returning a pointer

#include <memory>

// Returns a null raw pointer.
int* raw_ptr() {
    return nullptr;
}

// Returns an empty std::unique_ptr<int>.
std::unique_ptr<int> smart_ptr() {
    return nullptr;
}

Return in register vs return in memory

#include <memory>

// Returns a null raw pointer.
int* raw_ptr() {
    return nullptr;
}

// Returns an empty std::unique_ptr<int>.
std::unique_ptr<int> smart_ptr() {
    return nullptr;
}

static void return_in_register(benchmark::State& state) {
  for (auto _ : state) {

static void return_in_memory(benchmark::State& state) {
  for (auto _ : state) {

Wrapper over int

Wrapper over int : no custom copy and move

// Thin wrapper around a single int with a user-provided constructor and
// destructor (and no user-declared copy or move operations).
struct INT {
    int value;

    // Implicitly converts from int; defaults to 0.
    INT(int value = 0) : value{value} {}
    ~INT() {}
};

// Returns the constant 60 as a plain int.
int int_seconds() {
    return 60;
}

// Returns the constant 60 wrapped in INT (via INT's converting constructor).
INT INT_seconds() {
    return 60;
}

Wrapper over int : no custom constructor

// Thin wrapper around a single int with no user-declared special members;
// `value` defaults to 0.
struct INT {
    int value = 0;
};

// Returns the constant 60 as a plain int.
int int_seconds() {
    return 60;
}

// Returns the constant 60 wrapped in INT.
INT INT_seconds() {
    return INT{60};
}


#include <chrono>

// Returns the constant 60 as a 64-bit integer.
int64_t int_seconds() {
    return 60;
}

// Returns a duration of 60 seconds as std::chrono::seconds.
std::chrono::seconds chrono_seconds() {
    return std::chrono::seconds{60};
}

static_assert(std::is_same_v<int64_t, std::chrono::seconds::rep>);

std::pair and std::tuple

#include <utility>
#include <tuple>

// Returns x + y, with both operands passed as separate ints.
int ints_sum(int x, int y) {
    return x + y;
}

// Returns the sum of both members of a std::pair<int, int> passed by value.
int pair_sum(std::pair<int, int> p) {
    return p.first + p.second;
}

// Returns the sum of both elements of a std::tuple<int, int> passed by value.
int tuple_sum(std::tuple<int, int> t) {
    return std::get<0>(t) + std::get<1>(t);
}

// static_assert(std::is_trivially_copy_constructible_v<std::pair<int, int>>);
// static_assert(std::is_trivially_copyable_v<std::pair<int, int>>);

// static_assert(std::is_trivially_move_constructible_v<std::tuple<int, int>>);
// static_assert(std::is_trivially_copy_constructible_v<std::tuple<int, int>>);
// static_assert(std::is_trivially_destructible_v<std::tuple<int, int>>);

RVO : inserting a function result into container

#include <optional>

// Type with user-declared move assignment, copy construction and copy
// assignment. The members are declared only; their definitions are not
// part of this snippet.
struct large {
    large& operator=(large&&);
    large(large const&);
    large& operator=(large const&);
};
large make_large();

// Builds a std::optional<large> directly from the result of make_large().
std::optional<large> optional_large() {
    return std::optional<large>{make_large()};
}

Lazy evaluation with ac::lazy

#include <optional>

// Type with user-declared move assignment, copy construction and copy
// assignment. The members are declared only; their definitions are not
// part of this snippet.
struct large {
    large& operator=(large&&);
    large(large const&);
    large& operator=(large const&);
};
large make_large();

// Wraps a callable and defers invoking it until the wrapper is converted
// to the callable's result type.
template<class Function>
struct lazy {
    // Invokes the stored callable and yields its result.
    operator std::invoke_result_t<Function>() {
        return function();
    }

    Function function;
};

// Deduction guide so that `lazy{f}` picks the template argument from `f`.
template<class Function>
lazy(Function&&) -> lazy<Function>;

// Builds a std::optional<large> from a lazy wrapper around make_large, so
// make_large is invoked through lazy's conversion operator.
std::optional<large> lazy_optional_large() {
    return std::optional<large>{lazy{make_large}};
}

Pass by value vs pass by reference

#include <memory>

// Returns true when x equals 0; x is passed by value.
bool value_is_zero(int x) {
    return x == 0;
}

// Returns true when x equals 0; x is passed by const reference.
bool ref_is_zero(int const& x) {
    return x == 0;
}

// Calls value_is_zero() with the literal 1 and returns its result.
bool value_is_zero_call() {
    return value_is_zero(1);
}

// Calls ref_is_zero() with the literal 1 (bound to a temporary) and
// returns its result.
bool ref_is_zero_call() {
    return ref_is_zero(1);
}

static void pass_by_value(benchmark::State& state) {
  for (auto _ : state) {

static void pass_by_reference(benchmark::State& state) {
  for (auto _ : state) {

Int parameter

// Returns true when x equals 0; x is passed by value.
bool value_is_zero(int x) {
    return x == 0;
}

// Returns true when x equals 0; x is passed by const reference.
bool ref_is_zero(int const& x) {
    return x == 0;
}

Int parameter : extra function

bool value_is_zero(int x);
bool ref_is_zero(int const& x);

// Calls value_is_zero() with the literal 1 and returns its result.
bool value_is_zero_call() {
    return value_is_zero(1);
}

// Calls ref_is_zero() with the literal 1 (bound to a temporary) and
// returns its result.
bool ref_is_zero_call() {
    return ref_is_zero(1);
}
void some_extra_function();

bool value_extra_function(int x) {
    int const copy = x;
    return copy == x;

bool ref_extra_function(int const& x) {
    int const copy = x;
    return copy == x;

std::mdspan vs raw pointer and sizes

#include <cstddef>

// Returns the last element of the array [ptr, ptr + size).
// There is no bounds check: size must be at least 1.
int raw_back(int const* ptr, size_t size) {
    return ptr[size - 1];
}

// Minimal pointer-plus-size aggregate.
template<class T>
struct Span {
    T* ptr;
    size_t size;
};

// Returns the last element addressed by `span`.
// There is no bounds check: span.size must be at least 1.
int span_back(Span<int const> span) {
    return span.ptr[span.size - 1];
}


// Returns the element at index width * height - 1 of the array at `ptr`.
// There is no bounds check: width * height must be at least 1.
int raw_back2(int const* ptr, size_t width, size_t height) {
    return ptr[width * height - 1];
}

// Aggregate bundling a pointer to const int with a width and a height.
struct mdspan2 {
    int const* ptr;
    size_t width;
    size_t height;
};

// Returns the element at index width * height - 1 of the data in `span`.
// There is no bounds check: width * height must be at least 1.
int mdspan_back2(mdspan2 span) {
    return span.ptr[span.width * span.height - 1];
}

Empty parameter : tag dispatch

int raw_rand();

struct mt19937 {};
int tagged_rand(mt19937);

// Forwards to raw_rand(), which takes no arguments.
int raw_rand_call() {
    return raw_rand();
}

// Forwards to tagged_rand(), passing a default-constructed empty
// mt19937 tag object.
int tagged_rand_call() {
    return tagged_rand(mt19937{});
}

Chain of function calls

int sum(int x1, int x2);

// Combines x1 and x2 with sum(), then combines that result with x3.
int sum_12_3(int x1, int x2, int x3) {
    return sum(sum(x1, x2), x3);
}
// Combines x1 and x3 with sum(), then combines that result with x2.
int sum_13_2(int x1, int x2, int x3) {
    return sum(sum(x1, x3), x2);
}
// Combines x2 and x3 with sum(), then combines that result with x1.
int sum_23_1(int x1, int x2, int x3) {
    return sum(sum(x2, x3), x1);
}
// Combines x2 and x1 (in that order) with sum(), then combines that
// result with x3.
int sum_21_3(int x1, int x2, int x3) {
    return sum(sum(x2, x1), x3);
}

Copy of a byte span: call site

void raw_copy(std::byte* dst, std::byte const* src, size_t size);
// Declaration of the copy variant that receives the sizes of both the
// destination and the source buffers.
void checked_copy(
    std::byte* dst, std::byte const* src, size_t dst_size, size_t src_size
);
std::array<std::byte, 8> arr;

void raw_copy_call() {
    raw_copy(,, 8);
void checked_copy_call() {
    checked_copy(,, 8, 8);


Most important guidelines to avoid function call overhead
