Skip to content

Instantly share code, notes, and snippets.

@nmoinvaz
Last active January 26, 2026 02:14
Show Gist options
  • Select an option

  • Save nmoinvaz/f354c3c8a372bbf88b53c668f616c540 to your computer and use it in GitHub Desktop.

Select an option

Save nmoinvaz/f354c3c8a372bbf88b53c668f616c540 to your computer and use it in GitHub Desktop.
Benchmark memory tail copying
/* benchmark_tailcopy.cc -- benchmark different copy strategies for tail handling
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <benchmark/benchmark.h>
#include <cstring>
#include <cstdint>
extern "C" {
# include "zbuild.h"
# include "zutil.h"
}
/* Z_RESTRICT: no-alias qualifier applied to pointer parameters.
 * It is only defined here when nothing included earlier has already provided
 * it, so an existing definition is never redefined. MSVC spells the keyword
 * __restrict; every other compiler gets the __restrict__ spelling. */
#ifndef Z_RESTRICT
#  if defined(_MSC_VER) && !defined(__clang__)
#    define Z_RESTRICT __restrict
#  else
#    define Z_RESTRICT __restrict__
#  endif
#endif

// Strategy 1: Byte-by-byte loop with restrict-qualified pointers. The qualifier
// promises the compiler that dst and src never overlap, which may allow it to
// auto-vectorize the loop.
//   dst - destination; must have room for len bytes and must not overlap src
//   src - source; must provide at least len readable bytes
//   len - number of bytes to copy (0 copies nothing)
static void copy_loop_restrict(uint8_t* Z_RESTRICT dst, const uint8_t* Z_RESTRICT src, size_t len) {
    for (; len != 0; --len) {
        *dst = *src;
        ++dst;
        ++src;
    }
}
// Strategy 2: Byte-by-byte loop with plain (non-restrict) pointers, so the
// compiler is given no promise that dst and src do not overlap.
//   dst - destination; must have room for len bytes
//   src - source; must provide at least len readable bytes
//   len - number of bytes to copy (0 copies nothing)
static void copy_loop_no_restrict(uint8_t* dst, const uint8_t* src, size_t len) {
    for (; len != 0; --len) {
        *dst = *src;
        ++dst;
        ++src;
    }
}
// Strategy 3: Fixed-size memcpy piecemeal (16/8/4/2/1).
// Whole 16-byte chunks are copied first; the remaining 0..15 bytes are then
// handled by at most one 8-, 4-, 2- and 1-byte copy each, selected by the low
// four bits of len. Every memcpy has a compile-time-constant size.
//   dst - destination; must have room for len bytes
//   src - source; must provide at least len readable bytes
//   len - number of bytes to copy (0 copies nothing)
static void copy_piecemeal_16(uint8_t* dst, const uint8_t* src, size_t len) {
    size_t pos = 0;

    /* Bulk phase: everything up to the last multiple of 16. */
    for (const size_t bulk = len & ~static_cast<size_t>(15); pos != bulk; pos += 16) {
        memcpy(dst + pos, src + pos, 16);
    }

    /* Tail phase: the low bits of len are untouched by the bulk phase, so
     * they can be tested on len directly. */
    if (len & 8) {
        memcpy(dst + pos, src + pos, 8);
        pos += 8;
    }
    if (len & 4) {
        memcpy(dst + pos, src + pos, 4);
        pos += 4;
    }
    if (len & 2) {
        memcpy(dst + pos, src + pos, 2);
        pos += 2;
    }
    if (len & 1) {
        dst[pos] = src[pos];
    }
}
// Strategy 4: Fixed-size memcpy piecemeal (8/4/2/1).
// Whole 8-byte chunks are copied first; the remaining 0..7 bytes are then
// handled by at most one 4-, 2- and 1-byte copy each, selected by the low
// three bits of len. Every memcpy has a compile-time-constant size.
//   dst - destination; must have room for len bytes
//   src - source; must provide at least len readable bytes
//   len - number of bytes to copy (0 copies nothing)
static void copy_piecemeal_8(uint8_t* dst, const uint8_t* src, size_t len) {
    size_t pos = 0;

    /* Bulk phase: everything up to the last multiple of 8. */
    for (const size_t bulk = len & ~static_cast<size_t>(7); pos != bulk; pos += 8) {
        memcpy(dst + pos, src + pos, 8);
    }

    /* Tail phase: the low bits of len are untouched by the bulk phase, so
     * they can be tested on len directly. */
    if (len & 4) {
        memcpy(dst + pos, src + pos, 4);
        pos += 4;
    }
    if (len & 2) {
        memcpy(dst + pos, src + pos, 2);
        pos += 2;
    }
    if (len & 1) {
        dst[pos] = src[pos];
    }
}
// Strategy 5: Single memcpy.
// Hands the whole copy to one memcpy call with a run-time length.
//   dst - destination; must have room for len bytes
//   src - source; must provide at least len readable bytes
//   len - number of bytes to copy
static void copy_single_memcpy(uint8_t* dst, const uint8_t* src, size_t len) {
    /* memcpy's return value (dst) is not needed here. */
    (void)memcpy(dst, src, len);
}
class tailcopy : public benchmark::Fixture {
private:
uint8_t *srcbuf;
uint8_t *dstbuf;
public:
void SetUp(::benchmark::State& state) {
srcbuf = (uint8_t *)zng_alloc_aligned(128, 64);
dstbuf = (uint8_t *)zng_alloc_aligned(128, 64);
if (srcbuf == NULL || dstbuf == NULL) {
state.SkipWithError("malloc failed");
return;
}
for (int i = 0; i < 128; i++) {
srcbuf[i] = (uint8_t)(rand() & 0xff);
}
}
void TearDown(const ::benchmark::State&) {
zng_free_aligned(srcbuf);
zng_free_aligned(dstbuf);
}
void BenchLoopRestrict(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
copy_loop_restrict(dstbuf, srcbuf, len);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchLoopNoRestrict(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
copy_loop_no_restrict(dstbuf, srcbuf, len);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchPiecemeal16(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
copy_piecemeal_16(dstbuf, srcbuf, len);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchPiecemeal8(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
copy_piecemeal_8(dstbuf, srcbuf, len);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchMemcpy(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
copy_single_memcpy(dstbuf, srcbuf, len);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
};
/* Defines the benchmark case tailcopy/<name>, whose body forwards to the
 * fixture method <method>, and registers it for copy lengths 7, 15, 31 and 63. */
#define BENCHMARK_TAILCOPY(name, method) \
    BENCHMARK_DEFINE_F(tailcopy, name)(benchmark::State& state) { \
        method(state); \
    } \
    BENCHMARK_REGISTER_F(tailcopy, name)->Arg(7)->Arg(15)->Arg(31)->Arg(63)

BENCHMARK_TAILCOPY(loop_restrict, BenchLoopRestrict);
BENCHMARK_TAILCOPY(loop_no_restrict, BenchLoopNoRestrict);
BENCHMARK_TAILCOPY(piecemeal_16, BenchPiecemeal16);
BENCHMARK_TAILCOPY(piecemeal_8, BenchPiecemeal8);
BENCHMARK_TAILCOPY(memcpy, BenchMemcpy);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment