Skip to content

Instantly share code, notes, and snippets.

@nmoinvaz
Last active January 26, 2026 03:26
Show Gist options
  • Select an option

  • Save nmoinvaz/68124bf334027eb86d3a2497a819e390 to your computer and use it in GitHub Desktop.

Select an option

Save nmoinvaz/68124bf334027eb86d3a2497a819e390 to your computer and use it in GitHub Desktop.
Benchmark zlib crc32 tail copy
/* benchmark_crc32_tail_copy.cc -- benchmark different copy strategies for CRC32 tail handling
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <benchmark/benchmark.h>
#include <cstring>
#include <cstdint>
extern "C" {
# include "zbuild.h"
# include "zutil.h"
# include "crc32.h"
# include "crc32_braid_tbl.h"
}
#ifdef X86_AVX512
#include <immintrin.h>
#endif
#ifdef _MSC_VER
#define Z_RESTRICT __restrict
#else
#define Z_RESTRICT __restrict__
#endif
/* CRC macros from crc32_p.h */
/* Table-driven (Sarwate) CRC-32 step: fold buf[i] into the running crc `c`
 * using the 256-entry lookup table `crc_table` (from crc32_braid_tbl.h). */
#define CRC_DO1(c, buf, i) c = crc_table[(c ^ buf[i]) & 0xff] ^ (c >> 8)
/* Wider variants unroll CRC_DO1 over 2/4/8/16/32 consecutive bytes starting
 * at buf[i]; none of them advance `buf` — callers do that explicitly. */
#define CRC_DO2(c, buf, i) {CRC_DO1(c, buf, i); CRC_DO1(c, buf, i+1);}
#define CRC_DO4(c, buf, i) {CRC_DO2(c, buf, i); CRC_DO2(c, buf, i+2);}
#define CRC_DO8(c, buf, i) {CRC_DO4(c, buf, i); CRC_DO4(c, buf, i+4);}
#define CRC_DO16(c, buf, i) {CRC_DO8(c, buf, i); CRC_DO8(c, buf, i+8);}
#define CRC_DO32(c, buf, i) {CRC_DO16(c, buf, i); CRC_DO16(c, buf, i+16);}
// Strategy 1: Piecemeal copy with 8/4/2/1 (current crc32_copy_small approach)
// Copies `len` bytes from buf to dst while folding the same bytes into `crc`,
// working in an 8-byte loop and then one 4-, 2- and/or 1-byte tail step.
// Returns the updated CRC. dst and buf must not overlap (memcpy requirement).
static uint32_t crc32_copy_piecemeal(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
/* 8-byte chunks: one fixed-size memcpy + 8 table lookups per iteration */
while (len >= 8) {
memcpy(dst, buf, 8);
dst += 8;
CRC_DO8(crc, buf, 0);
buf += 8;
len -= 8;
}
/* len < 8 now, so its set bits select each remaining tail step exactly once */
if (len & 4) {
memcpy(dst, buf, 4);
dst += 4;
CRC_DO4(crc, buf, 0);
buf += 4;
}
if (len & 2) {
memcpy(dst, buf, 2);
dst += 2;
CRC_DO2(crc, buf, 0);
buf += 2;
}
if (len & 1) {
*dst = *buf;
CRC_DO1(crc, buf, 0);
}
return crc;
}
// Strategy 1b: Piecemeal copy with 16/8/4/2/1
// Same scheme as Strategy 1 but with a 16-byte main loop, so buffers up to
// twice as long per iteration are handled before the bit-tested tail steps.
// Returns the updated CRC. dst and buf must not overlap.
static uint32_t crc32_copy_piecemeal_16(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
/* 16-byte chunks */
while (len >= 16) {
memcpy(dst, buf, 16);
dst += 16;
CRC_DO16(crc, buf, 0);
buf += 16;
len -= 16;
}
/* len < 16: each set bit of len triggers exactly one tail step below */
if (len & 8) {
memcpy(dst, buf, 8);
dst += 8;
CRC_DO8(crc, buf, 0);
buf += 8;
}
if (len & 4) {
memcpy(dst, buf, 4);
dst += 4;
CRC_DO4(crc, buf, 0);
buf += 4;
}
if (len & 2) {
memcpy(dst, buf, 2);
dst += 2;
CRC_DO2(crc, buf, 0);
buf += 2;
}
if (len & 1) {
*dst = *buf;
CRC_DO1(crc, buf, 0);
}
return crc;
}
// Strategy 1c: Piecemeal copy with 32/16/8/4/2/1
// Same scheme again with a 32-byte main loop; the full binary tail chain
// (16 down to 1) handles any remainder. Returns the updated CRC.
// dst and buf must not overlap.
static uint32_t crc32_copy_piecemeal_32(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
/* 32-byte chunks */
while (len >= 32) {
memcpy(dst, buf, 32);
dst += 32;
CRC_DO32(crc, buf, 0);
buf += 32;
len -= 32;
}
/* len < 32: each set bit of len triggers exactly one tail step below */
if (len & 16) {
memcpy(dst, buf, 16);
dst += 16;
CRC_DO16(crc, buf, 0);
buf += 16;
}
if (len & 8) {
memcpy(dst, buf, 8);
dst += 8;
CRC_DO8(crc, buf, 0);
buf += 8;
}
if (len & 4) {
memcpy(dst, buf, 4);
dst += 4;
CRC_DO4(crc, buf, 0);
buf += 4;
}
if (len & 2) {
memcpy(dst, buf, 2);
dst += 2;
CRC_DO2(crc, buf, 0);
buf += 2;
}
if (len & 1) {
*dst = *buf;
CRC_DO1(crc, buf, 0);
}
return crc;
}
// Strategy 1d: Piecemeal copy with 32-byte loop, hybrid if + switch for remainder
// Like Strategy 1c, but the final 0-7 bytes are dispatched through a single
// switch (one branch) instead of three chained ifs — tests whether the jump
// table beats the bit-test chain. Returns the updated CRC; no overlap allowed.
static uint32_t crc32_copy_piecemeal_32_switch(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
/* 32-byte chunks */
while (len >= 32) {
memcpy(dst, buf, 32);
dst += 32;
CRC_DO32(crc, buf, 0);
buf += 32;
len -= 32;
}
/* Handle 16-byte chunk */
if (len & 16) {
memcpy(dst, buf, 16); dst += 16; CRC_DO16(crc, buf, 0); buf += 16;
}
/* Handle 8-byte chunk */
if (len & 8) {
memcpy(dst, buf, 8); dst += 8; CRC_DO8(crc, buf, 0); buf += 8;
}
/* Switch for 0-7 bytes (8 cases); each case spells out its own 4/2/1 steps */
switch (len & 7) {
case 7: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); buf += 4;
memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); buf += 2;
*dst = *buf; CRC_DO1(crc, buf, 0); break;
case 6: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); buf += 4;
memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); break;
case 5: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); buf += 4;
*dst = *buf; CRC_DO1(crc, buf, 0); break;
case 4: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); break;
case 3: memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); buf += 2;
*dst = *buf; CRC_DO1(crc, buf, 0); break;
case 2: memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); break;
case 1: *dst = *buf; CRC_DO1(crc, buf, 0); break;
case 0: break;
}
return crc;
}
// Strategy 2: Single memcpy then CRC
// Copies the whole buffer in one variable-length memcpy, then runs the CRC
// over the source in a separate 8/4/2/1 pass — tests whether splitting the
// copy from the CRC beats interleaving them. Returns the updated CRC.
// dst and buf must not overlap.
static uint32_t crc32_copy_memcpy_first(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
memcpy(dst, buf, len);
/* CRC-only pass over the source, 8 bytes at a time */
while (len >= 8) {
CRC_DO8(crc, buf, 0);
buf += 8;
len -= 8;
}
if (len & 4) {
CRC_DO4(crc, buf, 0);
buf += 4;
}
if (len & 2) {
CRC_DO2(crc, buf, 0);
buf += 2;
}
if (len & 1) {
CRC_DO1(crc, buf, 0);
}
return crc;
}
// Strategy 3: Byte loop for both copy and CRC (no restrict)
// Simplest baseline: one byte copied and CRC'd per iteration. Without
// restrict the compiler must assume dst/buf may alias, which can block
// auto-vectorization of the copy. Returns the updated CRC.
static uint32_t crc32_copy_byte_loop(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
while (len--) {
*dst++ = *buf;
CRC_DO1(crc, buf, 0);
buf++;
}
return crc;
}
// Strategy 3b: Byte loop with restrict (compiler may auto-vectorize copy)
// Identical body to Strategy 3, but the restrict qualifiers promise
// dst/buf do not alias — measures how much that promise is worth.
// Returns the updated CRC.
static uint32_t crc32_copy_loop_restrict(uint32_t crc, uint8_t* Z_RESTRICT dst, const uint8_t* Z_RESTRICT buf, size_t len) {
while (len--) {
*dst++ = *buf;
CRC_DO1(crc, buf, 0);
buf++;
}
return crc;
}
#ifdef X86_AVX512
// Strategy 4: AVX-512 masked copy for remainder, piecemeal CRC
// Main loop as in Strategy 1; the final 0-7 bytes are copied in one
// branch via an AVX-512 byte-masked load/store, while the CRC of those
// bytes is still computed with the 4/2/1 table steps. Returns the updated
// CRC. Requires AVX512VL+BW and BMI2 (_bzhi_u32) at runtime.
static uint32_t crc32_copy_masked(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
/* 8-byte chunks: copy + CRC interleaved */
while (len >= 8) {
memcpy(dst, buf, 8);
dst += 8;
CRC_DO8(crc, buf, 0);
buf += 8;
len -= 8;
}
/* Use AVX-512 masked store for remainder copy (0-7 bytes) */
if (len > 0) {
/* mask has the low `len` bits set; masked-off lanes are neither read nor written */
__mmask8 mask = (__mmask8)_bzhi_u32(0xFF, (unsigned)len);
__m128i chunk = _mm_maskz_loadu_epi8(mask, buf);
_mm_mask_storeu_epi8(dst, mask, chunk);
}
/* CRC of the remainder still goes through the scalar table steps
 * (the masked block above only copied; buf was not advanced there) */
if (len & 4) {
CRC_DO4(crc, buf, 0);
buf += 4;
}
if (len & 2) {
CRC_DO2(crc, buf, 0);
buf += 2;
}
if (len & 1) {
CRC_DO1(crc, buf, 0);
}
return crc;
}
// Strategy 5: AVX-512 masked copy, byte loop for CRC remainder
// Same masked remainder copy as Strategy 4, but the remainder CRC is a
// plain per-byte loop instead of 4/2/1 steps. Returns the updated CRC.
// Requires AVX512VL+BW and BMI2 at runtime.
static uint32_t crc32_copy_masked_byteloop(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
/* 8-byte chunks: copy + CRC interleaved */
while (len >= 8) {
memcpy(dst, buf, 8);
dst += 8;
CRC_DO8(crc, buf, 0);
buf += 8;
len -= 8;
}
/* Use AVX-512 masked store for remainder copy (0-7 bytes) */
if (len > 0) {
__mmask8 mask = (__mmask8)_bzhi_u32(0xFF, (unsigned)len);
__m128i chunk = _mm_maskz_loadu_epi8(mask, buf);
_mm_mask_storeu_epi8(dst, mask, chunk);
}
/* Per-byte CRC of the remainder (copy already done above) */
while (len--) {
CRC_DO1(crc, buf, 0);
buf++;
}
return crc;
}
// Strategy 6: Piecemeal 32 with AVX-512 masked copy for remainder
// 32-byte main loop as in Strategy 1c; the final 0-31 bytes are copied in
// one branch via a 256-bit byte-masked load/store, while their CRC still
// uses the 16/8/4/2/1 table steps. Returns the updated CRC.
// Requires AVX512VL+BW and BMI2 at runtime.
static uint32_t crc32_copy_piecemeal_32_masked(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
/* 32-byte chunks: copy + CRC interleaved */
while (len >= 32) {
memcpy(dst, buf, 32);
dst += 32;
CRC_DO32(crc, buf, 0);
buf += 32;
len -= 32;
}
/* Use AVX-512 masked store for remainder copy (0-31 bytes) */
if (len > 0) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, (unsigned)len);
__m256i chunk = _mm256_maskz_loadu_epi8(mask, buf);
_mm256_mask_storeu_epi8(dst, mask, chunk);
}
/* CRC of the remainder via scalar table steps (copy already done above) */
if (len & 16) {
CRC_DO16(crc, buf, 0);
buf += 16;
}
if (len & 8) {
CRC_DO8(crc, buf, 0);
buf += 8;
}
if (len & 4) {
CRC_DO4(crc, buf, 0);
buf += 4;
}
if (len & 2) {
CRC_DO2(crc, buf, 0);
buf += 2;
}
if (len & 1) {
CRC_DO1(crc, buf, 0);
}
return crc;
}
#endif
/* Google Benchmark fixture: allocates aligned 128-byte source/destination
 * buffers, fills the source with pseudo-random bytes, and provides one
 * Bench* driver per strategy. state.range(0) is the tail length in bytes. */
class crc32_tail_copy : public benchmark::Fixture {
private:
uint8_t *srcbuf;   // random input bytes, read by every strategy
uint8_t *dstbuf;   // copy destination, large enough for the longest Arg (63)
uint32_t initial_crc;  // starting CRC value fed to every call
public:
void SetUp(::benchmark::State& state) {
/* zng_alloc_aligned is a zlib-ng helper — presumably (size, alignment),
 * i.e. 128 bytes at 64-byte alignment; TODO confirm against zbuild.h */
srcbuf = (uint8_t *)zng_alloc_aligned(128, 64);
dstbuf = (uint8_t *)zng_alloc_aligned(128, 64);
if (srcbuf == NULL || dstbuf == NULL) {
state.SkipWithError("malloc failed");
return;
}
/* rand() is deliberately unseeded: same byte sequence every run,
 * keeping results comparable across strategies */
for (int i = 0; i < 128; i++) {
srcbuf[i] = (uint8_t)(rand() & 0xff);
}
initial_crc = 0xFFFFFFFF;
}
void TearDown(const ::benchmark::State&) {
/* runs even after SkipWithError; assumes zng_free_aligned(NULL) is a
 * no-op like free(NULL) — TODO confirm */
zng_free_aligned(srcbuf);
zng_free_aligned(dstbuf);
}
/* Each Bench* method below times one strategy. DoNotOptimize keeps the
 * result (and dstbuf) observable; ClobberMemory forces the stores to
 * complete so the copy cannot be optimized away. */
void BenchPiecemeal(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
uint32_t crc = crc32_copy_piecemeal(initial_crc, dstbuf, srcbuf, len);
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchPiecemeal16(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
uint32_t crc = crc32_copy_piecemeal_16(initial_crc, dstbuf, srcbuf, len);
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchPiecemeal32(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
uint32_t crc = crc32_copy_piecemeal_32(initial_crc, dstbuf, srcbuf, len);
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchPiecemeal32Switch(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
uint32_t crc = crc32_copy_piecemeal_32_switch(initial_crc, dstbuf, srcbuf, len);
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchMemcpyFirst(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
uint32_t crc = crc32_copy_memcpy_first(initial_crc, dstbuf, srcbuf, len);
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchByteLoop(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
uint32_t crc = crc32_copy_byte_loop(initial_crc, dstbuf, srcbuf, len);
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchLoopRestrict(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
uint32_t crc = crc32_copy_loop_restrict(initial_crc, dstbuf, srcbuf, len);
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
#ifdef X86_AVX512
void BenchMasked(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
uint32_t crc = crc32_copy_masked(initial_crc, dstbuf, srcbuf, len);
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchMaskedByteloop(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
uint32_t crc = crc32_copy_masked_byteloop(initial_crc, dstbuf, srcbuf, len);
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
void BenchPiecemeal32Masked(benchmark::State& state) {
size_t len = (size_t)state.range(0);
for (auto _ : state) {
uint32_t crc = crc32_copy_piecemeal_32_masked(initial_crc, dstbuf, srcbuf, len);
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dstbuf);
benchmark::ClobberMemory();
}
}
#endif
};
/* Register every strategy at tail lengths 7, 15, 31 and 63 bytes — one
 * value just under each chunk-size boundary, so every tail path is hit. */
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal)(benchmark::State& state) {
BenchPiecemeal(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_16)(benchmark::State& state) {
BenchPiecemeal16(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_16)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_32)(benchmark::State& state) {
BenchPiecemeal32(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_32)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_32_switch)(benchmark::State& state) {
BenchPiecemeal32Switch(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_32_switch)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, memcpy_first)(benchmark::State& state) {
BenchMemcpyFirst(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, memcpy_first)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, byte_loop)(benchmark::State& state) {
BenchByteLoop(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, byte_loop)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, loop_restrict)(benchmark::State& state) {
BenchLoopRestrict(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, loop_restrict)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
/* AVX-512 variants only exist when the build enables X86_AVX512 */
#ifdef X86_AVX512
BENCHMARK_DEFINE_F(crc32_tail_copy, masked)(benchmark::State& state) {
BenchMasked(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, masked)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, masked_byteloop)(benchmark::State& state) {
BenchMaskedByteloop(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, masked_byteloop)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_32_masked)(benchmark::State& state) {
BenchPiecemeal32Masked(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_32_masked)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment