Last active
January 26, 2026 03:26
-
-
Save nmoinvaz/68124bf334027eb86d3a2497a819e390 to your computer and use it in GitHub Desktop.
Benchmark zlib crc32 tail copy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* benchmark_crc32_tail_copy.cc -- benchmark different copy strategies for CRC32 tail handling | |
| * Copyright (C) 2022 Nathan Moinvaziri | |
| * For conditions of distribution and use, see copyright notice in zlib.h | |
| */ | |
| #include <benchmark/benchmark.h> | |
| #include <cstring> | |
| #include <cstdint> | |
| extern "C" { | |
| # include "zbuild.h" | |
| # include "zutil.h" | |
| # include "crc32.h" | |
| # include "crc32_braid_tbl.h" | |
| } | |
#ifdef X86_AVX512
#include <immintrin.h>
#endif
/* Portable restrict qualifier: MSVC spells it __restrict, GCC/Clang __restrict__. */
#ifdef _MSC_VER
#define Z_RESTRICT __restrict
#else
#define Z_RESTRICT __restrict__
#endif
/* CRC macros from crc32_p.h */
/* Table-driven CRC-32 update of one byte: fold buf[i] into c via the 256-entry
 * crc_table (presumably provided by crc32_braid_tbl.h, included above — confirm).
 * The larger CRC_DOn macros unroll n sequential byte updates; note they do NOT
 * advance buf — callers advance the pointer themselves after invoking them. */
#define CRC_DO1(c, buf, i) c = crc_table[(c ^ buf[i]) & 0xff] ^ (c >> 8)
#define CRC_DO2(c, buf, i) {CRC_DO1(c, buf, i); CRC_DO1(c, buf, i+1);}
#define CRC_DO4(c, buf, i) {CRC_DO2(c, buf, i); CRC_DO2(c, buf, i+2);}
#define CRC_DO8(c, buf, i) {CRC_DO4(c, buf, i); CRC_DO4(c, buf, i+4);}
#define CRC_DO16(c, buf, i) {CRC_DO8(c, buf, i); CRC_DO8(c, buf, i+8);}
#define CRC_DO32(c, buf, i) {CRC_DO16(c, buf, i); CRC_DO16(c, buf, i+16);}
| // Strategy 1: Piecemeal copy with 8/4/2/1 (current crc32_copy_small approach) | |
| static uint32_t crc32_copy_piecemeal(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 8) { | |
| memcpy(dst, buf, 8); | |
| dst += 8; | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| len -= 8; | |
| } | |
| if (len & 4) { | |
| memcpy(dst, buf, 4); | |
| dst += 4; | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| memcpy(dst, buf, 2); | |
| dst += 2; | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| *dst = *buf; | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
| // Strategy 1b: Piecemeal copy with 16/8/4/2/1 | |
| static uint32_t crc32_copy_piecemeal_16(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 16) { | |
| memcpy(dst, buf, 16); | |
| dst += 16; | |
| CRC_DO16(crc, buf, 0); | |
| buf += 16; | |
| len -= 16; | |
| } | |
| if (len & 8) { | |
| memcpy(dst, buf, 8); | |
| dst += 8; | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| } | |
| if (len & 4) { | |
| memcpy(dst, buf, 4); | |
| dst += 4; | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| memcpy(dst, buf, 2); | |
| dst += 2; | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| *dst = *buf; | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
| // Strategy 1c: Piecemeal copy with 32/16/8/4/2/1 | |
| static uint32_t crc32_copy_piecemeal_32(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 32) { | |
| memcpy(dst, buf, 32); | |
| dst += 32; | |
| CRC_DO32(crc, buf, 0); | |
| buf += 32; | |
| len -= 32; | |
| } | |
| if (len & 16) { | |
| memcpy(dst, buf, 16); | |
| dst += 16; | |
| CRC_DO16(crc, buf, 0); | |
| buf += 16; | |
| } | |
| if (len & 8) { | |
| memcpy(dst, buf, 8); | |
| dst += 8; | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| } | |
| if (len & 4) { | |
| memcpy(dst, buf, 4); | |
| dst += 4; | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| memcpy(dst, buf, 2); | |
| dst += 2; | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| *dst = *buf; | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
// Strategy 1d: Piecemeal copy with 32-byte loop, hybrid if + switch for remainder
/* Copies len bytes from buf to dst while updating crc over the same bytes.
 * The switch replaces the 4/2/1 if-chain with one jump per remainder size;
 * each case spells out its decomposition (e.g. 7 = 4 + 2 + 1) explicitly. */
static uint32_t crc32_copy_piecemeal_32_switch(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
    while (len >= 32) {
        memcpy(dst, buf, 32);
        dst += 32;
        CRC_DO32(crc, buf, 0);
        buf += 32;
        len -= 32;
    }
    /* Handle 16-byte chunk */
    if (len & 16) {
        memcpy(dst, buf, 16); dst += 16; CRC_DO16(crc, buf, 0); buf += 16;
    }
    /* Handle 8-byte chunk */
    if (len & 8) {
        memcpy(dst, buf, 8); dst += 8; CRC_DO8(crc, buf, 0); buf += 8;
    }
    /* Switch for 0-7 bytes (8 cases) */
    switch (len & 7) {
    case 7: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); buf += 4;   /* 7 = 4 + 2 + 1 */
            memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); buf += 2;
            *dst = *buf; CRC_DO1(crc, buf, 0); break;
    case 6: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); buf += 4;   /* 6 = 4 + 2 */
            memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); break;
    case 5: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); buf += 4;   /* 5 = 4 + 1 */
            *dst = *buf; CRC_DO1(crc, buf, 0); break;
    case 4: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); break;
    case 3: memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); buf += 2;   /* 3 = 2 + 1 */
            *dst = *buf; CRC_DO1(crc, buf, 0); break;
    case 2: memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); break;
    case 1: *dst = *buf; CRC_DO1(crc, buf, 0); break;
    case 0: break;
    }
    return crc;
}
| // Strategy 2: Single memcpy then CRC | |
| static uint32_t crc32_copy_memcpy_first(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| memcpy(dst, buf, len); | |
| while (len >= 8) { | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| len -= 8; | |
| } | |
| if (len & 4) { | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
| // Strategy 3: Byte loop for both copy and CRC (no restrict) | |
| static uint32_t crc32_copy_byte_loop(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len--) { | |
| *dst++ = *buf; | |
| CRC_DO1(crc, buf, 0); | |
| buf++; | |
| } | |
| return crc; | |
| } | |
| // Strategy 3b: Byte loop with restrict (compiler may auto-vectorize copy) | |
| static uint32_t crc32_copy_loop_restrict(uint32_t crc, uint8_t* Z_RESTRICT dst, const uint8_t* Z_RESTRICT buf, size_t len) { | |
| while (len--) { | |
| *dst++ = *buf; | |
| CRC_DO1(crc, buf, 0); | |
| buf++; | |
| } | |
| return crc; | |
| } | |
| #ifdef X86_AVX512 | |
| // Strategy 4: AVX-512 masked copy for remainder, piecemeal CRC | |
| static uint32_t crc32_copy_masked(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 8) { | |
| memcpy(dst, buf, 8); | |
| dst += 8; | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| len -= 8; | |
| } | |
| /* Use AVX-512 masked store for remainder copy (0-7 bytes) */ | |
| if (len > 0) { | |
| __mmask8 mask = (__mmask8)_bzhi_u32(0xFF, (unsigned)len); | |
| __m128i chunk = _mm_maskz_loadu_epi8(mask, buf); | |
| _mm_mask_storeu_epi8(dst, mask, chunk); | |
| } | |
| if (len & 4) { | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
| // Strategy 5: AVX-512 masked copy, byte loop for CRC remainder | |
| static uint32_t crc32_copy_masked_byteloop(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 8) { | |
| memcpy(dst, buf, 8); | |
| dst += 8; | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| len -= 8; | |
| } | |
| /* Use AVX-512 masked store for remainder copy (0-7 bytes) */ | |
| if (len > 0) { | |
| __mmask8 mask = (__mmask8)_bzhi_u32(0xFF, (unsigned)len); | |
| __m128i chunk = _mm_maskz_loadu_epi8(mask, buf); | |
| _mm_mask_storeu_epi8(dst, mask, chunk); | |
| } | |
| while (len--) { | |
| CRC_DO1(crc, buf, 0); | |
| buf++; | |
| } | |
| return crc; | |
| } | |
| // Strategy 6: Piecemeal 32 with AVX-512 masked copy for remainder | |
| static uint32_t crc32_copy_piecemeal_32_masked(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 32) { | |
| memcpy(dst, buf, 32); | |
| dst += 32; | |
| CRC_DO32(crc, buf, 0); | |
| buf += 32; | |
| len -= 32; | |
| } | |
| /* Use AVX-512 masked store for remainder copy (0-31 bytes) */ | |
| if (len > 0) { | |
| __mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, (unsigned)len); | |
| __m256i chunk = _mm256_maskz_loadu_epi8(mask, buf); | |
| _mm256_mask_storeu_epi8(dst, mask, chunk); | |
| } | |
| if (len & 16) { | |
| CRC_DO16(crc, buf, 0); | |
| buf += 16; | |
| } | |
| if (len & 8) { | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| } | |
| if (len & 4) { | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
| #endif | |
| class crc32_tail_copy : public benchmark::Fixture { | |
| private: | |
| uint8_t *srcbuf; | |
| uint8_t *dstbuf; | |
| uint32_t initial_crc; | |
| public: | |
| void SetUp(::benchmark::State& state) { | |
| srcbuf = (uint8_t *)zng_alloc_aligned(128, 64); | |
| dstbuf = (uint8_t *)zng_alloc_aligned(128, 64); | |
| if (srcbuf == NULL || dstbuf == NULL) { | |
| state.SkipWithError("malloc failed"); | |
| return; | |
| } | |
| for (int i = 0; i < 128; i++) { | |
| srcbuf[i] = (uint8_t)(rand() & 0xff); | |
| } | |
| initial_crc = 0xFFFFFFFF; | |
| } | |
| void TearDown(const ::benchmark::State&) { | |
| zng_free_aligned(srcbuf); | |
| zng_free_aligned(dstbuf); | |
| } | |
| void BenchPiecemeal(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_piecemeal(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchPiecemeal16(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_piecemeal_16(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchPiecemeal32(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_piecemeal_32(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchPiecemeal32Switch(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_piecemeal_32_switch(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchMemcpyFirst(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_memcpy_first(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchByteLoop(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_byte_loop(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchLoopRestrict(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_loop_restrict(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| #ifdef X86_AVX512 | |
| void BenchMasked(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_masked(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchMaskedByteloop(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_masked_byteloop(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchPiecemeal32Masked(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_piecemeal_32_masked(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| #endif | |
| }; | |
/* Register every strategy at tail lengths 7, 15, 31, and 63 bytes — each one
 * byte short of a power-of-two bucket, exercising the remainder handling. */
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal)(benchmark::State& state) {
    BenchPiecemeal(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_16)(benchmark::State& state) {
    BenchPiecemeal16(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_16)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_32)(benchmark::State& state) {
    BenchPiecemeal32(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_32)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_32_switch)(benchmark::State& state) {
    BenchPiecemeal32Switch(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_32_switch)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, memcpy_first)(benchmark::State& state) {
    BenchMemcpyFirst(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, memcpy_first)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, byte_loop)(benchmark::State& state) {
    BenchByteLoop(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, byte_loop)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, loop_restrict)(benchmark::State& state) {
    BenchLoopRestrict(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, loop_restrict)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
/* AVX-512 variants are only registered when the build enables X86_AVX512. */
#ifdef X86_AVX512
BENCHMARK_DEFINE_F(crc32_tail_copy, masked)(benchmark::State& state) {
    BenchMasked(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, masked)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, masked_byteloop)(benchmark::State& state) {
    BenchMaskedByteloop(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, masked_byteloop)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_32_masked)(benchmark::State& state) {
    BenchPiecemeal32Masked(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_32_masked)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
#endif
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment