Last active
January 26, 2026 03:26
-
-
Save nmoinvaz/68124bf334027eb86d3a2497a819e390 to your computer and use it in GitHub Desktop.
Benchmark zlib crc32 tail copy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* benchmark_crc32_tail_copy.cc -- benchmark different copy strategies for CRC32 tail handling | |
| * Copyright (C) 2022 Nathan Moinvaziri | |
| * For conditions of distribution and use, see copyright notice in zlib.h | |
| */ | |
| #include <benchmark/benchmark.h> | |
| #include <cstring> | |
| #include <cstdint> | |
| extern "C" { | |
| # include "zbuild.h" | |
| # include "zutil.h" | |
| # include "crc32.h" | |
| # include "crc32_braid_tbl.h" | |
| } | |
#ifdef X86_AVX512
#include <immintrin.h>
#endif
/* Portable restrict qualifier: MSVC spells it __restrict, GCC/Clang __restrict__. */
#ifdef _MSC_VER
#define Z_RESTRICT __restrict
#else
#define Z_RESTRICT __restrict__
#endif
/* CRC macros from crc32_p.h */
/* Table-driven CRC-32 update of one byte: fold buf[i] into c via the 256-entry
 * crc_table (presumably provided by crc32_braid_tbl.h, included above — confirm).
 * The larger CRC_DOn macros unroll n sequential byte updates; note they do NOT
 * advance buf — callers advance the pointer themselves after invoking them. */
#define CRC_DO1(c, buf, i) c = crc_table[(c ^ buf[i]) & 0xff] ^ (c >> 8)
#define CRC_DO2(c, buf, i) {CRC_DO1(c, buf, i); CRC_DO1(c, buf, i+1);}
#define CRC_DO4(c, buf, i) {CRC_DO2(c, buf, i); CRC_DO2(c, buf, i+2);}
#define CRC_DO8(c, buf, i) {CRC_DO4(c, buf, i); CRC_DO4(c, buf, i+4);}
#define CRC_DO16(c, buf, i) {CRC_DO8(c, buf, i); CRC_DO8(c, buf, i+8);}
#define CRC_DO32(c, buf, i) {CRC_DO16(c, buf, i); CRC_DO16(c, buf, i+16);}
| // Strategy 1: Piecemeal copy with 8/4/2/1 (current crc32_copy_small approach) | |
| static uint32_t crc32_copy_piecemeal(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 8) { | |
| memcpy(dst, buf, 8); | |
| dst += 8; | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| len -= 8; | |
| } | |
| if (len & 4) { | |
| memcpy(dst, buf, 4); | |
| dst += 4; | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| memcpy(dst, buf, 2); | |
| dst += 2; | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| *dst = *buf; | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
| // Strategy 1b: Piecemeal copy with 16/8/4/2/1 | |
| static uint32_t crc32_copy_piecemeal_16(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 16) { | |
| memcpy(dst, buf, 16); | |
| dst += 16; | |
| CRC_DO16(crc, buf, 0); | |
| buf += 16; | |
| len -= 16; | |
| } | |
| if (len & 8) { | |
| memcpy(dst, buf, 8); | |
| dst += 8; | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| } | |
| if (len & 4) { | |
| memcpy(dst, buf, 4); | |
| dst += 4; | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| memcpy(dst, buf, 2); | |
| dst += 2; | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| *dst = *buf; | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
| // Strategy 1c: Piecemeal copy with 32/16/8/4/2/1 | |
| static uint32_t crc32_copy_piecemeal_32(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 32) { | |
| memcpy(dst, buf, 32); | |
| dst += 32; | |
| CRC_DO32(crc, buf, 0); | |
| buf += 32; | |
| len -= 32; | |
| } | |
| if (len & 16) { | |
| memcpy(dst, buf, 16); | |
| dst += 16; | |
| CRC_DO16(crc, buf, 0); | |
| buf += 16; | |
| } | |
| if (len & 8) { | |
| memcpy(dst, buf, 8); | |
| dst += 8; | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| } | |
| if (len & 4) { | |
| memcpy(dst, buf, 4); | |
| dst += 4; | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| memcpy(dst, buf, 2); | |
| dst += 2; | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| *dst = *buf; | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
// Strategy 1d: Piecemeal copy with 32-byte loop, hybrid if + switch for remainder
/* Copies len bytes from buf to dst while updating crc over the same bytes.
 * The switch replaces the 4/2/1 if-chain with one jump per remainder size;
 * each case spells out its decomposition (e.g. 7 = 4 + 2 + 1) explicitly. */
static uint32_t crc32_copy_piecemeal_32_switch(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) {
    while (len >= 32) {
        memcpy(dst, buf, 32);
        dst += 32;
        CRC_DO32(crc, buf, 0);
        buf += 32;
        len -= 32;
    }
    /* Handle 16-byte chunk */
    if (len & 16) {
        memcpy(dst, buf, 16); dst += 16; CRC_DO16(crc, buf, 0); buf += 16;
    }
    /* Handle 8-byte chunk */
    if (len & 8) {
        memcpy(dst, buf, 8); dst += 8; CRC_DO8(crc, buf, 0); buf += 8;
    }
    /* Switch for 0-7 bytes (8 cases) */
    switch (len & 7) {
    case 7: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); buf += 4;   /* 7 = 4 + 2 + 1 */
            memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); buf += 2;
            *dst = *buf; CRC_DO1(crc, buf, 0); break;
    case 6: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); buf += 4;   /* 6 = 4 + 2 */
            memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); break;
    case 5: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); buf += 4;   /* 5 = 4 + 1 */
            *dst = *buf; CRC_DO1(crc, buf, 0); break;
    case 4: memcpy(dst, buf, 4); dst += 4; CRC_DO4(crc, buf, 0); break;
    case 3: memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); buf += 2;   /* 3 = 2 + 1 */
            *dst = *buf; CRC_DO1(crc, buf, 0); break;
    case 2: memcpy(dst, buf, 2); dst += 2; CRC_DO2(crc, buf, 0); break;
    case 1: *dst = *buf; CRC_DO1(crc, buf, 0); break;
    case 0: break;
    }
    return crc;
}
| // Strategy 2: Single memcpy then CRC | |
| static uint32_t crc32_copy_memcpy_first(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| memcpy(dst, buf, len); | |
| while (len >= 8) { | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| len -= 8; | |
| } | |
| if (len & 4) { | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
| // Strategy 3: Byte loop for both copy and CRC (no restrict) | |
| static uint32_t crc32_copy_byte_loop(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len--) { | |
| *dst++ = *buf; | |
| CRC_DO1(crc, buf, 0); | |
| buf++; | |
| } | |
| return crc; | |
| } | |
| // Strategy 3b: Byte loop with restrict (compiler may auto-vectorize copy) | |
| static uint32_t crc32_copy_loop_restrict(uint32_t crc, uint8_t* Z_RESTRICT dst, const uint8_t* Z_RESTRICT buf, size_t len) { | |
| while (len--) { | |
| *dst++ = *buf; | |
| CRC_DO1(crc, buf, 0); | |
| buf++; | |
| } | |
| return crc; | |
| } | |
| #ifdef X86_AVX512 | |
| // Strategy 4: AVX-512 masked copy for remainder, piecemeal CRC | |
| static uint32_t crc32_copy_masked(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 8) { | |
| memcpy(dst, buf, 8); | |
| dst += 8; | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| len -= 8; | |
| } | |
| /* Use AVX-512 masked store for remainder copy (0-7 bytes) */ | |
| if (len > 0) { | |
| __mmask8 mask = (__mmask8)_bzhi_u32(0xFF, (unsigned)len); | |
| __m128i chunk = _mm_maskz_loadu_epi8(mask, buf); | |
| _mm_mask_storeu_epi8(dst, mask, chunk); | |
| } | |
| if (len & 4) { | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
| // Strategy 5: AVX-512 masked copy, byte loop for CRC remainder | |
| static uint32_t crc32_copy_masked_byteloop(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 8) { | |
| memcpy(dst, buf, 8); | |
| dst += 8; | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| len -= 8; | |
| } | |
| /* Use AVX-512 masked store for remainder copy (0-7 bytes) */ | |
| if (len > 0) { | |
| __mmask8 mask = (__mmask8)_bzhi_u32(0xFF, (unsigned)len); | |
| __m128i chunk = _mm_maskz_loadu_epi8(mask, buf); | |
| _mm_mask_storeu_epi8(dst, mask, chunk); | |
| } | |
| while (len--) { | |
| CRC_DO1(crc, buf, 0); | |
| buf++; | |
| } | |
| return crc; | |
| } | |
| // Strategy 6: Piecemeal 32 with AVX-512 masked copy for remainder | |
| static uint32_t crc32_copy_piecemeal_32_masked(uint32_t crc, uint8_t* dst, const uint8_t* buf, size_t len) { | |
| while (len >= 32) { | |
| memcpy(dst, buf, 32); | |
| dst += 32; | |
| CRC_DO32(crc, buf, 0); | |
| buf += 32; | |
| len -= 32; | |
| } | |
| /* Use AVX-512 masked store for remainder copy (0-31 bytes) */ | |
| if (len > 0) { | |
| __mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, (unsigned)len); | |
| __m256i chunk = _mm256_maskz_loadu_epi8(mask, buf); | |
| _mm256_mask_storeu_epi8(dst, mask, chunk); | |
| } | |
| if (len & 16) { | |
| CRC_DO16(crc, buf, 0); | |
| buf += 16; | |
| } | |
| if (len & 8) { | |
| CRC_DO8(crc, buf, 0); | |
| buf += 8; | |
| } | |
| if (len & 4) { | |
| CRC_DO4(crc, buf, 0); | |
| buf += 4; | |
| } | |
| if (len & 2) { | |
| CRC_DO2(crc, buf, 0); | |
| buf += 2; | |
| } | |
| if (len & 1) { | |
| CRC_DO1(crc, buf, 0); | |
| } | |
| return crc; | |
| } | |
| #endif | |
| class crc32_tail_copy : public benchmark::Fixture { | |
| private: | |
| uint8_t *srcbuf; | |
| uint8_t *dstbuf; | |
| uint32_t initial_crc; | |
| public: | |
| void SetUp(::benchmark::State& state) { | |
| srcbuf = (uint8_t *)zng_alloc_aligned(128, 64); | |
| dstbuf = (uint8_t *)zng_alloc_aligned(128, 64); | |
| if (srcbuf == NULL || dstbuf == NULL) { | |
| state.SkipWithError("malloc failed"); | |
| return; | |
| } | |
| for (int i = 0; i < 128; i++) { | |
| srcbuf[i] = (uint8_t)(rand() & 0xff); | |
| } | |
| initial_crc = 0xFFFFFFFF; | |
| } | |
| void TearDown(const ::benchmark::State&) { | |
| zng_free_aligned(srcbuf); | |
| zng_free_aligned(dstbuf); | |
| } | |
| void BenchPiecemeal(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_piecemeal(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchPiecemeal16(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_piecemeal_16(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchPiecemeal32(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_piecemeal_32(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchPiecemeal32Switch(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_piecemeal_32_switch(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchMemcpyFirst(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_memcpy_first(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchByteLoop(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_byte_loop(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchLoopRestrict(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_loop_restrict(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| #ifdef X86_AVX512 | |
| void BenchMasked(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_masked(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchMaskedByteloop(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_masked_byteloop(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| void BenchPiecemeal32Masked(benchmark::State& state) { | |
| size_t len = (size_t)state.range(0); | |
| for (auto _ : state) { | |
| uint32_t crc = crc32_copy_piecemeal_32_masked(initial_crc, dstbuf, srcbuf, len); | |
| benchmark::DoNotOptimize(crc); | |
| benchmark::DoNotOptimize(dstbuf); | |
| benchmark::ClobberMemory(); | |
| } | |
| } | |
| #endif | |
| }; | |
/* Register every strategy at tail lengths 7, 15, 31, and 63 bytes — each one
 * byte short of a power-of-two bucket, exercising the remainder handling. */
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal)(benchmark::State& state) {
    BenchPiecemeal(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_16)(benchmark::State& state) {
    BenchPiecemeal16(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_16)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_32)(benchmark::State& state) {
    BenchPiecemeal32(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_32)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_32_switch)(benchmark::State& state) {
    BenchPiecemeal32Switch(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_32_switch)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, memcpy_first)(benchmark::State& state) {
    BenchMemcpyFirst(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, memcpy_first)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, byte_loop)(benchmark::State& state) {
    BenchByteLoop(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, byte_loop)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, loop_restrict)(benchmark::State& state) {
    BenchLoopRestrict(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, loop_restrict)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
/* AVX-512 variants are only registered when the build enables X86_AVX512. */
#ifdef X86_AVX512
BENCHMARK_DEFINE_F(crc32_tail_copy, masked)(benchmark::State& state) {
    BenchMasked(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, masked)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, masked_byteloop)(benchmark::State& state) {
    BenchMaskedByteloop(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, masked_byteloop)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
BENCHMARK_DEFINE_F(crc32_tail_copy, piecemeal_32_masked)(benchmark::State& state) {
    BenchPiecemeal32Masked(state);
}
BENCHMARK_REGISTER_F(crc32_tail_copy, piecemeal_32_masked)->Arg(7)->Arg(15)->Arg(31)->Arg(63);
#endif
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment