Skip to content

Instantly share code, notes, and snippets.

@nmoinvaz
Last active January 26, 2026 04:06
Show Gist options
  • Select an option

  • Save nmoinvaz/2c7a3076049b96b8df4510c455f44720 to your computer and use it in GitHub Desktop.

Select an option

Save nmoinvaz/2c7a3076049b96b8df4510c455f44720 to your computer and use it in GitHub Desktop.
Benchmark count matching bytes
#include <benchmark/benchmark.h>
#include <cstdint>
static inline uint32_t count_matching_bytes_ctzll(uint64_t mask) {
return __builtin_ctzll(mask);
}
static inline uint32_t count_matching_bytes_ctz32(uint64_t mask) {
uint32_t lo = (uint32_t)mask;
if (lo)
return __builtin_ctz(lo);
uint32_t hi = (uint32_t)(mask >> 32);
return 32 + __builtin_ctz(hi);
}
static inline uint32_t count_matching_bytes_debruijn(uint64_t mask) {
static const uint8_t debruijn_ctz64[64] = {
63, 0, 1, 52, 2, 6, 53, 26, 3, 37, 40, 7, 33, 54, 47, 27,
61, 4, 38, 45, 43, 41, 21, 8, 23, 34, 58, 55, 48, 17, 28, 10,
62, 51, 5, 25, 36, 39, 32, 46, 60, 44, 42, 20, 22, 57, 16, 9,
50, 24, 35, 31, 59, 19, 56, 15, 49, 30, 18, 14, 29, 13, 12, 11
};
return debruijn_ctz64[((mask & (uint64_t)(-(int64_t)mask)) * 0x045FBAC7992A70DAULL) >> 58];
}
static const uint64_t test_masks[] = {
0x0000000000000001ULL, 0x0000000000000100ULL, 0x0000000000010000ULL,
0x0000000001000000ULL, 0x0000000100000000ULL, 0x0000010000000000ULL,
0x0001000000000000ULL, 0x0100000000000000ULL, 0x00000000FFFFFFFFULL,
0xFFFFFFFF00000000ULL, 0xFFFFFFFFFFFFFFFFULL, 0x8000000000000000ULL,
0xAAAAAAAAAAAAAAAAULL, 0x5555555555555555ULL, 0x00000000000000FFULL,
0xFF00000000000000ULL,
};
static const size_t num_masks = sizeof(test_masks) / sizeof(test_masks[0]);
static void BM_ctzll(benchmark::State& state) {
for (auto _ : state)
for (size_t i = 0; i < num_masks; i++)
benchmark::DoNotOptimize(count_matching_bytes_ctzll(test_masks[i]));
}
BENCHMARK(BM_ctzll);
static void BM_ctz32_split(benchmark::State& state) {
for (auto _ : state)
for (size_t i = 0; i < num_masks; i++)
benchmark::DoNotOptimize(count_matching_bytes_ctz32(test_masks[i]));
}
BENCHMARK(BM_ctz32_split);
static void BM_debruijn(benchmark::State& state) {
for (auto _ : state)
for (size_t i = 0; i < num_masks; i++)
benchmark::DoNotOptimize(count_matching_bytes_debruijn(test_masks[i]));
}
BENCHMARK(BM_debruijn);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment