Skip to content

Instantly share code, notes, and snippets.

@WiwilZ
Last active April 16, 2024 10:45
Show Gist options
  • Select an option

  • Save WiwilZ/667c08ae4373508cf4a016dd1e027310 to your computer and use it in GitHub Desktop.

Select an option

Save WiwilZ/667c08ae4373508cf4a016dd1e027310 to your computer and use it in GitHub Desktop.
Crc32 function optimized with SIMD instructions
/******************generate bit-reflected constants********************
constexpr uint64_t Polynomial = 0x1db710641 ; // bit reverse for significant bits of 0x104c11db7
template <bool GetQuotient = false>
constexpr uint64_t Generate(uint64_t deg) {
uint64_t quot = 0;
uint64_t crc = 2;
for (size_t i = 0; i < deg - 31; i++) {
crc >>= 1;
if (crc & 1) {
crc ^= Polynomial;
quot |= 1ull << i;
}
}
return GetQuotient ? quot : crc;
}
int main(int argc, char** argv) {
// for avx512
for (auto i : {4 * 512 + 32, 4 * 512 - 32, 512 + 32, 512 - 32}) {
std::cout << std::format("0x{:09x}\n", Generate(i));
}
std::cout << "\n\n";
//for sse
for (auto i : {4 * 128 + 32, 4 * 128 - 32, 128 + 32, 128 - 32}) {
std::cout << std::format("0x{:09x}\n", Generate(i));
}
std::cout << "\n\n";
// for common
std::cout << std::format("0x{:09x}\n", Generate(64))
<< std::format("0x{:09x}\n", Generate(32))
<< std::format("0x{:09x}\n", Generate<true>(64));
}
*/
#if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64) && !defined(_M_ARM64EC))
#define __SSE4_2__
#define __PCLMUL__
#endif
#include <cstddef>
#include <cstdint>
#include <cstring>
#if defined(__AVX512F__) || defined(__SSE4_2__) && defined(__PCLMUL__)
# if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
# else
#include <smmintrin.h>
#include <wmmintrin.h>
#ifdef __AVX512F__
#include <immintrin.h>
#endif
# endif
namespace detail {
// Folds the four 128-bit accumulators x1..x4 into a single accumulator, left in x1.
// Each round carry-less-multiplies the two halves of x1 by the 128-bit-stride fold
// constants and XORs in the next accumulator.
// Returns the constant register {hi=0x0ccaa009e, lo=0x1751997d0} so callers can
// reuse it for subsequent single 16-byte folds and pass it to Fold128BitsToResult.
__m128i FoldTo128Bits(__m128i& x1, const __m128i x2, const __m128i x3, const __m128i x4) noexcept {
// Fold coefficients x^(128+32) and x^(128-32) mod P, bit-reflected — produced by
// Generate(128±32) in the commented-out generator at the top of the file.
const __m128i x0 = _mm_set_epi64x(0x0ccaa009e, 0x1751997d0);
__m128i x5;
// Round 1: fold x1 forward by 128 bits and absorb x2.
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
#ifdef __AVX512VL__
// imm 0x96 computes the three-way XOR a ^ b ^ c (same as the #else branch).
x1 = _mm_ternarylogic_epi64(x1, x2, x5, 0x96);
#else
x1 = _mm_xor_si128(_mm_xor_si128(x1, x2), x5);
#endif
// Round 2: absorb x3.
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
#ifdef __AVX512VL__
x1 = _mm_ternarylogic_epi64(x1, x3, x5, 0x96);
#else
x1 = _mm_xor_si128(_mm_xor_si128(x1, x3), x5);
#endif
// Round 3: absorb x4.
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
#ifdef __AVX512VL__
x1 = _mm_ternarylogic_epi64(x1, x4, x5, 0x96);
#else
x1 = _mm_xor_si128(_mm_xor_si128(x1, x4), x5);
#endif
return x0;
}
// Reduces the folded 128-bit accumulator x1 to the final 32-bit CRC value.
// x0 must hold the fold-constant pair returned by FoldTo128Bits
// ({hi=0x0ccaa009e, lo=0x1751997d0}).
uint32_t Fold128BitsToResult(__m128i x0, __m128i x1) noexcept {
__m128i x2, x3;
// Fold 128-bits to 64-bits
x2 = _mm_clmulepi64_si128(x1, x0, 0x10); // low64(x1) * high64(x0)
x3 = _mm_setr_epi32(-1, 0, -1, 0); // mask selecting the low 32 bits of each 64-bit half
x1 = _mm_srli_si128(x1, 8);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_cvtsi64_si128(0x163cd6124); // Generate(64): constant for the final 64->32 fold
x2 = _mm_srli_si128(x1, 4);
x1 = _mm_and_si128(x1, x3);
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
// Barret reduce to 32-bits
// hi = mu (Generate<true>(64), the quotient of x^64 / P), lo = bit-reflected
// polynomial 0x1db710641 — both match the generator comment at the top of the file.
x0 = _mm_set_epi64x(0x1f7011641, 0x1db710641);
x2 = _mm_and_si128(x1, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x10); // multiply by mu
x2 = _mm_and_si128(x2, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x00); // multiply by P
x1 = _mm_xor_si128(x1, x2);
// The reduced CRC lands in bits [32, 63] of x1.
return _mm_extract_epi32(x1, 1);
}
// Folding kernel, specialized by minimum block size in bytes; only the explicit
// specializations below are defined. `crc` is the running pre-inverted CRC state
// as prepared by the public Crc32 wrapper at the bottom of this file.
template <size_t NumBytes>
uint32_t Crc32(const void* data, size_t length, uint32_t crc = 0) noexcept;
#ifdef __AVX512F__
// AVX-512 kernel. Expects length >= 256 and length a multiple of 64 (the
// dispatcher below guarantees this); `crc` is XORed into the first 4 bytes.
template <>
uint32_t Crc32<256>(const void* data, size_t length, uint32_t crc) noexcept {
auto buffer = static_cast<const __m512i*>(data);
// There's at least one block of 256
__m512i x1 = _mm512_xor_si512(_mm512_loadu_si512(buffer + 0), _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
__m512i x2 = _mm512_loadu_si512(buffer + 1);
__m512i x3 = _mm512_loadu_si512(buffer + 2);
__m512i x4 = _mm512_loadu_si512(buffer + 3);
buffer += 4;
length -= 256;
// Fold constants for a 2048-bit (4 x 512) stride, Generate(4*512±32) per the
// generator comment at the top of the file; replicated into every 128-bit lane.
__m512i x0 = _mm512_set_epi64(0x1322d1430, 0x11542778a, 0x1322d1430, 0x11542778a,
0x1322d1430, 0x11542778a, 0x1322d1430, 0x11542778a);
__m512i x5, x6, x7, x8;
// Parallel fold blocks of 256
while (length >= 256) {
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
// imm 0x96 = three-way XOR: fold result ^ cross product ^ new data.
x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_loadu_si512(buffer + 0), 0x96);
x2 = _mm512_ternarylogic_epi64(x2, x6, _mm512_loadu_si512(buffer + 1), 0x96);
x3 = _mm512_ternarylogic_epi64(x3, x7, _mm512_loadu_si512(buffer + 2), 0x96);
x4 = _mm512_ternarylogic_epi64(x4, x8, _mm512_loadu_si512(buffer + 3), 0x96);
buffer += 4;
length -= 256;
}
// Fold into 512-bits
// 512-bit-stride constants, Generate(4*128±32) per the generator comment.
x0 = _mm512_set_epi64(0x1c6e41596, 0x154442bd4, 0x1c6e41596, 0x154442bd4,
0x1c6e41596, 0x154442bd4, 0x1c6e41596, 0x154442bd4);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_ternarylogic_epi64(x1, x3, x5, 0x96);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_ternarylogic_epi64(x1, x4, x5, 0x96);
// Single fold blocks of 64
while (length >= 64) {
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_ternarylogic_epi64(x1, _mm512_loadu_si512(buffer), x5, 0x96);
++buffer;
length -= 64;
}
// Fold 512-bits to 128-bits
__m128i a1 = _mm512_extracti32x4_epi32(x1, 0);
const __m128i a2 = _mm512_extracti32x4_epi32(x1, 1);
const __m128i a3 = _mm512_extracti32x4_epi32(x1, 2);
const __m128i a4 = _mm512_extracti32x4_epi32(x1, 3);
const __m128i a0 = FoldTo128Bits(a1, a2, a3, a4);
return Fold128BitsToResult(a0, a1);
}
#endif // defined(__AVX512F__)
// SSE4.2/PCLMUL kernel. Expects length >= 64 and length a multiple of 16 (the
// dispatcher below guarantees this); `crc` is XORed into the first 4 bytes.
template <>
uint32_t Crc32<64>(const void* data, size_t length, uint32_t crc) noexcept {
auto buffer = static_cast<const __m128i*>(data);
// There's at least one block of 64
__m128i x1 = _mm_xor_si128(_mm_loadu_si128(buffer + 0), _mm_cvtsi32_si128(crc));
__m128i x2 = _mm_loadu_si128(buffer + 1);
__m128i x3 = _mm_loadu_si128(buffer + 2);
__m128i x4 = _mm_loadu_si128(buffer + 3);
buffer += 4;
length -= 64;
// 512-bit-stride fold constants, Generate(4*128±32) per the generator comment.
__m128i x0 = _mm_set_epi64x(0x1c6e41596, 0x154442bd4);
__m128i x5, x6, x7, x8;
// Parallel fold blocks of 64
while (length >= 64) {
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
#ifdef __AVX512VL__
// imm 0x96 = three-way XOR (same result as the #else branch).
x1 = _mm_ternarylogic_epi64(x1, x5, _mm_loadu_si128(buffer + 0), 0x96);
x2 = _mm_ternarylogic_epi64(x2, x6, _mm_loadu_si128(buffer + 1), 0x96);
x3 = _mm_ternarylogic_epi64(x3, x7, _mm_loadu_si128(buffer + 2), 0x96);
x4 = _mm_ternarylogic_epi64(x4, x8, _mm_loadu_si128(buffer + 3), 0x96);
#else
x1 = _mm_xor_si128(_mm_xor_si128(x1, x5), _mm_loadu_si128(buffer + 0));
x2 = _mm_xor_si128(_mm_xor_si128(x2, x6), _mm_loadu_si128(buffer + 1));
x3 = _mm_xor_si128(_mm_xor_si128(x3, x7), _mm_loadu_si128(buffer + 2));
x4 = _mm_xor_si128(_mm_xor_si128(x4, x8), _mm_loadu_si128(buffer + 3));
#endif
buffer += 4;
length -= 64;
}
// Fold into 128-bits
// x0 now holds the 128-bit-stride constants returned by FoldTo128Bits,
// reused by the 16-byte loop below and the final reduction.
x0 = FoldTo128Bits(x1, x2, x3, x4);
// Single fold blocks of 16
while (length >= 16) {
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
#ifdef __AVX512VL__
x1 = _mm_ternarylogic_epi64(x1, _mm_loadu_si128(buffer), x5, 0x96);
#else
x1 = _mm_xor_si128(_mm_xor_si128(x1, _mm_loadu_si128(buffer)), x5);
#endif
++buffer;
length -= 16;
}
return Fold128BitsToResult(x0, x1);
}
// Small-input SSE4.2/PCLMUL kernel. Expects length >= 16 and length a multiple
// of 16 (the dispatcher below guarantees this); `crc` is XORed into the first
// 4 bytes of the stream.
template <>
uint32_t Crc32<16>(const void* data, size_t length, uint32_t crc) noexcept {
auto buffer = static_cast<const __m128i*>(data);
// There's at least one block of 16
__m128i x1 = _mm_xor_si128(_mm_loadu_si128(buffer++), _mm_cvtsi32_si128(crc));
// 128-bit-stride fold constants (Generate(128±32), see top-of-file comment).
const __m128i x0 = _mm_set_epi64x(0x0ccaa009e, 0x1751997d0);
// Single fold blocks of 16
for (length -= 16; length >= 16; length -= 16) {
const __m128i tmp = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
#ifdef __AVX512VL__
// imm 0x96 = three-way XOR (same result as the #else branch).
x1 = _mm_ternarylogic_epi64(x1, _mm_loadu_si128(buffer++), tmp, 0x96);
#else
// Fix: advance `buffer` here as well. The original SSE path never
// incremented it, so every iteration re-folded the same 16-byte block.
x1 = _mm_xor_si128(_mm_xor_si128(x1, _mm_loadu_si128(buffer++)), tmp);
#endif
}
return Fold128BitsToResult(x0, x1);
}
} // namespace detail
#endif // defined(__AVX512F__) || defined(__SSE4_2__) && defined(__PCLMUL__)
#include <array>
namespace detail {
// Slicing-by-8 lookup tables for the bit-reflected CRC-32 polynomial 0xEDB88320.
// tables[0] is the classic byte-at-a-time table; tables[j][i] extends
// tables[j-1][i] by one zero byte, so eight bytes can be absorbed per step.
constexpr auto Crc32Table = [] {
constexpr uint32_t ReflectedPoly = 0xEDB88320;
std::array<std::array<uint32_t, 256>, 8> tables{};
// Base table: CRC of each single byte, computed bit by bit.
for (uint32_t byte = 0; byte < 256; ++byte) {
uint32_t value = byte;
for (int bit = 8; bit-- > 0;) {
value = (value >> 1) ^ (ReflectedPoly * (value & 1));
}
tables[0][byte] = value;
}
// Derived tables: shift by one byte and fold through the base table.
for (uint32_t byte = 0; byte < 256; ++byte) {
uint32_t value = tables[0][byte];
for (size_t slice = 1; slice < tables.size(); ++slice) {
value = (value >> 8) ^ tables[0][value & 0xFF];
tables[slice][byte] = value;
}
}
return tables;
}();
}
// Public entry point: CRC-32 (bit-reflected polynomial 0xEDB88320, zlib/PKZIP
// flavor) of `length` bytes at `data`. Pass `crc` = 0 for a fresh computation,
// or a previous return value to continue a streamed computation.
uint32_t Crc32(const void* const data, size_t length, uint32_t crc = 0) noexcept {
    auto buffer = static_cast<const uint8_t*>(data);
    crc = ~crc; // standard CRC-32 pre-inversion; undone on every return path
#if defined(__AVX512F__) || defined(__SSE4_2__) && defined(__PCLMUL__)
#ifdef __AVX512F__
    if (length >= 256) {
        const size_t chunkSize = length & ~63; // largest multiple of 64
        crc = detail::Crc32<256>(buffer, chunkSize, crc);
        length &= 63;
        if (length == 0) {
            return ~crc;
        }
        buffer += chunkSize;
    }
#endif
    if (length >= 64) {
        const size_t chunkSize = length & ~15; // largest multiple of 16
        // Fix: fold from `buffer`, not `data` — they differ once an earlier
        // stage has consumed a chunk. The old code passed `data`, so after the
        // AVX-512 stage left a 16..63-byte remainder the <16> kernel rehashed
        // the start of the input instead of the tail.
        crc = detail::Crc32<64>(buffer, chunkSize, crc);
        length &= 15;
        if (length == 0) {
            return ~crc;
        }
        buffer += chunkSize;
    }
    if (length >= 16) {
        const size_t chunkSize = length & ~15;
        crc = detail::Crc32<16>(buffer, chunkSize, crc);
        length &= 15;
        if (length == 0) {
            return ~crc;
        }
        buffer += chunkSize;
    }
    // At most 15 bytes remain; finish with the slicing-by-8 tables.
    if (length & 8) {
        uint64_t x;
        std::memcpy(&x, buffer, sizeof(x)); // safe unaligned, aliasing-clean load
        x ^= crc;
        crc = detail::Crc32Table[0][x >> 56] ^ detail::Crc32Table[1][(x >> 48) & 0xFF] ^
              detail::Crc32Table[2][(x >> 40) & 0xFF] ^ detail::Crc32Table[3][(x >> 32) & 0xFF] ^
              detail::Crc32Table[4][(x >> 24) & 0xFF] ^ detail::Crc32Table[5][(x >> 16) & 0xFF] ^
              detail::Crc32Table[6][(x >> 8) & 0xFF] ^ detail::Crc32Table[7][x & 0xFF];
        buffer += 8;
    }
#else
    // No CLMUL kernels available: slicing-by-8, eight bytes per iteration.
    while (length >= 8) {
        uint64_t x;
        std::memcpy(&x, buffer, sizeof(x)); // safe unaligned, aliasing-clean load
        x ^= crc;
        crc = detail::Crc32Table[0][x >> 56] ^ detail::Crc32Table[1][(x >> 48) & 0xFF] ^
              detail::Crc32Table[2][(x >> 40) & 0xFF] ^ detail::Crc32Table[3][(x >> 32) & 0xFF] ^
              detail::Crc32Table[4][(x >> 24) & 0xFF] ^ detail::Crc32Table[5][(x >> 16) & 0xFF] ^
              detail::Crc32Table[6][(x >> 8) & 0xFF] ^ detail::Crc32Table[7][x & 0xFF];
        buffer += 8;
        length -= 8;
    }
#endif // defined(__AVX512F__) || defined(__SSE4_2__) && defined(__PCLMUL__)
    if (length & 4) {
        uint32_t word;
        std::memcpy(&word, buffer, sizeof(word));
        crc ^= word;
        crc = detail::Crc32Table[0][crc >> 24] ^ detail::Crc32Table[1][(crc >> 16) & 0xFF] ^
              detail::Crc32Table[2][(crc >> 8) & 0xFF] ^ detail::Crc32Table[3][crc & 0xFF];
        buffer += 4;
    }
    if (length & 2) {
        uint16_t half;
        std::memcpy(&half, buffer, sizeof(half));
        crc ^= half;
        crc = (crc >> 16) ^ detail::Crc32Table[0][(crc >> 8) & 0xFF] ^ detail::Crc32Table[1][crc & 0xFF];
        buffer += 2;
    }
    if (length & 1) {
        crc = (crc >> 8) ^ detail::Crc32Table[0][(crc & 0xFF) ^ *buffer];
    }
    return ~crc;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment