Last active
April 16, 2024 10:45
-
-
Save WiwilZ/667c08ae4373508cf4a016dd1e027310 to your computer and use it in GitHub Desktop.
Crc32 function optimized with SIMD Instrction
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /******************generate bit-reflected constants******************** | |
| constexpr uint64_t Polynomial = 0x1db710641 ; // bit reverse for significant bits of 0x104c11db7 | |
| template <bool GetQuotient = false> | |
| constexpr uint64_t Generate(uint64_t deg) { | |
| uint64_t quot = 0; | |
| uint64_t crc = 2; | |
| for (size_t i = 0; i < deg - 31; i++) { | |
| crc >>= 1; | |
| if (crc & 1) { | |
| crc ^= Polynomial; | |
| quot |= 1ull << i; | |
| } | |
| } | |
| return GetQuotient ? quot : crc; | |
| } | |
| int main(int argc, char** argv) { | |
| // for avx512 | |
| for (auto i : {4 * 512 + 32, 4 * 512 - 32, 512 + 32, 512 - 32}) { | |
| std::cout << std::format("0x{:09x}\n", Generate(i)); | |
| } | |
| std::cout << "\n\n"; | |
| //for sse | |
| for (auto i : {4 * 128 + 32, 4 * 128 - 32, 128 + 32, 128 - 32}) { | |
| std::cout << std::format("0x{:09x}\n", Generate(i)); | |
| } | |
| std::cout << "\n\n"; | |
| // for commom | |
| std::cout << std::format("0x{:09x}\n", Generate(64)) | |
| << std::format("0x{:09x}\n", Generate(32)) | |
| << std::format("0x{:09x}\n", Generate<true>(64)); | |
| } | |
| */ | |
| #if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64) && !defined(_M_ARM64EC)) | |
| #define __SSE4_2__ | |
| #define __PCLMUL__ | |
| #endif | |
| #include <cstddef> | |
| #include <cstdint> | |
| #if defined(__AVX512F__) || defined(__SSE4_2__) && defined(__PCLMUL__) | |
| # if defined(_MSC_VER) && !defined(__clang__) | |
| #include <intrin.h> | |
| # else | |
| #include <smmintrin.h> | |
| #include <wmmintrin.h> | |
| #ifdef __AVX512F__ | |
| #include <immintrin.h> | |
| #endif | |
| # endif | |
| namespace detail { | |
| __m128i FoldTo128Bits(__m128i& x1, const __m128i x2, const __m128i x3, const __m128i x4) noexcept { | |
| const __m128i x0 = _mm_set_epi64x(0x0ccaa009e, 0x1751997d0); | |
| __m128i x5; | |
| x5 = _mm_clmulepi64_si128(x1, x0, 0x00); | |
| x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | |
| #ifdef __AVX512VL__ | |
| x1 = _mm_ternarylogic_epi64(x1, x2, x5, 0x96); | |
| #else | |
| x1 = _mm_xor_si128(_mm_xor_si128(x1, x2), x5); | |
| #endif | |
| x5 = _mm_clmulepi64_si128(x1, x0, 0x00); | |
| x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | |
| #ifdef __AVX512VL__ | |
| x1 = _mm_ternarylogic_epi64(x1, x3, x5, 0x96); | |
| #else | |
| x1 = _mm_xor_si128(_mm_xor_si128(x1, x3), x5); | |
| #endif | |
| x5 = _mm_clmulepi64_si128(x1, x0, 0x00); | |
| x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | |
| #ifdef __AVX512VL__ | |
| x1 = _mm_ternarylogic_epi64(x1, x4, x5, 0x96); | |
| #else | |
| x1 = _mm_xor_si128(_mm_xor_si128(x1, x4), x5); | |
| #endif | |
| return x0; | |
| } | |
| uint32_t Fold128BitsToResult(__m128i x0, __m128i x1) noexcept { | |
| __m128i x2, x3; | |
| // Fold 128-bits to 64-bits | |
| x2 = _mm_clmulepi64_si128(x1, x0, 0x10); | |
| x3 = _mm_setr_epi32(-1, 0, -1, 0); | |
| x1 = _mm_srli_si128(x1, 8); | |
| x1 = _mm_xor_si128(x1, x2); | |
| x0 = _mm_cvtsi64_si128(0x163cd6124); | |
| x2 = _mm_srli_si128(x1, 4); | |
| x1 = _mm_and_si128(x1, x3); | |
| x1 = _mm_clmulepi64_si128(x1, x0, 0x00); | |
| x1 = _mm_xor_si128(x1, x2); | |
| // Barret reduce to 32-bits | |
| x0 = _mm_set_epi64x(0x1f7011641, 0x1db710641); | |
| x2 = _mm_and_si128(x1, x3); | |
| x2 = _mm_clmulepi64_si128(x2, x0, 0x10); | |
| x2 = _mm_and_si128(x2, x3); | |
| x2 = _mm_clmulepi64_si128(x2, x0, 0x00); | |
| x1 = _mm_xor_si128(x1, x2); | |
| return _mm_extract_epi32(x1, 1); | |
| } | |
| template <size_t NumBytes> | |
| uint32_t Crc32(const void* data, size_t length, uint32_t crc = 0) noexcept; | |
| #ifdef __AVX512F__ | |
| template <> | |
| uint32_t Crc32<256>(const void* data, size_t length, uint32_t crc) noexcept { | |
| auto buffer = static_cast<const __m512i*>(data); | |
| // There's at least one block of 256 | |
| __m512i x1 = _mm512_xor_si512(_mm512_loadu_si512(buffer + 0), _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); | |
| __m512i x2 = _mm512_loadu_si512(buffer + 1); | |
| __m512i x3 = _mm512_loadu_si512(buffer + 2); | |
| __m512i x4 = _mm512_loadu_si512(buffer + 3); | |
| buffer += 4; | |
| length -= 256; | |
| __m512i x0 = _mm512_set_epi64(0x1322d1430, 0x11542778a, 0x1322d1430, 0x11542778a, | |
| 0x1322d1430, 0x11542778a, 0x1322d1430, 0x11542778a); | |
| __m512i x5, x6, x7, x8; | |
| // Parallel fold blocks of 256 | |
| while (length >= 256) { | |
| x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); | |
| x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); | |
| x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); | |
| x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); | |
| x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); | |
| x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); | |
| x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); | |
| x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); | |
| x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_loadu_si512(buffer + 0), 0x96); | |
| x2 = _mm512_ternarylogic_epi64(x2, x6, _mm512_loadu_si512(buffer + 1), 0x96); | |
| x3 = _mm512_ternarylogic_epi64(x3, x7, _mm512_loadu_si512(buffer + 2), 0x96); | |
| x4 = _mm512_ternarylogic_epi64(x4, x8, _mm512_loadu_si512(buffer + 3), 0x96); | |
| buffer += 4; | |
| length -= 256; | |
| } | |
| // Fold into 512-bits | |
| x0 = _mm512_set_epi64(0x1c6e41596, 0x154442bd4, 0x1c6e41596, 0x154442bd4, | |
| 0x1c6e41596, 0x154442bd4, 0x1c6e41596, 0x154442bd4); | |
| x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); | |
| x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); | |
| x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96); | |
| x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); | |
| x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); | |
| x1 = _mm512_ternarylogic_epi64(x1, x3, x5, 0x96); | |
| x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); | |
| x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); | |
| x1 = _mm512_ternarylogic_epi64(x1, x4, x5, 0x96); | |
| // Single fold blocks of 64 | |
| while (length >= 64) { | |
| x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); | |
| x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); | |
| x1 = _mm512_ternarylogic_epi64(x1, _mm512_loadu_si512(buffer), x5, 0x96); | |
| ++buffer; | |
| length -= 64; | |
| } | |
| // Fold 512-bits to 128-bits | |
| __m128i a1 = _mm512_extracti32x4_epi32(x1, 0); | |
| const __m128i a2 = _mm512_extracti32x4_epi32(x1, 1); | |
| const __m128i a3 = _mm512_extracti32x4_epi32(x1, 2); | |
| const __m128i a4 = _mm512_extracti32x4_epi32(x1, 3); | |
| const __m128i a0 = FoldTo128Bits(a1, a2, a3, a4); | |
| return Fold128BitsToResult(a0, a1); | |
| } | |
| #endif // defined(__AVX512F__) | |
| template <> | |
| uint32_t Crc32<64>(const void* data, size_t length, uint32_t crc) noexcept { | |
| auto buffer = static_cast<const __m128i*>(data); | |
| // There's at least one block of 64 | |
| __m128i x1 = _mm_xor_si128(_mm_loadu_si128(buffer + 0), _mm_cvtsi32_si128(crc)); | |
| __m128i x2 = _mm_loadu_si128(buffer + 1); | |
| __m128i x3 = _mm_loadu_si128(buffer + 2); | |
| __m128i x4 = _mm_loadu_si128(buffer + 3); | |
| buffer += 4; | |
| length -= 64; | |
| __m128i x0 = _mm_set_epi64x(0x1c6e41596, 0x154442bd4); | |
| __m128i x5, x6, x7, x8; | |
| // Parallel fold blocks of 64 | |
| while (length >= 64) { | |
| x5 = _mm_clmulepi64_si128(x1, x0, 0x00); | |
| x6 = _mm_clmulepi64_si128(x2, x0, 0x00); | |
| x7 = _mm_clmulepi64_si128(x3, x0, 0x00); | |
| x8 = _mm_clmulepi64_si128(x4, x0, 0x00); | |
| x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | |
| x2 = _mm_clmulepi64_si128(x2, x0, 0x11); | |
| x3 = _mm_clmulepi64_si128(x3, x0, 0x11); | |
| x4 = _mm_clmulepi64_si128(x4, x0, 0x11); | |
| #ifdef __AVX512VL__ | |
| x1 = _mm_ternarylogic_epi64(x1, x5, _mm_loadu_si128(buffer + 0), 0x96); | |
| x2 = _mm_ternarylogic_epi64(x2, x6, _mm_loadu_si128(buffer + 1), 0x96); | |
| x3 = _mm_ternarylogic_epi64(x3, x7, _mm_loadu_si128(buffer + 2), 0x96); | |
| x4 = _mm_ternarylogic_epi64(x4, x8, _mm_loadu_si128(buffer + 3), 0x96); | |
| #else | |
| x1 = _mm_xor_si128(_mm_xor_si128(x1, x5), _mm_loadu_si128(buffer + 0)); | |
| x2 = _mm_xor_si128(_mm_xor_si128(x2, x6), _mm_loadu_si128(buffer + 1)); | |
| x3 = _mm_xor_si128(_mm_xor_si128(x3, x7), _mm_loadu_si128(buffer + 2)); | |
| x4 = _mm_xor_si128(_mm_xor_si128(x4, x8), _mm_loadu_si128(buffer + 3)); | |
| #endif | |
| buffer += 4; | |
| length -= 64; | |
| } | |
| // Fold into 128-bits | |
| x0 = FoldTo128Bits(x1, x2, x3, x4); | |
| // Single fold blocks of 16 | |
| while (length >= 16) { | |
| x5 = _mm_clmulepi64_si128(x1, x0, 0x00); | |
| x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | |
| #ifdef __AVX512VL__ | |
| x1 = _mm_ternarylogic_epi64(x1, _mm_loadu_si128(buffer), x5, 0x96); | |
| #else | |
| x1 = _mm_xor_si128(_mm_xor_si128(x1, _mm_loadu_si128(buffer)), x5); | |
| #endif | |
| ++buffer; | |
| length -= 16; | |
| } | |
| return Fold128BitsToResult(x0, x1); | |
| } | |
| template <> | |
| uint32_t Crc32<16>(const void* data, size_t length, uint32_t crc) noexcept { | |
| auto buffer = static_cast<const __m128i*>(data); | |
| // There's at least one block of 16 | |
| __m128i x1 = _mm_xor_si128(_mm_loadu_si128(buffer++), _mm_cvtsi32_si128(crc)); | |
| const __m128i x0 = _mm_set_epi64x(0x0ccaa009e, 0x1751997d0); | |
| // Single fold blocks of 16 | |
| for (length -= 16; length >= 16; length -= 16) { | |
| const __m128i tmp = _mm_clmulepi64_si128(x1, x0, 0x00); | |
| x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | |
| #ifdef __AVX512VL__ | |
| x1 = _mm_ternarylogic_epi64(x1, _mm_loadu_si128(buffer++), tmp, 0x96); | |
| #else | |
| x1 = _mm_xor_si128(_mm_xor_si128(x1, _mm_loadu_si128(buffer)), tmp); | |
| #endif | |
| } | |
| return Fold128BitsToResult(x0, x1); | |
| } | |
| } // namespace detail | |
| #endif // defined(__AVX512F__) || defined(__SSE4_2__) && defined(__PCLMUL__) | |
| #include <array> | |
| namespace detail { | |
| constexpr auto Crc32Table = [] { | |
| constexpr uint32_t Polynomial = 0xEDB88320; | |
| std::array<std::array<uint32_t, 256>, 8> table{}; | |
| for (uint32_t i = 0; i < table[0].size(); i++) { | |
| uint32_t crc = i; | |
| for (uint32_t j = 0; j < 8; j++) { | |
| crc = (crc >> 1) ^ (-(crc & 1) & Polynomial); | |
| } | |
| table[0][i] = crc; | |
| } | |
| for (uint32_t i = 0; i < table[0].size(); i++) { | |
| uint32_t crc = table[0][i]; | |
| for (uint32_t j = 1; j < table.size(); j++) { | |
| crc = (crc >> 8) ^ table[0][crc & 0xFF]; | |
| table[j][i] = crc; | |
| } | |
| } | |
| return table; | |
| }(); | |
| } | |
| uint32_t Crc32(const void* const data, size_t length, uint32_t crc = 0) noexcept { | |
| auto buffer = static_cast<const uint8_t*>(data); | |
| crc = ~crc; | |
| #if defined(__AVX512F__) || defined(__SSE4_2__) && defined(__PCLMUL__) | |
| #ifdef __AVX512F__ | |
| if (length >= 256) { | |
| const size_t chunkSize = length & ~63; | |
| crc = detail::Crc32<256>(data, chunkSize, crc); | |
| length &= 63; | |
| if (length == 0) { | |
| return ~crc; | |
| } | |
| buffer += chunkSize; | |
| } | |
| #endif | |
| if (length >= 64) { | |
| const size_t chunkSize = length & ~15; | |
| crc = detail::Crc32<64>(data, chunkSize, crc); | |
| length &= 15; | |
| if (length == 0) { | |
| return ~crc; | |
| } | |
| buffer += chunkSize; | |
| } | |
| if (length >= 16) { | |
| const size_t chunkSize = length & ~15; | |
| crc = detail::Crc32<16>(data, chunkSize, crc); | |
| length &= 15; | |
| if (length == 0) { | |
| return ~crc; | |
| } | |
| buffer += chunkSize; | |
| } | |
| if (length & 8) { | |
| const uint64_t x = *reinterpret_cast<const uint64_t*>(buffer) ^ crc; | |
| crc = detail::Crc32Table[0][x >> 56] ^ detail::Crc32Table[1][(x >> 48) & 0xFF] ^ | |
| detail::Crc32Table[2][(x >> 40) & 0xFF] ^ detail::Crc32Table[3][(x >> 32) & 0xFF] ^ | |
| detail::Crc32Table[4][(x >> 24) & 0xFF] ^ detail::Crc32Table[5][(x >> 16) & 0xFF] ^ | |
| detail::Crc32Table[6][(x >> 8) & 0xFF] ^ detail::Crc32Table[7][x & 0xFF]; | |
| buffer += 8; | |
| } | |
| #else | |
| while (length >= 8) { | |
| const uint64_t x = *reinterpret_cast<const uint64_t*>(buffer) ^ crc; | |
| crc = detail::Crc32Table[0][x >> 56] ^ detail::Crc32Table[1][(x >> 48) & 0xFF] ^ | |
| detail::Crc32Table[2][(x >> 40) & 0xFF] ^ detail::Crc32Table[3][(x >> 32) & 0xFF] ^ | |
| detail::Crc32Table[4][(x >> 24) & 0xFF] ^ detail::Crc32Table[5][(x >> 16) & 0xFF] ^ | |
| detail::Crc32Table[6][(x >> 8) & 0xFF] ^ detail::Crc32Table[7][x & 0xFF]; | |
| buffer += 8; | |
| length -= 8; | |
| } | |
| #endif // defined(__AVX512F__) || defined(__SSE4_2__) && defined(__PCLMUL__) | |
| if (length & 4) { | |
| crc ^= *reinterpret_cast<const uint32_t*>(buffer); | |
| crc = detail::Crc32Table[0][crc >> 24] ^ detail::Crc32Table[1][(crc >> 16) & 0xFF] ^ | |
| detail::Crc32Table[2][(crc >> 8) & 0xFF] ^ detail::Crc32Table[3][crc & 0xFF]; | |
| buffer += 4; | |
| } | |
| if (length & 2) { | |
| crc ^= *reinterpret_cast<const uint16_t*>(buffer); | |
| crc = (crc >> 16) ^ detail::Crc32Table[0][(crc >> 8) & 0xFF] ^ detail::Crc32Table[1][crc & 0xFF]; | |
| buffer += 2; | |
| } | |
| if (length & 1) { | |
| crc = (crc >> 8) ^ detail::Crc32Table[0][(crc & 0xFF) ^ *buffer++]; | |
| } | |
| return ~crc; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment