Created
November 29, 2023 01:42
-
-
Save SchrodingerZhu/778a7952a35a1fc3692650b2d9fd6686 to your computer and use it in GitHub Desktop.
SMHasher Patch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| 7db446a0b8d2e29cd648fb5bf4224db9aed30905 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| diff --git a/Hashes.h b/Hashes.h | |
| index 0071a6b..59bd2c4 100644 | |
| --- a/Hashes.h | |
| +++ b/Hashes.h | |
| @@ -1374,3 +1374,173 @@ void khashv64_test ( const void *key, int len, uint32_t seed, void *out); | |
| extern PolymurHashParams g_polymurhashparams; | |
| void polymur_seed_init (size_t &seed); | |
| void polymur_test ( const void *key, int len, uint32_t seed, void *out); | |
| + | |
| +#include <cstddef> | |
| +#include <cstdint> | |
| +namespace llvm_libc_hash { | |
| + //===-- Portable string hash function ---------------------------*- C++ -*-===// | |
| +// | |
| +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |
| +// See https://llvm.org/LICENSE.txt for license information. | |
| +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |
| +// | |
| +//===----------------------------------------------------------------------===// | |
| + | |
| + | |
| +#define LIBC_INLINE inline | |
| +#define LIBC_INLINE_VAR inline | |
| +using UInt128 = __uint128_t; | |
| + | |
| +// Folded multiplication. | |
| +// This function multiplies two 64-bit integers and xor the high and | |
| +// low 64-bit parts of the result. | |
| +LIBC_INLINE static uint64_t folded_multiply(uint64_t x, uint64_t y) { | |
| + UInt128 mask = static_cast<UInt128>(0xffffffffffffffff); | |
| + UInt128 p = static_cast<UInt128>(x) * static_cast<UInt128>(y); | |
| + uint64_t low = static_cast<uint64_t>(p & mask); | |
| + uint64_t high = static_cast<uint64_t>(p >> 64); | |
| + return low ^ high; | |
| +} | |
| + | |
| +// Read as little endian. | |
| +// Shift-and-or implementation does not give a satisfactory code on aarch64. | |
| +// Therefore, we use a union to read the value. | |
| +template <typename T> | |
| +LIBC_INLINE static T read_little_endian(const void *ptr) { | |
| + const uint8_t *bytes = static_cast<const uint8_t *>(ptr); | |
| + union { | |
| + T value; | |
| + uint8_t buffer[sizeof(T)]; | |
| + } data; | |
| + #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ | |
| + for (size_t i = 0; i < sizeof(T); ++i) { | |
| + data.buffer[i] = bytes[sizeof(T) - i - 1]; | |
| + } | |
| + #else | |
| + for (size_t i = 0; i < sizeof(T); ++i) { | |
| + data.buffer[i] = bytes[i]; | |
| + } | |
| + #endif | |
| + return data.value; | |
| +} | |
| + | |
| + | |
| +// Specialized read functions for small values. size must be <= 8. | |
| +LIBC_INLINE static void read_small_values(const void *ptr, size_t size, | |
| + uint64_t &low, uint64_t &high) { | |
| + const uint8_t *bytes = static_cast<const uint8_t *>(ptr); | |
| + if (size >= 2) { | |
| + if (size >= 4) { | |
| + low = static_cast<uint64_t>(read_little_endian<uint32_t>(&bytes[0])); | |
| + high = static_cast<uint64_t>(read_little_endian<uint32_t>(&bytes[size - 4])); | |
| + } else { | |
| + low = static_cast<uint64_t>(read_little_endian<uint16_t>(&bytes[0])); | |
| + high = static_cast<uint64_t>(bytes[size - 1]); | |
| + } | |
| + } else { | |
| + if (size > 0) { | |
| + low = static_cast<uint64_t>(bytes[0]); | |
| + high = static_cast<uint64_t>(bytes[0]); | |
| + } else { | |
| + low = 0; | |
| + high = 0; | |
| + } | |
| + } | |
| +} | |
| + | |
| +// This constant comes from Kunth's prng (it empirically works well). | |
| +LIBC_INLINE_VAR static constexpr uint64_t MULTIPLE = 6364136223846793005; | |
| +// Rotation amount for mixing. | |
| +LIBC_INLINE_VAR static constexpr uint64_t ROTATE = 23; | |
| + | |
| +// Randomly generated values (for now, it uses the same values from aHash). | |
| +LIBC_INLINE_VAR static constexpr uint64_t RANDOMNESS[2][4] = { | |
| + 0x243f6a8885a308d3, | |
| + 0x13198a2e03707344, | |
| + 0xa4093822299f31d0, | |
| + 0x082efa98ec4e6c89, | |
| + 0x452821e638d01377, | |
| + 0xbe5466cf34e90c6c, | |
| + 0xc0ac29b7c97c50dd, | |
| + 0x3f84d5b5b5470917, | |
| +}; | |
| + | |
| + | |
| +LIBC_INLINE static uint64_t rotate_left(uint64_t x, uint64_t y) { | |
| + return (x << y) | (x >> (64 - y)); | |
| +} | |
| + | |
| +// This is a portable string hasher. It is not cryptographically secure. | |
| +// The implementation is extracted from the generic routine of aHash. | |
| +class HashState { | |
| + uint64_t buffer; | |
| + uint64_t pad; | |
| + uint64_t extra_keys[2]; | |
| + LIBC_INLINE void update(uint64_t low, uint64_t high) { | |
| + uint64_t combined = | |
| + folded_multiply(low ^ extra_keys[0], high ^ extra_keys[1]); | |
| + buffer = (buffer + pad) ^ combined; | |
| + buffer = rotate_left(buffer, ROTATE); | |
| + } | |
| + | |
| + LIBC_INLINE HashState( | |
| + uint64_t a, uint64_t b, uint64_t c, uint64_t d | |
| + ) : buffer(a), pad(b), extra_keys{c, d} { | |
| + } | |
| + | |
| + LIBC_INLINE static uint64_t mix(uint64_t seed) { | |
| + HashState mixer{RANDOMNESS[0][0], RANDOMNESS[0][1], RANDOMNESS[0][2], | |
| + RANDOMNESS[0][3]}; | |
| + mixer.update(seed, 0); | |
| + return mixer.finish(); | |
| + } | |
| + | |
| +public: | |
| + LIBC_INLINE HashState(uint64_t seed) { | |
| + // Mix one more round of the seed to make it | |
| + uint64_t mixed = mix(seed); | |
| + buffer = RANDOMNESS[1][0] ^ mixed; | |
| + pad = RANDOMNESS[1][1] ^ mixed; | |
| + extra_keys[0] = RANDOMNESS[1][2] ^ mixed; | |
| + extra_keys[1] = RANDOMNESS[1][3] ^ mixed; | |
| + } | |
| + LIBC_INLINE void update(const void *ptr, size_t size) { | |
| + uint8_t const *bytes = static_cast<const uint8_t *>(ptr); | |
| + buffer = (buffer + size) * MULTIPLE; | |
| + uint64_t low, high; | |
| + if (size > 8) { | |
| + if (size > 16) { | |
| + // update tail | |
| + low = read_little_endian<uint64_t>(&bytes[size - 16]); | |
| + high = read_little_endian<uint64_t>(&bytes[size - 8]); | |
| + update(low, high); | |
| + while (size > 16) { | |
| + low = read_little_endian<uint64_t>(&bytes[0]); | |
| + high = read_little_endian<uint64_t>(&bytes[8]); | |
| + update(low, high); | |
| + bytes += 16; | |
| + size -= 16; | |
| + } | |
| + } else { | |
| + low = read_little_endian<uint64_t>(&bytes[0]); | |
| + high = read_little_endian<uint64_t>(&bytes[size - 8]); | |
| + update(low, high); | |
| + } | |
| + } else { | |
| + read_small_values(ptr, size, low, high); | |
| + update(low, high); | |
| + } | |
| + } | |
| + LIBC_INLINE uint64_t finish() { | |
| + uint64_t rot = buffer & 63; | |
| + uint64_t folded = folded_multiply(buffer, pad); | |
| + return rotate_left(folded, rot); | |
| + } | |
| +}; | |
| +} | |
| + | |
| +static inline void llvm_libc_hash64 ( const void *key, int len, uint32_t seed, void *out) { | |
| + llvm_libc_hash::HashState state (seed); | |
| + state.update (key, len); | |
| + *(uint64_t*)out = state.finish(); | |
| +} | |
| diff --git a/PMP_Multilinear.h b/PMP_Multilinear.h | |
| index 9d24e73..42fdc5d 100644 | |
| --- a/PMP_Multilinear.h | |
| +++ b/PMP_Multilinear.h | |
| @@ -1417,8 +1417,8 @@ class PMP_Multilinear_Hasher | |
| lowProduct.HighPart = midProduct.LowPart; | |
| uint32_t hiProduct = c_ctr.HighPart * prevConstTerm.HighPart + midProduct.HighPart; | |
| - constTerm.QuadPart += lowProduct.QuadPart; | |
| - ctr += hiProduct + ( constTerm.QuadPart < lowProduct.QuadPart ); | |
| + constTerm.QuadPart += lowProduct.QuadPart; | |
| + ctr.QuadPart += hiProduct + ( constTerm.QuadPart < lowProduct.QuadPart ); | |
| /* for ( uint32_t i=0; i<PMPML_CHUNK_SIZE; i+=8 ) | |
| { | |
| diff --git a/main.cpp b/main.cpp | |
| index eaaa0bd..d2029f4 100644 | |
| --- a/main.cpp | |
| +++ b/main.cpp | |
| @@ -88,6 +88,7 @@ const char* quality_str[3] = { "SKIP", "POOR", "GOOD" }; | |
| // marked with !! are known bad seeds, which either hash to 0 or create collisions. | |
| HashInfo g_hashes[] = | |
| { | |
| + { llvm_libc_hash64, 64, 0x00000000, "llvm_libc_hash64", "llvm_libc_hash 64bit", GOOD, {} }, | |
| // first the bad hash funcs, failing tests: | |
| { DoNothingHash, 32, 0x0, "donothing32", "Do-Nothing function (measure call overhead)", SKIP, {0UL} /* !! */ }, | |
| { DoNothingHash, 64, 0x0, "donothing64", "Do-Nothing function (measure call overhead)", SKIP, {0ULL} /* !! */ }, |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment