Last active
June 12, 2025 04:36
-
-
Save notsatvrn/580ff92892f61b8a0d913725bcac89f1 to your computer and use it in GitHub Desktop.
microhash: a hash function optimized for small strings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const std = @import("std"); | |
| const seed: u64 = 0x3243F6A8885A308D; // digits of pi | |
| // folded multiply with a faster 32-bit path | |
| // borrowed from foldhash and modified a bit | |
| inline fn mix(x: u64, y: u64) u64 { | |
| const builtin = @import("builtin"); | |
| const target = builtin.target; | |
| const cpu = builtin.cpu; | |
| // sparc64 and wasm64 do not have 128-bit widening multiplication | |
| // x86-64 and aarch64 should have it regardless of abi, but abi may reduce ptr bit width | |
| // Zig 0.14.1 doesn't have wide arithmetic in the wasm feature set, add a check for that eventually | |
| const wide_mul = (target.ptrBitWidth() == 64 and cpu.arch != .sparc64 and cpu.arch != .wasm64) or | |
| (cpu.arch == .x86_64 or cpu.arch.isAARCH64()); | |
| if (wide_mul) { | |
| const full = @as(u128, x) * @as(u128, y); | |
| const lo: u64 = @truncate(full); | |
| const hi: u64 = @truncate(full >> 64); | |
| return lo ^ hi; | |
| } | |
| // we don't need a super accurate approximation, so we do half the work here that foldhash does | |
| // this should still do a good job of mixing bits around, but it's a lot faster | |
| const lx: u32 = @truncate(x); | |
| const ly: u32 = @truncate(y); | |
| const hx: u32 = @truncate(x >> 32); | |
| const hy: u32 = @truncate(y >> 32); | |
| const ll = @as(u64, lx) * @as(u64, ly); | |
| const hh = @as(u64, hx) * @as(u64, hy); | |
| return ll ^ hh; | |
| } | |
| // safe readInt by copy to buffer for len <= 8 bytes | |
| inline fn readPartial(ptr: [*]const u8, len: usize) u64 { | |
| var buf: [8]u8 = .{0} ** 8; | |
| @memcpy(buf[0..8].ptr, ptr[0..len]); | |
| return std.mem.readInt(u64, &buf, .little); | |
| } | |
| /// Hash function optimized for small strings. | |
| pub fn microhash(input: []const u8) u64 { | |
| var ptr = input.ptr; | |
| var len = input.len; | |
| var out = seed; | |
| while (len > 16) : (len -= 16) { | |
| const x = std.mem.readInt(u64, @ptrCast(ptr), .little); | |
| const y = std.mem.readInt(u64, @ptrCast(ptr + 8), .little); | |
| out = mix(out ^ x, y); | |
| out = std.math.rotr(u64, out, 23); | |
| ptr += 16; | |
| } | |
| if (len > 8) { | |
| const x = std.mem.readInt(u64, @ptrCast(ptr), .little); | |
| const y = readPartial(ptr + 8, len - 8); | |
| out = mix(out ^ x, y); | |
| } else if (len > 0) { | |
| const x = readPartial(ptr, len); | |
| out = mix(out, x); | |
| } | |
| return out; | |
| } |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This algorithm is mainly inspired by foldhash and rustc-hash. I wanted a quick hash function for strings that I knew would be small, as I wouldn't have to worry so much about quality and could maximize performance. I think I succeeded in my goal.
I haven't run this through SMhasher or anything and I'm sure the quality is terrible. If you need a quality hash function, consider the former two mentioned or rapidhash. If you're more worried about performance instead, you'll be pleasantly surprised to know that this algorithm is anywhere from 1.5-2x faster than rapidhash for strings <= 64 bytes in size.