Created
March 5, 2025 17:46
-
-
Save iomoath/33091d9478d626ab067924703b8ba992 to your computer and use it in GitHub Desktop.
SimHash implementation C#
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // https://asecuritysite.com/hashsim/go_simhash | |
| // The Charikar similarity method [1] is often used for documents and metadata in order to located duplicates or clustered objects https://dl.acm.org/doi/pdf/10.1145/509907.509965 A value of zero means no differences, and the higher the simularity value, the more there are changes. | |
| private static void TestSimHash() | |
| { | |
| var text1 = "this is the first string"; | |
| var text2 = "this is the second string"; | |
| var fs1 = new WordFeatureSet(text1); | |
| var fs2 = new WordFeatureSet(text2); | |
| var hash1 = SimhashUtil.SimhashFromFeatureSet(fs1); | |
| var hash2 = SimhashUtil.SimhashFromFeatureSet(fs2); | |
| Console.WriteLine($"Simhash of `{text1}`: {hash1:x16}"); | |
| Console.WriteLine($"Simhash of `{text2}`: {hash2:x16}"); | |
| var distance = SimhashUtil.Compare(hash1, hash2); | |
| Console.WriteLine($"Comparison of `{text1}` and `{text2}`: {distance}"); | |
| // Optionally, you can test shingling. | |
| var words = text1.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); | |
| var shingles = Shingler.Shingle(2, words); | |
| Console.WriteLine("2-shingles for text1:"); | |
| foreach (var shingle in shingles) | |
| { | |
| Console.WriteLine(shingle); | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using System; | |
| using System.Collections.Generic; | |
| using System.Text; | |
| using System.Text.RegularExpressions; | |
| // original implementation: https://github.com/mfonda/simhash/blob/master/simhash.go | |
| // IFeature corresponds to the Go Feature interface. | |
| public interface IFeature | |
| { | |
| ulong Sum(); | |
| int Weight { get; } | |
| } | |
| // IFeatureSet corresponds to the Go FeatureSet interface. | |
| public interface IFeatureSet | |
| { | |
| IEnumerable<IFeature> GetFeatures(); | |
| } | |
| public static class SimhashUtil | |
| { | |
| public const int VectorSize = 64; | |
| /// <summary> | |
| /// Given a set of features, creates a 64-dimensional vector. | |
| /// For each feature, each bit contributes its weight to the corresponding index. | |
| /// </summary> | |
| public static int[] Vectorize(IEnumerable<IFeature> features) | |
| { | |
| int[] vector = new int[VectorSize]; | |
| foreach (var feature in features) | |
| { | |
| ulong sum = feature.Sum(); | |
| int weight = feature.Weight; | |
| for (int i = 0; i < VectorSize; i++) | |
| { | |
| ulong bit = (sum >> i) & 1UL; | |
| if (bit == 1UL) | |
| vector[i] += weight; | |
| else | |
| vector[i] -= weight; | |
| } | |
| } | |
| return vector; | |
| } | |
| /// <summary> | |
| /// Given a set of byte arrays (each treated as a feature with even weight), creates a 64-dimensional vector. | |
| /// </summary> | |
| public static int[] VectorizeBytes(IEnumerable<byte[]> features) | |
| { | |
| int[] vector = new int[VectorSize]; | |
| foreach (var feature in features) | |
| { | |
| ulong sum = FnvHash64(feature); | |
| for (int i = 0; i < VectorSize; i++) | |
| { | |
| ulong bit = (sum >> i) & 1UL; | |
| if (bit == 1UL) | |
| vector[i]++; | |
| else | |
| vector[i]--; | |
| } | |
| } | |
| return vector; | |
| } | |
| /// <summary> | |
| /// Converts the vector into a 64-bit fingerprint. | |
| /// For each component, if the value is non-negative, the corresponding bit is set. | |
| /// </summary> | |
| public static ulong Fingerprint(int[] vector) | |
| { | |
| ulong f = 0; | |
| for (int i = 0; i < VectorSize; i++) | |
| { | |
| if (vector[i] >= 0) | |
| f |= (1UL << i); | |
| } | |
| return f; | |
| } | |
| /// <summary> | |
| /// Computes the 64-bit simhash for a given feature set. | |
| /// </summary> | |
| public static ulong SimhashFromFeatureSet(IFeatureSet fs) | |
| { | |
| return Fingerprint(Vectorize(fs.GetFeatures())); | |
| } | |
| /// <summary> | |
| /// Computes the 64-bit simhash for a given collection of byte arrays. | |
| /// </summary> | |
| public static ulong SimhashFromBytes(IEnumerable<byte[]> bytes) | |
| { | |
| return Fingerprint(VectorizeBytes(bytes)); | |
| } | |
| /// <summary> | |
| /// Computes the Hamming distance between two 64-bit numbers using the Kernighan method. | |
| /// </summary> | |
| public static byte Compare(ulong a, ulong b) | |
| { | |
| ulong v = a ^ b; | |
| byte count = 0; | |
| while (v != 0) | |
| { | |
| count++; | |
| v &= v - 1; | |
| } | |
| return count; | |
| } | |
| /// <summary> | |
| /// Computes the FNV-1 64-bit hash for the given byte array. | |
| /// </summary> | |
| public static ulong FnvHash64(byte[] data) | |
| { | |
| const ulong offsetBasis = 14695981039346656037; | |
| const ulong prime = 1099511628211; | |
| ulong hash = offsetBasis; | |
| foreach (byte b in data) | |
| { | |
| hash *= prime; | |
| hash ^= b; | |
| } | |
| return hash; | |
| } | |
| } | |
| /// <summary> | |
| /// Implementation of IFeature. Each Feature stores a 64-bit hash (computed using FNV-1) and a weight. | |
| /// </summary> | |
| public class Feature : IFeature | |
| { | |
| private readonly ulong _sum; | |
| public int Weight { get; private set; } | |
| public Feature(byte[] data, int weight = 1) | |
| { | |
| _sum = SimhashUtil.FnvHash64(data); | |
| Weight = weight; | |
| } | |
| public ulong Sum() | |
| { | |
| return _sum; | |
| } | |
| } | |
| /// <summary> | |
| /// A word-based feature set where each word (as matched by a regex) is treated as a feature. | |
| /// The input text is converted to lower-case. | |
| /// </summary> | |
| public class WordFeatureSet : IFeatureSet | |
| { | |
| private readonly string _text; | |
| // Matches words and optional URLs (similar to Go boundaries regex). | |
| private static readonly Regex Boundaries = new Regex(@"[\w']+(?:\://[\w\./]+){0,1}", RegexOptions.Compiled); | |
| public WordFeatureSet(string text) | |
| { | |
| // Normalize by converting to lower-case. | |
| this._text = text.ToLowerInvariant(); | |
| } | |
| public IEnumerable<IFeature> GetFeatures() | |
| { | |
| var features = new List<IFeature>(); | |
| foreach (Match match in Boundaries.Matches(_text)) | |
| { | |
| string word = match.Value; | |
| byte[] bytes = Encoding.UTF8.GetBytes(word); | |
| features.Add(new Feature(bytes)); | |
| } | |
| return features; | |
| } | |
| } | |
| /// <summary> | |
| /// A Unicode-aware word feature set which normalizes text using a specified Unicode normalization form. | |
| /// </summary> | |
| public class UnicodeWordFeatureSet : IFeatureSet | |
| { | |
| private readonly string _text; | |
| // Regex for Unicode words. Note: In .NET, \p{L} represents any letter. | |
| private static readonly Regex UnicodeBoundaries = new Regex(@"[\p{L}\-_']+", RegexOptions.Compiled); | |
| public UnicodeWordFeatureSet(string text, NormalizationForm normForm) | |
| { | |
| // Normalize and convert to lower-case. | |
| this._text = text.Normalize(normForm).ToLowerInvariant(); | |
| } | |
| public IEnumerable<IFeature> GetFeatures() | |
| { | |
| var features = new List<IFeature>(); | |
| foreach (Match match in UnicodeBoundaries.Matches(_text)) | |
| { | |
| string word = match.Value; | |
| byte[] bytes = Encoding.UTF8.GetBytes(word); | |
| features.Add(new Feature(bytes)); | |
| } | |
| return features; | |
| } | |
| } | |
| /// <summary> | |
| /// Utility class for computing w-shingles (contiguous word groups). | |
| /// </summary> | |
| public static class Shingler | |
| { | |
| /// <summary> | |
| /// Returns the w-shingling of the given array of words. | |
| /// For example, for w = 2 and input {"this", "is", "a", "test"}, | |
| /// it returns {"this is", "is a", "a test"}. | |
| /// </summary> | |
| public static string[] Shingle(int w, string[] words) | |
| { | |
| if (w < 1) | |
| throw new ArgumentException("w must be a positive integer", nameof(w)); | |
| if (w == 1) | |
| return words; | |
| if (w > words.Length) | |
| w = words.Length; | |
| int count = words.Length - w + 1; | |
| string[] shingles = new string[count]; | |
| for (int i = 0; i < count; i++) | |
| { | |
| shingles[i] = string.Join(" ", words, i, w); | |
| } | |
| return shingles; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment