Skip to content

Instantly share code, notes, and snippets.

@iomoath
Created March 5, 2025 17:46
Show Gist options
  • Select an option

  • Save iomoath/33091d9478d626ab067924703b8ba992 to your computer and use it in GitHub Desktop.

Select an option

Save iomoath/33091d9478d626ab067924703b8ba992 to your computer and use it in GitHub Desktop.
SimHash implementation C#
// https://asecuritysite.com/hashsim/go_simhash
// The Charikar similarity method [1] is often used for documents and metadata in order to located duplicates or clustered objects https://dl.acm.org/doi/pdf/10.1145/509907.509965 A value of zero means no differences, and the higher the simularity value, the more there are changes.
private static void TestSimHash()
{
var text1 = "this is the first string";
var text2 = "this is the second string";
var fs1 = new WordFeatureSet(text1);
var fs2 = new WordFeatureSet(text2);
var hash1 = SimhashUtil.SimhashFromFeatureSet(fs1);
var hash2 = SimhashUtil.SimhashFromFeatureSet(fs2);
Console.WriteLine($"Simhash of `{text1}`: {hash1:x16}");
Console.WriteLine($"Simhash of `{text2}`: {hash2:x16}");
var distance = SimhashUtil.Compare(hash1, hash2);
Console.WriteLine($"Comparison of `{text1}` and `{text2}`: {distance}");
// Optionally, you can test shingling.
var words = text1.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
var shingles = Shingler.Shingle(2, words);
Console.WriteLine("2-shingles for text1:");
foreach (var shingle in shingles)
{
Console.WriteLine(shingle);
}
}
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
// original implementation: https://github.com/mfonda/simhash/blob/master/simhash.go
// IFeature corresponds to the Go Feature interface.
public interface IFeature
{
ulong Sum();
int Weight { get; }
}
// IFeatureSet corresponds to the Go FeatureSet interface.
public interface IFeatureSet
{
IEnumerable<IFeature> GetFeatures();
}
public static class SimhashUtil
{
public const int VectorSize = 64;
/// <summary>
/// Given a set of features, creates a 64-dimensional vector.
/// For each feature, each bit contributes its weight to the corresponding index.
/// </summary>
public static int[] Vectorize(IEnumerable<IFeature> features)
{
int[] vector = new int[VectorSize];
foreach (var feature in features)
{
ulong sum = feature.Sum();
int weight = feature.Weight;
for (int i = 0; i < VectorSize; i++)
{
ulong bit = (sum >> i) & 1UL;
if (bit == 1UL)
vector[i] += weight;
else
vector[i] -= weight;
}
}
return vector;
}
/// <summary>
/// Given a set of byte arrays (each treated as a feature with even weight), creates a 64-dimensional vector.
/// </summary>
public static int[] VectorizeBytes(IEnumerable<byte[]> features)
{
int[] vector = new int[VectorSize];
foreach (var feature in features)
{
ulong sum = FnvHash64(feature);
for (int i = 0; i < VectorSize; i++)
{
ulong bit = (sum >> i) & 1UL;
if (bit == 1UL)
vector[i]++;
else
vector[i]--;
}
}
return vector;
}
/// <summary>
/// Converts the vector into a 64-bit fingerprint.
/// For each component, if the value is non-negative, the corresponding bit is set.
/// </summary>
public static ulong Fingerprint(int[] vector)
{
ulong f = 0;
for (int i = 0; i < VectorSize; i++)
{
if (vector[i] >= 0)
f |= (1UL << i);
}
return f;
}
/// <summary>
/// Computes the 64-bit simhash for a given feature set.
/// </summary>
public static ulong SimhashFromFeatureSet(IFeatureSet fs)
{
return Fingerprint(Vectorize(fs.GetFeatures()));
}
/// <summary>
/// Computes the 64-bit simhash for a given collection of byte arrays.
/// </summary>
public static ulong SimhashFromBytes(IEnumerable<byte[]> bytes)
{
return Fingerprint(VectorizeBytes(bytes));
}
/// <summary>
/// Computes the Hamming distance between two 64-bit numbers using the Kernighan method.
/// </summary>
public static byte Compare(ulong a, ulong b)
{
ulong v = a ^ b;
byte count = 0;
while (v != 0)
{
count++;
v &= v - 1;
}
return count;
}
/// <summary>
/// Computes the FNV-1 64-bit hash for the given byte array.
/// </summary>
public static ulong FnvHash64(byte[] data)
{
const ulong offsetBasis = 14695981039346656037;
const ulong prime = 1099511628211;
ulong hash = offsetBasis;
foreach (byte b in data)
{
hash *= prime;
hash ^= b;
}
return hash;
}
}
/// <summary>
/// Implementation of IFeature. Each Feature stores a 64-bit hash (computed using FNV-1) and a weight.
/// </summary>
public class Feature : IFeature
{
private readonly ulong _sum;
public int Weight { get; private set; }
public Feature(byte[] data, int weight = 1)
{
_sum = SimhashUtil.FnvHash64(data);
Weight = weight;
}
public ulong Sum()
{
return _sum;
}
}
/// <summary>
/// A word-based feature set where each word (as matched by a regex) is treated as a feature.
/// The input text is converted to lower-case.
/// </summary>
public class WordFeatureSet : IFeatureSet
{
private readonly string _text;
// Matches words and optional URLs (similar to Go boundaries regex).
private static readonly Regex Boundaries = new Regex(@"[\w']+(?:\://[\w\./]+){0,1}", RegexOptions.Compiled);
public WordFeatureSet(string text)
{
// Normalize by converting to lower-case.
this._text = text.ToLowerInvariant();
}
public IEnumerable<IFeature> GetFeatures()
{
var features = new List<IFeature>();
foreach (Match match in Boundaries.Matches(_text))
{
string word = match.Value;
byte[] bytes = Encoding.UTF8.GetBytes(word);
features.Add(new Feature(bytes));
}
return features;
}
}
/// <summary>
/// A Unicode-aware word feature set which normalizes text using a specified Unicode normalization form.
/// </summary>
public class UnicodeWordFeatureSet : IFeatureSet
{
private readonly string _text;
// Regex for Unicode words. Note: In .NET, \p{L} represents any letter.
private static readonly Regex UnicodeBoundaries = new Regex(@"[\p{L}\-_']+", RegexOptions.Compiled);
public UnicodeWordFeatureSet(string text, NormalizationForm normForm)
{
// Normalize and convert to lower-case.
this._text = text.Normalize(normForm).ToLowerInvariant();
}
public IEnumerable<IFeature> GetFeatures()
{
var features = new List<IFeature>();
foreach (Match match in UnicodeBoundaries.Matches(_text))
{
string word = match.Value;
byte[] bytes = Encoding.UTF8.GetBytes(word);
features.Add(new Feature(bytes));
}
return features;
}
}
/// <summary>
/// Utility class for computing w-shingles (contiguous word groups).
/// </summary>
public static class Shingler
{
/// <summary>
/// Returns the w-shingling of the given array of words.
/// For example, for w = 2 and input {"this", "is", "a", "test"},
/// it returns {"this is", "is a", "a test"}.
/// </summary>
public static string[] Shingle(int w, string[] words)
{
if (w < 1)
throw new ArgumentException("w must be a positive integer", nameof(w));
if (w == 1)
return words;
if (w > words.Length)
w = words.Length;
int count = words.Length - w + 1;
string[] shingles = new string[count];
for (int i = 0; i < count; i++)
{
shingles[i] = string.Join(" ", words, i, w);
}
return shingles;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment