Skip to content

Instantly share code, notes, and snippets.

@thetarnav
Last active May 22, 2024 14:12
Show Gist options
  • Select an option

  • Save thetarnav/64df7acfa5f9e655bae758f2b5ace064 to your computer and use it in GitHub Desktop.

Select an option

Save thetarnav/64df7acfa5f9e655bae758f2b5ace064 to your computer and use it in GitHub Desktop.
Fuzzy Search JS
const MAX_UNICODE = 1114111
let bit_array = new Int32Array(MAX_UNICODE).fill(~0)
/**
* This algorithm determines if a given short string exists in a larger string with k errors.
* It's useful for search-as-you-type features.
*
* @param {string} text
* @param {string} pattern
* @returns -1 when not found, otherwise the index of the first occurrence of the pattern in the text
*/
function bitap_search(text, pattern) {
let len = pattern.length
let R = ~1
if (len === 0 || len > text.length) {
return -1
}
bit_array.fill(~0) // reset
for (let i = 0; i < len; i++) {
bit_array[pattern.charCodeAt(i)] &= ~(1 << i)
}
for (let i = 0; i < text.length; i++) {
R |= bit_array[text.charCodeAt(i)]
R <<= 1
if ((R & (1 << len)) === 0) {
return i - len + 1
}
}
return -1
}
/*
The JavaScript implementation of version v0.2.0 of the __fts_fuzzy_match__ algorithm by Forrest Smith,
updated by nrgwsth, to include the "exhaustive search" functionality
described in the 2017 update to the _Reverse Engineering Sublime Text’s Fuzzy Match_ article.
Copied from https://github.com/tajmone/fuzzy-search/blob/master/fts_fuzzy_match/0.2.0/js
*/
const BONUS_SEQUENTIAL = 15 // bonus for adjacent matches
const BONUS_SEPARATOR = 30 // bonus if match occurs after a separator
const BONUS_CAMEL = 30 // bonus if match is uppercase and prev is lower
const BONUS_FIRST_LETTER = 15 // bonus if the first letter is matched
const PENALTY_LEADING_LETTER = -5 // penalty applied for every letter in str before the first match
const PENALTY_MAX_LEADING_LETTER = -15 // maximum penalty for leading letters
const PENALTY_UNMATCHED_LETTER = -1
/**
* Returns true if each character in pattern is found sequentially within str
*
* @param {string} pattern
* @param {string} str
* @returns {boolean} */
export function fuzzy_match_simple(pattern, str) {
let pattern_idx = 0
let str_idx = 0
while (pattern_idx !== pattern.length && str_idx !== str.length) {
const pattern_char = pattern.charAt(pattern_idx).toLowerCase()
const str_char = str .charAt(str_idx) .toLowerCase()
if (pattern_char === str_char) {
pattern_idx += 1
}
str_idx += 1
}
return pattern.length !== 0 && str.length !== 0 && pattern_idx === pattern.length
}
/**
* Does a fuzzy search to find pattern inside a string.
*
* @param {string} pattern pattern to search for
* @param {string} str string which is being searched
* @returns {[boolean, number]} a boolean which tells if pattern was
* found or not and a search score
*/
export function fuzzy_match(pattern, str) {
return _fuzzy_match(
pattern,
str,
0, /* pattern_cur_index */
0, /* str_curr_index */
null, /* src_matces */
[], /* matches */
256, /* max_matches */
0, /* next_match */
0, /* recursion_count */
10, /* recursion_limit */
);
}
/**
* @param {string } pattern
* @param {string } str
* @param {number } pattern_cur_index
* @param {number } str_curr_index
* @param {number[] | null } src_matces
* @param {number[] } matches
* @param {number } max_matches
* @param {number } next_match
* @param {number } recursion_count
* @param {number } recursion_limit
* @returns {[boolean, number]} */
function _fuzzy_match(
pattern,
str,
pattern_cur_index,
str_curr_index,
src_matces,
matches,
max_matches,
next_match,
recursion_count,
recursion_limit,
) {
let out_score = 0
// Return if recursion limit is reached.
if (++recursion_count >= recursion_limit) {
return [false, out_score]
}
// Return if we reached ends of strings.
if (pattern_cur_index === pattern.length || str_curr_index === str.length) {
return [false, out_score]
}
// Recursion params
let recursive_match = false
let best_recursive_matches = []
let best_recursive_score = 0
// Loop through pattern and str looking for a match.
let first_match = true
while (pattern_cur_index < pattern.length && str_curr_index < str.length) {
// Match found.
if (
pattern[pattern_cur_index].toLowerCase() === str[str_curr_index].toLowerCase()
) {
if (next_match >= max_matches) {
return [false, out_score]
}
if (first_match && src_matces) {
matches = [...src_matces]
first_match = false
}
let recursive_matches = []
const [matched, recursiveScore] = _fuzzy_match(
pattern,
str,
pattern_cur_index,
str_curr_index + 1,
matches,
recursive_matches,
max_matches,
next_match,
recursion_count,
recursion_limit
)
if (matched) {
// Pick best recursive score.
if (!recursive_match || recursiveScore > best_recursive_score) {
best_recursive_matches = [...recursive_matches]
best_recursive_score = recursiveScore
}
recursive_match = true
}
matches[next_match++] = str_curr_index
++pattern_cur_index
}
++str_curr_index
}
const matched = pattern_cur_index === pattern.length
if (matched) {
out_score = 100
// Apply leading letter penalty
let penalty = PENALTY_LEADING_LETTER * matches[0]
penalty = penalty < PENALTY_MAX_LEADING_LETTER
? PENALTY_MAX_LEADING_LETTER
: penalty
out_score += penalty
// Apply unmatched penalty
out_score += PENALTY_UNMATCHED_LETTER * (str.length - next_match)
// Apply ordering bonuses
for (let i = 0; i < next_match; i++) {
const curr_idx = matches[i]
if (i > 0) {
const prev_idx = matches[i - 1]
if (curr_idx == prev_idx + 1) {
out_score += BONUS_SEQUENTIAL
}
}
// Check for bonuses based on neighbor character value.
if (curr_idx > 0) {
// Camel case
const neighbor = str[curr_idx - 1]
const curr = str[curr_idx]
if (
neighbor === neighbor.toLowerCase() &&
curr === curr.toUpperCase()
) {
out_score += BONUS_CAMEL
}
// is neighbor separator
if (neighbor == "_" || neighbor == " ") {
out_score += BONUS_SEPARATOR
}
} else {
// First letter
out_score += BONUS_FIRST_LETTER
}
}
// Return best result
if (recursive_match && (!matched || best_recursive_score > out_score)) {
// Recursive score is better than "this"
matches = [...best_recursive_matches]
out_score = best_recursive_score
return [true, out_score]
} else if (matched) {
// "this" score is better than recursive
return [true, out_score]
} else {
return [false, out_score]
}
}
return [false, out_score]
}
/**
* calculates the minimum number of single-character edits \
* (insertions, deletions, or substitutions) \
* required to change one word into another. \
* It's a good choice for spell checking or auto-correct features.
*
* @param {string} a
* @param {string} b
* @returns {number} the Levenshtein distance between the two strings
* - 0 if the strings are equal
* - a positive integer if the strings are different
*/
function levenshtein_distance(a, b) {
let mat = new Array(b.length + 1)
for (let i = 0; i <= b.length; i += 1) {
mat[i] = new Array(a.length + 1)
mat[i][0] = i
}
for (let j = 0; j <= a.length; j += 1) {
mat[0][j] = j
}
for (let i = 1; i <= b.length; i += 1) {
for (let j = 1; j <= a.length; j += 1) {
if (b[i-1] == a[j-1]) {
mat[i][j] = mat[i-1][j-1]
} else {
mat[i][j] = Math.min(
mat[i-1][j-1] + 1, // substitution
mat[i ][j-1] + 1, // insertion
mat[i-1][j ] + 1, // deletion
)
}
}
}
return mat[b.length][a.length]
}
/**
* @param {string } query
* @param {string[]} strings
* @param {number } max_results
* @returns indexes array, pointing to the strings array
*/
export function search_by_levenshtein_distance(query, strings, max_results) {
/** @type {number[]} */ let indexes = new Array(max_results)
/** @type {number[]} */ let scores = new Array(max_results)
let len = 0
strings_loop: for (let str_idx = 0; str_idx < strings.length; str_idx += 1) {
let string = strings[str_idx]
let score = levenshtein_distance(string, query)
for (let i = 0; i < len; i += 1) {
if (score < scores[i]) {
// shift to the right
for (let j = len; j > i; j -= 1) {
indexes[j] = indexes[j-1]
scores [j] = scores [j-1]
}
scores [i] = score
indexes[i] = str_idx
continue strings_loop
}
}
if (len < max_results) {
scores [len] = score
indexes[len] = str_idx
len += 1
}
}
if (len < max_results) {
indexes = indexes.slice(0, len)
}
return indexes
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment