Last active
May 22, 2024 14:12
-
-
Save thetarnav/64df7acfa5f9e655bae758f2b5ace064 to your computer and use it in GitHub Desktop.
Fuzzy Search JS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const MAX_UNICODE = 1114111 | |
| let bit_array = new Int32Array(MAX_UNICODE).fill(~0) | |
| /** | |
| * This algorithm determines if a given short string exists in a larger string with k errors. | |
| * It's useful for search-as-you-type features. | |
| * | |
| * @param {string} text | |
| * @param {string} pattern | |
| * @returns -1 when not found, otherwise the index of the first occurrence of the pattern in the text | |
| */ | |
| function bitap_search(text, pattern) { | |
| let len = pattern.length | |
| let R = ~1 | |
| if (len === 0 || len > text.length) { | |
| return -1 | |
| } | |
| bit_array.fill(~0) // reset | |
| for (let i = 0; i < len; i++) { | |
| bit_array[pattern.charCodeAt(i)] &= ~(1 << i) | |
| } | |
| for (let i = 0; i < text.length; i++) { | |
| R |= bit_array[text.charCodeAt(i)] | |
| R <<= 1 | |
| if ((R & (1 << len)) === 0) { | |
| return i - len + 1 | |
| } | |
| } | |
| return -1 | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| The JavaScript implementation of version v0.2.0 of the __fts_fuzzy_match__ algorithm by Forrest Smith, | |
| updated by nrgwsth, to include the "exhaustive search" functionality | |
| described in the 2017 update to the _Reverse Engineering Sublime Text’s Fuzzy Match_ article. | |
| Copied from https://github.com/tajmone/fuzzy-search/blob/master/fts_fuzzy_match/0.2.0/js | |
| */ | |
| const BONUS_SEQUENTIAL = 15 // bonus for adjacent matches | |
| const BONUS_SEPARATOR = 30 // bonus if match occurs after a separator | |
| const BONUS_CAMEL = 30 // bonus if match is uppercase and prev is lower | |
| const BONUS_FIRST_LETTER = 15 // bonus if the first letter is matched | |
| const PENALTY_LEADING_LETTER = -5 // penalty applied for every letter in str before the first match | |
| const PENALTY_MAX_LEADING_LETTER = -15 // maximum penalty for leading letters | |
| const PENALTY_UNMATCHED_LETTER = -1 | |
| /** | |
| * Returns true if each character in pattern is found sequentially within str | |
| * | |
| * @param {string} pattern | |
| * @param {string} str | |
| * @returns {boolean} */ | |
| export function fuzzy_match_simple(pattern, str) { | |
| let pattern_idx = 0 | |
| let str_idx = 0 | |
| while (pattern_idx !== pattern.length && str_idx !== str.length) { | |
| const pattern_char = pattern.charAt(pattern_idx).toLowerCase() | |
| const str_char = str .charAt(str_idx) .toLowerCase() | |
| if (pattern_char === str_char) { | |
| pattern_idx += 1 | |
| } | |
| str_idx += 1 | |
| } | |
| return pattern.length !== 0 && str.length !== 0 && pattern_idx === pattern.length | |
| } | |
| /** | |
| * Does a fuzzy search to find pattern inside a string. | |
| * | |
| * @param {string} pattern pattern to search for | |
| * @param {string} str string which is being searched | |
| * @returns {[boolean, number]} a boolean which tells if pattern was | |
| * found or not and a search score | |
| */ | |
| export function fuzzy_match(pattern, str) { | |
| return _fuzzy_match( | |
| pattern, | |
| str, | |
| 0, /* pattern_cur_index */ | |
| 0, /* str_curr_index */ | |
| null, /* src_matces */ | |
| [], /* matches */ | |
| 256, /* max_matches */ | |
| 0, /* next_match */ | |
| 0, /* recursion_count */ | |
| 10, /* recursion_limit */ | |
| ); | |
| } | |
| /** | |
| * @param {string } pattern | |
| * @param {string } str | |
| * @param {number } pattern_cur_index | |
| * @param {number } str_curr_index | |
| * @param {number[] | null } src_matces | |
| * @param {number[] } matches | |
| * @param {number } max_matches | |
| * @param {number } next_match | |
| * @param {number } recursion_count | |
| * @param {number } recursion_limit | |
| * @returns {[boolean, number]} */ | |
| function _fuzzy_match( | |
| pattern, | |
| str, | |
| pattern_cur_index, | |
| str_curr_index, | |
| src_matces, | |
| matches, | |
| max_matches, | |
| next_match, | |
| recursion_count, | |
| recursion_limit, | |
| ) { | |
| let out_score = 0 | |
| // Return if recursion limit is reached. | |
| if (++recursion_count >= recursion_limit) { | |
| return [false, out_score] | |
| } | |
| // Return if we reached ends of strings. | |
| if (pattern_cur_index === pattern.length || str_curr_index === str.length) { | |
| return [false, out_score] | |
| } | |
| // Recursion params | |
| let recursive_match = false | |
| let best_recursive_matches = [] | |
| let best_recursive_score = 0 | |
| // Loop through pattern and str looking for a match. | |
| let first_match = true | |
| while (pattern_cur_index < pattern.length && str_curr_index < str.length) { | |
| // Match found. | |
| if ( | |
| pattern[pattern_cur_index].toLowerCase() === str[str_curr_index].toLowerCase() | |
| ) { | |
| if (next_match >= max_matches) { | |
| return [false, out_score] | |
| } | |
| if (first_match && src_matces) { | |
| matches = [...src_matces] | |
| first_match = false | |
| } | |
| let recursive_matches = [] | |
| const [matched, recursiveScore] = _fuzzy_match( | |
| pattern, | |
| str, | |
| pattern_cur_index, | |
| str_curr_index + 1, | |
| matches, | |
| recursive_matches, | |
| max_matches, | |
| next_match, | |
| recursion_count, | |
| recursion_limit | |
| ) | |
| if (matched) { | |
| // Pick best recursive score. | |
| if (!recursive_match || recursiveScore > best_recursive_score) { | |
| best_recursive_matches = [...recursive_matches] | |
| best_recursive_score = recursiveScore | |
| } | |
| recursive_match = true | |
| } | |
| matches[next_match++] = str_curr_index | |
| ++pattern_cur_index | |
| } | |
| ++str_curr_index | |
| } | |
| const matched = pattern_cur_index === pattern.length | |
| if (matched) { | |
| out_score = 100 | |
| // Apply leading letter penalty | |
| let penalty = PENALTY_LEADING_LETTER * matches[0] | |
| penalty = penalty < PENALTY_MAX_LEADING_LETTER | |
| ? PENALTY_MAX_LEADING_LETTER | |
| : penalty | |
| out_score += penalty | |
| // Apply unmatched penalty | |
| out_score += PENALTY_UNMATCHED_LETTER * (str.length - next_match) | |
| // Apply ordering bonuses | |
| for (let i = 0; i < next_match; i++) { | |
| const curr_idx = matches[i] | |
| if (i > 0) { | |
| const prev_idx = matches[i - 1] | |
| if (curr_idx == prev_idx + 1) { | |
| out_score += BONUS_SEQUENTIAL | |
| } | |
| } | |
| // Check for bonuses based on neighbor character value. | |
| if (curr_idx > 0) { | |
| // Camel case | |
| const neighbor = str[curr_idx - 1] | |
| const curr = str[curr_idx] | |
| if ( | |
| neighbor === neighbor.toLowerCase() && | |
| curr === curr.toUpperCase() | |
| ) { | |
| out_score += BONUS_CAMEL | |
| } | |
| // is neighbor separator | |
| if (neighbor == "_" || neighbor == " ") { | |
| out_score += BONUS_SEPARATOR | |
| } | |
| } else { | |
| // First letter | |
| out_score += BONUS_FIRST_LETTER | |
| } | |
| } | |
| // Return best result | |
| if (recursive_match && (!matched || best_recursive_score > out_score)) { | |
| // Recursive score is better than "this" | |
| matches = [...best_recursive_matches] | |
| out_score = best_recursive_score | |
| return [true, out_score] | |
| } else if (matched) { | |
| // "this" score is better than recursive | |
| return [true, out_score] | |
| } else { | |
| return [false, out_score] | |
| } | |
| } | |
| return [false, out_score] | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * calculates the minimum number of single-character edits \ | |
| * (insertions, deletions, or substitutions) \ | |
| * required to change one word into another. \ | |
| * It's a good choice for spell checking or auto-correct features. | |
| * | |
| * @param {string} a | |
| * @param {string} b | |
| * @returns {number} the Levenshtein distance between the two strings | |
| * - 0 if the strings are equal | |
| * - a positive integer if the strings are different | |
| */ | |
| function levenshtein_distance(a, b) { | |
| let mat = new Array(b.length + 1) | |
| for (let i = 0; i <= b.length; i += 1) { | |
| mat[i] = new Array(a.length + 1) | |
| mat[i][0] = i | |
| } | |
| for (let j = 0; j <= a.length; j += 1) { | |
| mat[0][j] = j | |
| } | |
| for (let i = 1; i <= b.length; i += 1) { | |
| for (let j = 1; j <= a.length; j += 1) { | |
| if (b[i-1] == a[j-1]) { | |
| mat[i][j] = mat[i-1][j-1] | |
| } else { | |
| mat[i][j] = Math.min( | |
| mat[i-1][j-1] + 1, // substitution | |
| mat[i ][j-1] + 1, // insertion | |
| mat[i-1][j ] + 1, // deletion | |
| ) | |
| } | |
| } | |
| } | |
| return mat[b.length][a.length] | |
| } | |
| /** | |
| * @param {string } query | |
| * @param {string[]} strings | |
| * @param {number } max_results | |
| * @returns indexes array, pointing to the strings array | |
| */ | |
| export function search_by_levenshtein_distance(query, strings, max_results) { | |
| /** @type {number[]} */ let indexes = new Array(max_results) | |
| /** @type {number[]} */ let scores = new Array(max_results) | |
| let len = 0 | |
| strings_loop: for (let str_idx = 0; str_idx < strings.length; str_idx += 1) { | |
| let string = strings[str_idx] | |
| let score = levenshtein_distance(string, query) | |
| for (let i = 0; i < len; i += 1) { | |
| if (score < scores[i]) { | |
| // shift to the right | |
| for (let j = len; j > i; j -= 1) { | |
| indexes[j] = indexes[j-1] | |
| scores [j] = scores [j-1] | |
| } | |
| scores [i] = score | |
| indexes[i] = str_idx | |
| continue strings_loop | |
| } | |
| } | |
| if (len < max_results) { | |
| scores [len] = score | |
| indexes[len] = str_idx | |
| len += 1 | |
| } | |
| } | |
| if (len < max_results) { | |
| indexes = indexes.slice(0, len) | |
| } | |
| return indexes | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment