Last active
September 24, 2025 12:56
-
-
Save skannan-maf/ecc0d124d1b6b98b071150ce08483dab to your computer and use it in GitHub Desktop.
Fuzzy match 2 list of strings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import ngram | |
| import re | |
| def coalesce(*args): | |
| for arg in args: | |
| if arg is not None: | |
| return arg | |
| return None | |
| def remove_non_alphanums(s): | |
| alnums = [] | |
| prev_was_space = False | |
| for ch in s: | |
| if ch.isalnum(): | |
| alnums.append(ch) | |
| prev_was_space = False | |
| else: | |
| if prev_was_space == False: | |
| alnums.append(' ') | |
| prev_was_space = True | |
| r = ''.join(alnums) | |
| return r.strip() | |
| def canonical_form(s, common_words): | |
| l = str.lower(s) | |
| for w in common_words: | |
| w_canon = str.lower(w) | |
| l_onlyalnums = remove_non_alphanums(l) | |
| l_words = l_onlyalnums.split(' ') | |
| #print(l_words) | |
| if w_canon in l_words: | |
| l = ' '.join([word for word in l_words if word != w_canon]) | |
| return l.strip() | |
| def fuzzy_match_list_of_strings( | |
| l1, | |
| l2, | |
| common_words=[], | |
| match_threshold=None, # No threshold | |
| verbose=True | |
| ): | |
| ''' | |
| Accepts 2 lists of strings as arguments and optionally "common_words" as 3rd argument. | |
| Reports case-insensitive fuzzy matches in a dictionary! | |
| The dictionary will provide 2-way mappings (i.e. K1 -> K2 and K2 -> K1 both exist in the dict) | |
| For those that did not match, report them in 2 separate lists | |
| First list that contains unmatched strings l1 | |
| Second for l2 | |
| Words in "common_words" argument are ignored while matching | |
| NOTE: | |
| All matches are case-insensitive | |
| ''' | |
| ngrams_N = 3 | |
| string_to_be_debugged = 'MISSION IMPOSSIBLE 8' | |
| new_l1 = [canonical_form(l, common_words) for l in l1] | |
| new_l2 = [canonical_form(l, common_words) for l in l2] | |
| def L1(l): | |
| return l1[new_l1.index(l)] | |
| def L2(l): | |
| return l2[new_l2.index(l)] | |
| # Compare items in l1 with l2 and produce matching dict | |
| def compare(l1, l2): | |
| d = {} | |
| for l in l1: | |
| best_l2_match = None | |
| best_l2_match_score = 0 | |
| for c in l2: | |
| this_score = ngram.NGram.compare(l, c, N=ngrams_N) | |
| if (l == canonical_form(string_to_be_debugged, common_words)) and (this_score >= 0.30): | |
| print('Matching {} With {} score = {}'.format(l, c, this_score)) | |
| if (this_score > best_l2_match_score) and (this_score > coalesce(match_threshold, 0)): | |
| best_l2_match = c | |
| best_l2_match_score = this_score | |
| if best_l2_match is not None: | |
| d[l] = best_l2_match | |
| if l == canonical_form(string_to_be_debugged, common_words): | |
| print('Best match for {} = {}'.format(string_to_be_debugged, best_l2_match)) | |
| return d | |
| l1_dict = compare(new_l1, new_l2) # Best matching string in L2 for L1 | |
| l2_dict = compare(new_l2, new_l1) # Best matching string in L1 for L2 | |
| # Compile results | |
| match_dict = {} | |
| unmatched_l1 = [] | |
| unmatched_l2 = [] | |
| for l in new_l1: | |
| found_match = False | |
| if l in l1_dict: | |
| # then l1_dict[1] has to be in l2_dict because of nGram similarity is commutative | |
| if l2_dict[l1_dict[l]] == l: | |
| # Best match from both sides; Made for each other | |
| score = ngram.NGram.compare(l, l1_dict[l], N=ngrams_N) | |
| match_dict[L1(l)] = {'match': L2(l1_dict[l]), 'score': score} | |
| found_match = True | |
| else: | |
| if verbose: | |
| print(f'{L1(l)} in L1 matched {L2(l1_dict[l])} in L2 but it matched {L1(l2_dict[l1_dict[l]])}') | |
| if found_match == False: | |
| unmatched_l1.append(L1(l)) | |
| for l in new_l2: | |
| found_match = False | |
| if l in l2_dict: | |
| # then l2_dict[1] has to be in l1_dict because of nGram similarity is commutative | |
| if l1_dict[l2_dict[l]] == l: | |
| # Best match from both sides; Made for each other | |
| score = ngram.NGram.compare(l, l2_dict[l], N=ngrams_N) | |
| match_dict[L2(l)] = {'match': L1(l2_dict[l]), 'score': score} | |
| found_match = True | |
| else: | |
| if verbose: | |
| print(f'{L2(l)} in L2 matched {L1(l2_dict[l])} in L1 but it matched {L2(l1_dict[l2_dict[l]])}') | |
| if found_match == False: | |
| unmatched_l2.append(L2(l)) | |
| return match_dict, unmatched_l1, unmatched_l2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment