skannan-maf · September 24, 2025 12:56
diff --git a/fuzzy_match_list_of_strings.py b/fuzzy_match_list_of_strings.py
 import ngram
 import re

 def coalesce(*args):
    for arg in args:
        if arg is not None:
            return arg
    return None

 def remove_non_alphanums(s):
    alnums = []
    prev_was_space = False
    for ch in s:
        if ch.isalnum():
            alnums.append(ch)
            prev_was_space = False
        else:
            if prev_was_space == False:
                alnums.append(' ')
                prev_was_space = True
    r = ''.join(alnums)
    return r.strip()
            

 def canonical_form(s, common_words):
    l = str.lower(s)
    for w in common_words:
        w_canon = str.lower(w)
        l_onlyalnums = remove_non_alphanums(l)
        l_words = l_onlyalnums.split(' ')
        #print(l_words)
        if w_canon in l_words:
            l = ' '.join([word for word in l_words if word != w_canon])
    return l.strip()
    
 def fuzzy_match_list_of_strings(
        l1, 
        l2, 
        common_words=[],
        match_threshold=None, # No threshold
        verbose=True
    ):
    '''
    Accepts 2 lists of strings as arguments and optionally "common_words" as 3rd argument.
    Reports case-insensitive fuzzy matches in a dictionary!
    The dictionary will provide 2-way mappings (i.e. K1 -> K2 and K2 -> K1 both exist in the dict)
    For those that did not match, report them in 2 separate lists
      First list that contains unmatched strings l1
      Second for l2
    Words in "common_words" argument are ignored while matching
    NOTE:
    All matches are case-insensitive
    '''
    ngrams_N = 3
    
    string_to_be_debugged = 'MISSION IMPOSSIBLE 8'
    
    new_l1 = [canonical_form(l, common_words) for l in  l1]
    new_l2 = [canonical_form(l, common_words) for l in l2]

    def L1(l):
        return l1[new_l1.index(l)]
    def L2(l):
        return l2[new_l2.index(l)]
    
   # Compare items in l1 with l2 and produce matching dict 
    def compare(l1, l2):
        d = {}
        for l in l1:
            best_l2_match = None
            best_l2_match_score = 0
            for c in l2:
                this_score = ngram.NGram.compare(l, c, N=ngrams_N)
                if (l == canonical_form(string_to_be_debugged, common_words)) and (this_score >= 0.30):
                    print('Matching {} With {} score = {}'.format(l, c, this_score))
                if (this_score > best_l2_match_score) and (this_score > coalesce(match_threshold, 0)):
                    best_l2_match = c
                    best_l2_match_score = this_score
            if best_l2_match is not None:
                d[l] = best_l2_match
                if l == canonical_form(string_to_be_debugged, common_words):
                    print('Best match for {} = {}'.format(string_to_be_debugged, best_l2_match))
        return d
    
    l1_dict = compare(new_l1, new_l2) # Best matching string in L2 for L1
    l2_dict = compare(new_l2, new_l1) # Best matching string in L1 for L2

    # Compile results
    
    match_dict = {}
    unmatched_l1 = []
    unmatched_l2 = []
    for l in new_l1:
        found_match = False
        if l in l1_dict:
            # then l1_dict[1] has to be in l2_dict because of nGram similarity is commutative
            if l2_dict[l1_dict[l]] == l:
                # Best match from both sides; Made for each other
                score = ngram.NGram.compare(l, l1_dict[l], N=ngrams_N)
                match_dict[L1(l)] = {'match': L2(l1_dict[l]), 'score': score}
                found_match = True
            else:
                if verbose:
                    print(f'{L1(l)} in L1 matched {L2(l1_dict[l])} in L2 but it matched {L1(l2_dict[l1_dict[l]])}')
        if found_match == False:
            unmatched_l1.append(L1(l))
    
    for l in new_l2:
        found_match = False
        if l in l2_dict:
            # then l2_dict[1] has to be in l1_dict because of nGram similarity is commutative
            if l1_dict[l2_dict[l]] == l:
                # Best match from both sides; Made for each other
                score = ngram.NGram.compare(l, l2_dict[l], N=ngrams_N)
                match_dict[L2(l)] = {'match': L1(l2_dict[l]), 'score': score}
                found_match = True
            else:
                if verbose:
                    print(f'{L2(l)} in L2 matched {L1(l2_dict[l])} in L1 but it matched {L2(l1_dict[l2_dict[l]])}')
        if found_match == False:
            unmatched_l2.append(L2(l))

    return match_dict, unmatched_l1, unmatched_l2
	import ngram
	import re

	def coalesce(*args):
	for arg in args:
	if arg is not None:
	return arg
	return None

	def remove_non_alphanums(s):
	alnums = []
	prev_was_space = False
	for ch in s:
	if ch.isalnum():
	alnums.append(ch)
	prev_was_space = False
	else:
	if prev_was_space == False:
	alnums.append(' ')
	prev_was_space = True
	r = ''.join(alnums)
	return r.strip()


	def canonical_form(s, common_words):
	l = str.lower(s)
	for w in common_words:
	w_canon = str.lower(w)
	l_onlyalnums = remove_non_alphanums(l)
	l_words = l_onlyalnums.split(' ')
	#print(l_words)
	if w_canon in l_words:
	l = ' '.join([word for word in l_words if word != w_canon])
	return l.strip()

	def fuzzy_match_list_of_strings(
	l1,
	l2,
	common_words=[],
	match_threshold=None, # No threshold
	verbose=True
	):
	'''
	Accepts 2 lists of strings as arguments and optionally "common_words" as 3rd argument.
	Reports case-insensitive fuzzy matches in a dictionary!
	The dictionary will provide 2-way mappings (i.e. K1 -> K2 and K2 -> K1 both exist in the dict)
	For those that did not match, report them in 2 separate lists
	First list that contains unmatched strings l1
	Second for l2
	Words in "common_words" argument are ignored while matching
	NOTE:
	All matches are case-insensitive
	'''
	ngrams_N = 3

	string_to_be_debugged = 'MISSION IMPOSSIBLE 8'

	new_l1 = [canonical_form(l, common_words) for l in l1]
	new_l2 = [canonical_form(l, common_words) for l in l2]

	def L1(l):
	return l1[new_l1.index(l)]
	def L2(l):
	return l2[new_l2.index(l)]

	# Compare items in l1 with l2 and produce matching dict
	def compare(l1, l2):
	d = {}
	for l in l1:
	best_l2_match = None
	best_l2_match_score = 0
	for c in l2:
	this_score = ngram.NGram.compare(l, c, N=ngrams_N)
	if (l == canonical_form(string_to_be_debugged, common_words)) and (this_score >= 0.30):
	print('Matching {} With {} score = {}'.format(l, c, this_score))
	if (this_score > best_l2_match_score) and (this_score > coalesce(match_threshold, 0)):
	best_l2_match = c
	best_l2_match_score = this_score
	if best_l2_match is not None:
	d[l] = best_l2_match
	if l == canonical_form(string_to_be_debugged, common_words):
	print('Best match for {} = {}'.format(string_to_be_debugged, best_l2_match))
	return d

	l1_dict = compare(new_l1, new_l2) # Best matching string in L2 for L1
	l2_dict = compare(new_l2, new_l1) # Best matching string in L1 for L2

	# Compile results

	match_dict = {}
	unmatched_l1 = []
	unmatched_l2 = []
	for l in new_l1:
	found_match = False
	if l in l1_dict:
	# then l1_dict[1] has to be in l2_dict because of nGram similarity is commutative
	if l2_dict[l1_dict[l]] == l:
	# Best match from both sides; Made for each other
	score = ngram.NGram.compare(l, l1_dict[l], N=ngrams_N)
	match_dict[L1(l)] = {'match': L2(l1_dict[l]), 'score': score}
	found_match = True
	else:
	if verbose:
	print(f'{L1(l)} in L1 matched {L2(l1_dict[l])} in L2 but it matched {L1(l2_dict[l1_dict[l]])}')
	if found_match == False:
	unmatched_l1.append(L1(l))

	for l in new_l2:
	found_match = False
	if l in l2_dict:
	# then l2_dict[1] has to be in l1_dict because of nGram similarity is commutative
	if l1_dict[l2_dict[l]] == l:
	# Best match from both sides; Made for each other
	score = ngram.NGram.compare(l, l2_dict[l], N=ngrams_N)
	match_dict[L2(l)] = {'match': L1(l2_dict[l]), 'score': score}
	found_match = True
	else:
	if verbose:
	print(f'{L2(l)} in L2 matched {L1(l2_dict[l])} in L1 but it matched {L2(l1_dict[l2_dict[l]])}')
	if found_match == False:
	unmatched_l2.append(L2(l))

	return match_dict, unmatched_l1, unmatched_l2
No results found