Skip to content

Instantly share code, notes, and snippets.

@matthewdeanmartin
Created November 13, 2025 13:43
Show Gist options
  • Select an option

  • Save matthewdeanmartin/e1570c4e589476f6b77c7b00cbe5bd10 to your computer and use it in GitHub Desktop.

Select an option

Save matthewdeanmartin/e1570c4e589476f6b77c7b00cbe5bd10 to your computer and use it in GitHub Desktop.
import re
import unicodedata
def strip_accents(s: str) -> str:
    """Return *s* with its nonspacing combining marks removed.

    The string is first put into canonical decomposed form (NFD), which
    splits a precomposed letter such as "ï" into the base letter "i"
    followed by a combining diaeresis.  Every character whose Unicode
    general category is "Mn" is then discarded.  The result is not
    recomposed afterwards.
    """
    decomposed = unicodedata.normalize("NFD", s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == "Mn":
            # Nonspacing mark (accent, diaeresis, ...): drop it.
            continue
        kept.append(ch)
    return "".join(kept)
def _fold_accents_with_offsets(s: str) -> tuple[str, list[int]]:
    """Strip accents from *s* and record where each kept character came from.

    Returns the folded string together with a list ``origin`` holding, for
    each folded character, the index in *s* of the character that produced
    it, followed by one sentinel entry equal to ``len(s)``.
    """
    folded = []
    origin = []
    for index, char in enumerate(s):
        # Decompose one source character at a time so that every surviving
        # code point can be traced back to the index it came from.
        for piece in unicodedata.normalize("NFD", char):
            if unicodedata.category(piece) != "Mn":
                folded.append(piece)
                origin.append(index)
    origin.append(len(s))
    return "".join(folded), origin


def findall_accent_insensitive(pattern: str, text: str) -> list[str]:
    """Find regex matches of *pattern* in *text*, ignoring accents.

    Both the pattern and the text have their nonspacing combining marks
    (Unicode category "Mn") removed after NFD decomposition before matching.
    Each match is returned as the corresponding slice of the *original*
    text, accents included.

    Match positions are translated from the folded text back to the original
    text through an index map, so the result is correct even when folding
    changes the string length (already-decomposed input such as "e" +
    U+0301, or characters that expand under NFD).

    Args:
        pattern: Regular expression source; accents in it are stripped.
        text: The text to search.

    Returns:
        The matched substrings of *text*, in order of occurrence.  An empty
        match yields an empty string.

    Raises:
        re.error: If the accent-stripped pattern is not a valid regex.
    """
    norm_text, origin = _fold_accents_with_offsets(text)
    norm_pattern, _ = _fold_accents_with_offsets(pattern)
    regex = re.compile(norm_pattern)

    found = []
    for m in regex.finditer(norm_text):
        start, end = m.span()
        if start == end:
            found.append("")
            continue
        first = origin[start]
        # origin[end] is where the next folded character begins, so slicing
        # up to it keeps any combining marks that trail the last matched
        # character.  The max() covers a match that stops part-way through
        # the decomposition of a single original character.
        last = max(origin[end], origin[end - 1] + 1)
        found.append(text[first:last])
    return found
# Demo: search a sentence containing accented letters with an unaccented pattern.
txt = "j'ai croisé raïssa hier"
print(findall_accent_insensitive(pattern="rai", text=txt))
# -> ['raï']
# ref: https://elk.zone/mastodon.social/@[email protected]/115541819305981530
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment