Created
January 11, 2023 21:22
-
-
Save ltakens/caf01cd3018b8187b35687c84a245725 to your computer and use it in GitHub Desktop.
Permissive Python email regex allowing some common obfuscations. E.g. "(at)" and "(dot)".
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Here are some keywords one might use to find this email regex: | |
| - Python extract email addresses using regular expression | |
| - Python extract email from text | |
| - How to extract email addresses from a block of text using Python | |
| """ | |
| import doctest | |
| import re | |
# Permissive matcher for e-mail addresses that also accepts the common
# "(at)" / "[at]" and "(dot)" / "[dot]" obfuscations.
#
# Group 1 captures the whole address exactly as written in the text; the
# remaining groups exist only to express alternation/repetition.
#
# NOTE: inside a character class a hyphen must be first or last to be taken
# literally.  The earlier spelling ``\+-_`` was parsed as the range '+'..'_'
# and silently admitted characters such as '<', '=', '>', '?', ':', ';', ','
# and '@' into the local part.
EMAIL_REGEX = re.compile(
    r'''(                                       # start capture full address
        ([a-z0-9.+_-]|\(dot\)|\[dot\])+         # e.g. 'john_91[dot]goodman'
        (@|\(at\)|\[at\])                       # '@' or '(at)' or '[at]'
        [\w-]+                                  # e.g. 'a-subdomain'
        ((\(dot\)|\[dot\]|\.)([a-z_-]+)){1,4}   # e.g. 'domain.com.au'
    )''',                                       # end capture full address
    re.VERBOSE | re.IGNORECASE
)
def find_all_email_addresses(text: str) -> list:
    """Return all email addresses found in ``text``, de-obfuscated.

    Every match of ``EMAIL_REGEX`` is normalised before being returned:
    ``(at)`` / ``[at]`` become ``@`` and ``(dot)`` / ``[dot]`` become ``.``
    (both case-insensitively).  Addresses are returned as a list of strings
    in order of appearance; duplicates are kept.

    >>> find_all_email_addresses(
    ...     '''Hello john.doe@example.com,
    ...     Are you still at john+goodman.is.awesome(at)his-domain.com.au?
    ...     Or have you changed your email format to
    ...     john[at]his(dot)domain.com[dot]au?'''
    ... )
    ['john.doe@example.com', 'john+goodman.is.awesome@his-domain.com.au', 'john@his.domain.com.au']
    """
    results = []
    # findall() yields one tuple per match (the pattern has several groups);
    # the first element is the full address, the rest are discarded.
    for addr, *_ in EMAIL_REGEX.findall(text):
        # Raw strings: '\(' is not a valid str escape and triggers a
        # SyntaxWarning on recent Python versions when written un-raw.
        addr = re.sub(r'\(at\)|\[at\]', '@', addr, flags=re.IGNORECASE)
        addr = re.sub(r'\(dot\)|\[dot\]', '.', addr, flags=re.IGNORECASE)
        results.append(addr)
    return results
# Run the doctests embedded in this module when it is executed as a script.
if __name__ == '__main__':
    doctest.testmod()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment