Last active
November 16, 2021 19:20
-
-
Save delannoy/08059a2b9e18202a1c039027867dff60 to your computer and use it in GitHub Desktop.
parse Wikipedia's list of common misspellings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| from lxml.html import fromstring | |
| from pandas import concat, DataFrame, Series | |
| from requests import get | |
| from string import ascii_uppercase | |
def parseListOfCommonMisspellings(char:str) -> DataFrame:
    '''Scrape and parse one page of [Wikipedia's list of common misspellings](https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings) and return it as a pandas.DataFrame.

    Args:
        char: suffix appended to the list URL to select a sub-page (the caller in this file passes '0-9' and 'A'..'Z').

    Returns:
        A DataFrame with a `misspelling` column plus one `correctSpellings_<n>` column per alternative
        correction found on the page. When the page yields no usable entries, an empty DataFrame with
        the columns `misspelling` and `correctSpellings_0` is returned (never None).

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched (including on timeout).
    '''
    url = f'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/{char}'
    # a timeout keeps an unresponsive server from hanging the script indefinitely
    eTree = fromstring(get(url, timeout=30).content)
    # only list items whose text ends with ')' can carry a parenthesised correction
    entries = [text for li in eTree.xpath('//li') if (text := li.text_content().strip()).endswith(')')]
    noEntries = DataFrame(columns=['misspelling', 'correctSpellings_0'])
    if not entries:
        return noEntries
    # split each entry of the form '<misspelling> (<corrections>)' into two columns; entries that do not match yield NaN
    items = Series(entries).str.extract(pat=r'(?P<misspelling>.*?) \((?P<correctSpellings>.*?)\)', expand=True)
    # drop corrections which are variants/synonyms or alternative/acceptable spellings;
    # na=True also marks entries the pattern above failed to parse, so they are discarded instead of breaking `~`
    notACorrection = items.correctSpellings.str.contains('variant of|acceptable synonym|alternative spelling|acceptable spelling', na=True)
    items = items[~notACorrection].reset_index(drop=True)
    if items.empty:
        return noEntries
    # drop strings enclosed in square brackets, e.g. '[plural]'
    corrections = items.correctSpellings.str.replace(pat=r'\[.*?\]', repl='', regex=True)
    # split multiple correct spellings (separated by commas, semicolons, or literal ' or ') into separate columns
    correctSpellings = corrections.str.split('[,;]| or ', expand=True).apply(lambda x: x.str.strip())
    correctSpellings = correctSpellings.rename(columns={col: f'correctSpellings_{col}' for col in correctSpellings.columns})
    # keep an entry only when its first correction has at most two words more than the misspelling;
    # longer "corrections" are treated as explanatory text rather than spellings
    extraWords = correctSpellings['correctSpellings_0'].str.split(' ').str.len() - items.misspelling.str.split(' ').str.len()
    keep = extraWords <= 2
    # strip all leading/trailing whitespace
    return concat([items.misspelling[keep], correctSpellings[keep]], axis=1).apply(lambda x: x.str.strip())
def main():
    '''Parse every sub-page ('0-9' and 'A' through 'Z') and return all entries as a single pandas.DataFrame.

    The name of each sub-page is printed before it is fetched, as a progress indicator.
    An empty DataFrame is returned when no sub-page yields any result.
    '''
    pages = []
    for char in ('0-9', *ascii_uppercase):
        print(char)
        page = parseListOfCommonMisspellings(char)
        if page is not None:  # tolerate a parser that returns nothing for an empty page
            pages.append(page)
    # concatenate once at the end: calling concat inside the loop re-copies the accumulated rows on every iteration
    return concat(pages) if pages else DataFrame()
if __name__ == '__main__':
    # script entry point: build the combined table, then print it via to_string()
    allMisspellings = main()
    print(allMisspellings.to_string())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment