Skip to content

Instantly share code, notes, and snippets.

@delannoy
Last active November 16, 2021 19:20
Show Gist options
  • Select an option

  • Save delannoy/08059a2b9e18202a1c039027867dff60 to your computer and use it in GitHub Desktop.

Select an option

Save delannoy/08059a2b9e18202a1c039027867dff60 to your computer and use it in GitHub Desktop.
parse Wikipedia's list of common misspellings
#!/usr/bin/env python3
from lxml.html import fromstring
from pandas import concat, DataFrame, Series
from requests import get
from string import ascii_uppercase
def parseListOfCommonMisspellings(char:str) -> DataFrame:
    '''Scrape and parse one page of [Wikipedia's list of common misspellings](https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings) and return it as a pandas.DataFrame

    Parameters:
        char: page suffix appended to the list URL, e.g. 'A' or '0-9'.

    Returns:
        DataFrame with a `misspelling` column plus one `correctSpellings_<n>` column per
        alternative correction (split on commas, semicolons, or literal ' or ').
        An empty DataFrame is returned when the page yields no usable entries.

    Raises:
        requests.HTTPError: the page answered with an HTTP error status.
        requests.Timeout: the server did not answer within the timeout.
    '''
    url = f'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/{char}'
    response = get(url, timeout=30) # requests waits indefinitely unless a timeout is given
    response.raise_for_status() # fail loudly instead of silently parsing an error page
    eTree = fromstring(response.content)
    # entries are rendered as 'misspelling (correction[, correction ...])', so only list items ending with ')' are candidates
    items = Series([item for li in eTree.xpath('//li') if (item := li.text_content().strip()).endswith(')')])
    if items.empty:
        return DataFrame()
    items = items.str.extract(pat=r'(?P<misspelling>.*?) \((?P<correctSpellings>.*?)\)', expand=True)
    items = items.dropna() # candidates which do not match the pattern come back as NaN and would break the boolean mask below
    # drop corrections which are variants/synonyms or alternative/acceptable spellings
    notAMisspelling = 'variant of|acceptable synonym|alternative spelling|acceptable spelling'
    items = items[~items['correctSpellings'].str.contains(notAMisspelling)].reset_index(drop=True)
    if items.empty: # nothing left to split; the column lookup below would raise KeyError
        return DataFrame()
    items['correctSpellings'] = items['correctSpellings'].str.replace(pat=r'\[.*?\]', repl='', regex=True) # drop strings enclosed in square brackets, e.g. '[plural]'
    # split multiple correct spellings (separated by commas, semicolons, or literal ' or ') into separate columns
    correctSpellings = items['correctSpellings'].str.split('[,;]| or ', expand=True).apply(lambda x: x.str.strip())
    correctSpellings = correctSpellings.rename(columns={col: f'correctSpellings_{col}' for col in correctSpellings.columns})
    # keep a row only when the first correct spelling has at most two more (space-separated) words than the misspelling
    correctWordCount = correctSpellings['correctSpellings_0'].str.split(' ').str.len()
    misspellingWordCount = items['misspelling'].str.split(' ').str.len()
    keep = correctWordCount - misspellingWordCount <= 2
    return concat([items['misspelling'][keep], correctSpellings[keep]], axis=1).apply(lambda x: x.str.strip()) # strip all leading/trailing whitespace
def main():
    '''Scrape every page of the list ('0-9' and 'A' to 'Z') and return the entries stacked into a single pandas.DataFrame

    The label of each page is printed before it is fetched, as a progress indicator.
    Row indices are kept as returned for each page (they are not renumbered).
    '''
    # start with an empty frame so that concat always receives at least one DataFrame
    frames = [DataFrame()]
    for char in ['0-9', *ascii_uppercase]:
        print(char)
        frames.append(parseListOfCommonMisspellings(char))
    # concatenate once at the end: re-concatenating the accumulated frame on every iteration copies all previous rows each time
    return concat(frames)
if __name__ == '__main__':
    # script entry point: build the combined table and print it via DataFrame.to_string()
    listOfCommonMisspellings = main()
    print(listOfCommonMisspellings.to_string())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment