delannoy · November 16, 2021 19:20
diff --git a/AutoCorrect.py b/AutoCorrect.py
 #!/usr/bin/env python3

 from lxml.html import fromstring
 from pandas import concat, DataFrame, Series
 from requests import get
 from string import ascii_uppercase

 def parseListOfCommonMisspellings(char:str, timeout:float=30) -> DataFrame:
     '''Scrape and parse one page of [Wikipedia's list of common misspellings](https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings) and return it as a pandas.DataFrame

     Args:
         char: suffix of the sub-page to fetch; it is interpolated verbatim into the page URL.
         timeout: seconds to wait for the HTTP request before giving up.

     Returns:
         A DataFrame with a `misspelling` column and one `correctSpellings_<n>` column per
         alternative correction. An empty DataFrame is returned when the page yields no usable entries.

     Raises:
         requests.RequestException: if the request times out or the server answers with an HTTP error status.
     '''
     url = f'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/{char}'
     response = get(url, timeout=timeout) # without a timeout a stalled connection would block forever
     response.raise_for_status() # fail loudly instead of parsing an HTTP error page as if it were the list
     eTree = fromstring(response.content)
     # only list items whose text ends with ')' are candidates of the form 'misspelling (correction, ...)'
     items = Series([item for li in eTree.xpath('//li') if (item := li.text_content().strip()).endswith(')')])
     if items.empty:
         return DataFrame()
     items = items.str.extract(pat=r'(?P<misspelling>.*?) \((?P<correctSpellings>.*?)\)', expand=True)
     items = items.dropna() # entries that do not match the pattern become NaN and would break the boolean filter below
     unwanted = 'variant of|acceptable synonym|alternative spelling|acceptable spelling'
     items = items[~items.correctSpellings.str.contains(unwanted)].reset_index(drop=True) # drop corrections which are variants/synonyms or alternative/acceptable spellings
     if items.empty:
         return DataFrame()
     items['correctSpellings'] = items.correctSpellings.str.replace(pat=r'\[.*?\]', repl='', regex=True) # drop strings enclosed in square brackets, e.g. '[plural]'
     correctSpellings = items.correctSpellings.str.split(r'[,;]| or ', expand=True).apply(lambda x: x.str.strip()) # split multiple correct spellings (separated by commas, semicolons, or literal ' or ') into separate columns
     correctSpellings = correctSpellings.add_prefix('correctSpellings_')
     # number of space-separated words by which the first correction exceeds the misspelling
     extraWords = correctSpellings['correctSpellings_0'].str.split(' ').str.len() - items.misspelling.str.split(' ').str.len()
     keep = extraWords <= 2 # rows whose first correction is more than two words longer than the misspelling are discarded
     return concat([items.misspelling[keep], correctSpellings[keep]], axis=1).apply(lambda x: x.str.strip()) # strip all leading/trailing whitespace

 def main() -> DataFrame:
     '''Scrape every sub-page ('0-9' and 'A' through 'Z') and return all entries as a single pandas.DataFrame

     The name of each sub-page is printed before it is fetched, as a progress indicator.
     Pages that produce no entries are skipped; if no page produces any, an empty DataFrame is returned.
     '''
     frames = []
     for char in ['0-9', *ascii_uppercase]:
         print(char)
         frame = parseListOfCommonMisspellings(char)
         if frame is not None and not frame.empty: # the parser may hand back nothing for a page without entries
             frames.append(frame)
     # concatenate once at the end instead of re-copying the accumulated frame on every iteration
     return concat(frames) if frames else DataFrame()

 if __name__ == '__main__':
     # executed as a script: build the complete table and print it in full
     misspellings = main()
     print(misspellings.to_string())
	#!/usr/bin/env python3

	from lxml.html import fromstring
	from pandas import concat, DataFrame, Series
	from requests import get
	from string import ascii_uppercase

	def parseListOfCommonMisspellings(char:str, timeout:float=30) -> DataFrame:
	    '''Scrape and parse one page of [Wikipedia's list of common misspellings](https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings) and return it as a pandas.DataFrame

	    Args:
	        char: suffix of the sub-page to fetch; it is interpolated verbatim into the page URL.
	        timeout: seconds to wait for the HTTP request before giving up.

	    Returns:
	        A DataFrame with a `misspelling` column and one `correctSpellings_<n>` column per
	        alternative correction. An empty DataFrame is returned when the page yields no usable entries.

	    Raises:
	        requests.RequestException: if the request times out or the server answers with an HTTP error status.
	    '''
	    url = f'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/{char}'
	    response = get(url, timeout=timeout) # without a timeout a stalled connection would block forever
	    response.raise_for_status() # fail loudly instead of parsing an HTTP error page as if it were the list
	    eTree = fromstring(response.content)
	    # only list items whose text ends with ')' are candidates of the form 'misspelling (correction, ...)'
	    items = Series([item for li in eTree.xpath('//li') if (item := li.text_content().strip()).endswith(')')])
	    if items.empty:
	        return DataFrame()
	    # non-greedy '.*?' groups: the single-character '.?' form could only ever match one-letter entries
	    items = items.str.extract(pat=r'(?P<misspelling>.*?) \((?P<correctSpellings>.*?)\)', expand=True)
	    items = items.dropna() # entries that do not match the pattern become NaN and would break the boolean filter below
	    unwanted = 'variant of|acceptable synonym|alternative spelling|acceptable spelling'
	    items = items[~items.correctSpellings.str.contains(unwanted)].reset_index(drop=True) # drop corrections which are variants/synonyms or alternative/acceptable spellings
	    if items.empty:
	        return DataFrame()
	    items['correctSpellings'] = items.correctSpellings.str.replace(pat=r'\[.*?\]', repl='', regex=True) # drop strings enclosed in square brackets, e.g. '[plural]'
	    # the '|' must be unescaped so that it acts as regex alternation rather than a literal pipe character
	    correctSpellings = items.correctSpellings.str.split(r'[,;]| or ', expand=True).apply(lambda x: x.str.strip()) # split multiple correct spellings (separated by commas, semicolons, or literal ' or ') into separate columns
	    correctSpellings = correctSpellings.add_prefix('correctSpellings_')
	    # number of space-separated words by which the first correction exceeds the misspelling
	    extraWords = correctSpellings['correctSpellings_0'].str.split(' ').str.len() - items.misspelling.str.split(' ').str.len()
	    keep = extraWords <= 2 # rows whose first correction is more than two words longer than the misspelling are discarded
	    return concat([items.misspelling[keep], correctSpellings[keep]], axis=1).apply(lambda x: x.str.strip()) # strip all leading/trailing whitespace

	def main() -> DataFrame:
	    '''Scrape every sub-page ('0-9' and 'A' through 'Z') and return all entries as a single pandas.DataFrame

	    The name of each sub-page is printed before it is fetched, as a progress indicator.
	    Pages that produce no entries are skipped; if no page produces any, an empty DataFrame is returned.
	    '''
	    frames = []
	    for char in ['0-9', *ascii_uppercase]:
	        print(char)
	        frame = parseListOfCommonMisspellings(char)
	        if frame is not None and not frame.empty: # the parser may hand back nothing for a page without entries
	            frames.append(frame)
	    # concatenate once at the end instead of re-copying the accumulated frame on every iteration
	    return concat(frames) if frames else DataFrame()

	if __name__ == '__main__':
	    # executed as a script: build the complete table and print it in full
	    print(main().to_string())
No results found