Created
November 7, 2025 02:10
-
-
Save Rishabh-creator601/8d449e3e244fcad4088ac57a502e5338 to your computer and use it in GitHub Desktop.
Regex-based text cleaning for NLP (natural language processing)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from bs4 import BeautifulSoup | |
| import string, unicodedata , contractions , re | |
| from nltk.corpus import stopwords | |
def remove_html_tags(text):
    """Return the text content of ``text`` with HTML markup stripped.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and
    only the text it extracts via ``get_text()`` is returned.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
def convert_unicode_data(text):
    """Reduce ``text`` to plain ASCII.

    The string is NFKD-normalized, which splits accented letters into a
    base letter plus combining marks. Encoding to ASCII with ``"ignore"``
    then drops every code point that has no ASCII representation.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8", "ignore")
def remove_urls(text):
    """Replace URLs in ``text`` with a single space each.

    Two URL shapes are recognized:

    * anything starting with ``http:`` or ``https:``, up to the next
      whitespace character;
    * anything starting with a literal ``www.`` at a word boundary, up to
      the next whitespace character.

    Args:
        text: The string to scan.

    Returns:
        ``text`` with every recognized URL replaced by ``' '``.
    """
    # The dot is escaped and all three "w"s are required so that ordinary
    # words containing "ww" (e.g. "powwow") are not mistaken for URLs. The
    # leading \b keeps the match from starting in the middle of a word.
    url_pattern = r'https?:\S*|\bwww\.\S*'
    return re.sub(url_pattern, ' ', text)
def expand_contractions(text):
    """Expand English contractions in ``text`` (e.g. "don't" -> "do not").

    Args:
        text: The string to process.

    Returns:
        The string returned by ``contractions.fix`` for ``text``.
    """
    # The whole string is handed to contractions.fix in one call. Iterating
    # over the string would feed it one character at a time, so no
    # contraction would ever be seen in full.
    return contractions.fix(text)
def remove_mentions(text):
    """Blank out @mentions and #hashtags in ``text``.

    Every run that starts with ``@`` and extends to the next whitespace is
    replaced by a single space; the same is then done for runs starting
    with ``#``.
    """
    # Applied in this order on purpose: the "@" pass runs over the original
    # text, the "#" pass runs over its result.
    for marker_pattern in (r'@\S*', r'#\S*'):
        text = re.sub(marker_pattern, ' ', text)
    return text
def removing_special_chars(text):
    """Replace special characters in ``text`` with spaces.

    ASCII letters, digits, whitespace and the punctuation marks
    ``. , ! ? / : ; " '`` are kept; every other character is replaced by a
    single space.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each disallowed character replaced by ``' '``.
    """
    # The upper-case range must be A-Z. Writing A-z would also whitelist the
    # ASCII characters that sit between "Z" and "a":  [ \ ] ^ _ `
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, ' ', text)
def removing_numbers(text):
    """Replace digits and special characters in ``text`` with spaces.

    Only ASCII letters, whitespace and the punctuation marks
    ``. , ! ? / : ; " '`` are kept. Every other character, digits included,
    is replaced by a single space.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each disallowed character replaced by ``' '``.
    """
    # The upper-case range must be A-Z. Writing A-z would also whitelist the
    # ASCII characters that sit between "Z" and "a":  [ \ ] ^ _ `
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, ' ', text)
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are removed outright rather than replaced, so the words on
    either side of a punctuation mark end up adjacent.
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)
# Stop-word lookup used by clean_text. Built once at import time as a
# frozenset so each membership test is O(1).
# NOTE(review): assumes English input and that the NLTK "stopwords" corpus
# has already been downloaded (nltk.download("stopwords")) -- confirm.
STOPWORDS = frozenset(stopwords.words("english"))


## final function
def clean_text(text):
    """Run the full cleaning pipeline over ``text``.

    The text is lower-cased and then passed, in order, through:
    ``remove_html_tags``, ``convert_unicode_data``, ``remove_urls``,
    ``expand_contractions``, ``remove_mentions``, ``removing_numbers`` and
    ``remove_punctuation``. Finally the result is split on whitespace,
    tokens found in ``STOPWORDS`` are dropped, and the remaining tokens are
    joined with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-case, space-separated string.
    """
    text = text.lower()

    # Each step consumes the previous step's output, so the order is fixed:
    # markup and URLs must go before punctuation is stripped, and
    # contractions must be expanded while their apostrophes still exist.
    pipeline = (
        remove_html_tags,
        convert_unicode_data,
        remove_urls,
        expand_contractions,
        remove_mentions,
        removing_numbers,
        remove_punctuation,
    )
    for step in pipeline:
        text = step(text)

    return " ".join(w for w in text.split() if w not in STOPWORDS)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment