Skip to content

Instantly share code, notes, and snippets.

@Rishabh-creator601
Created November 7, 2025 02:10
Show Gist options
  • Select an option

  • Save Rishabh-creator601/8d449e3e244fcad4088ac57a502e5338 to your computer and use it in GitHub Desktop.

Select an option

Save Rishabh-creator601/8d449e3e244fcad4088ac57a502e5338 to your computer and use it in GitHub Desktop.
Regexp text cleaning in NLP (Natural language processing)
from bs4 import BeautifulSoup
import string, unicodedata , contractions , re
from nltk.corpus import stopwords
def remove_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup using the ``html.parser``
    backend, and the result of ``get_text()`` on the parsed document is
    returned.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
def convert_unicode_data(text):
    """Reduce ``text`` to plain ASCII.

    The string is NFKD-normalized so accented letters split into a base
    letter plus combining marks, then encoded to ASCII with unencodable
    code points dropped, and finally decoded back into a ``str``.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8", "ignore")
def remove_urls(text):
    """Replace URLs in ``text`` with a single space each.

    Two kinds of token are removed:

    * anything starting with ``http:`` or ``https:`` up to the next
      whitespace character;
    * anything starting with ``www.`` (at a word boundary) up to the next
      whitespace character.

    Args:
        text: The string to scrub.

    Returns:
        ``text`` with every matched URL replaced by ``' '``.
    """
    # Raw strings keep ``\S`` from being treated as a (deprecated) invalid
    # string escape.
    text = re.sub(r'https?:\S*', ' ', text)
    # The dot is escaped and all three w's are required, anchored at a word
    # boundary, so ordinary words such as "wwe" or "awww" are left alone.
    text = re.sub(r'\bwww\.\S*', ' ', text)
    return text
def expand_contractions(text):
    """Expand English contractions in ``text`` (e.g. "don't" -> "do not").

    The whole string is handed to ``contractions.fix`` in one call.
    Iterating over ``text`` directly would yield single characters, which
    can never match a contraction, so nothing would be expanded.

    Args:
        text: The string to process.

    Returns:
        The value returned by ``contractions.fix(text)``.
    """
    return contractions.fix(text)
def remove_mentions(text):
    """Replace @-mentions and #-hashtags in ``text`` with a single space.

    A token starts at ``@`` (first pass) or ``#`` (second pass) and runs up
    to the next whitespace character. The passes are applied in that order,
    one after the other.
    """
    for marker_pattern in (r'@\S*', r'#\S*'):
        text = re.sub(marker_pattern, ' ', text)
    return text
def removing_special_chars(text):
    """Replace special characters in ``text`` with spaces.

    Characters that are kept: ASCII letters, digits, whitespace and the
    punctuation marks ``. , ! ? / : ; " '``. Every other character is
    replaced by a single space.

    Args:
        text: The string to scrub.

    Returns:
        The scrubbed string (same length as the input).
    """
    # The upper-case range must be ``A-Z``; ``A-z`` would also span the
    # ASCII symbols between 'Z' and 'a' ([ \ ] ^ _ `) and let them through.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, ' ', text)
def removing_numbers(text):
    """Replace digits (and other non-letter symbols) in ``text`` with spaces.

    Characters that are kept: ASCII letters, whitespace and the punctuation
    marks ``. , ! ? / : ; " '``. Every other character -- digits included --
    is replaced by a single space.

    Args:
        text: The string to scrub.

    Returns:
        The scrubbed string (same length as the input).
    """
    # The upper-case range must be ``A-Z``; ``A-z`` would also span the
    # ASCII symbols between 'Z' and 'a' ([ \ ] ^ _ `) and let them through.
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, ' ', text)
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # A translation table with only a "delete" set drops those characters.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)
## final function
# English stop words dropped in the last step of clean_text(). The name was
# referenced but never defined, so every call raised NameError. Built once at
# import time as a frozenset for O(1) membership tests.
# NOTE: requires the NLTK "stopwords" corpus to be installed
# (e.g. via nltk.download("stopwords")).
STOPWORDS = frozenset(stopwords.words("english"))


def clean_text(text):
    """Run the full cleaning pipeline on ``text`` and return the result.

    Steps, in order:

    1. lower-case the input;
    2. ``remove_html_tags``;
    3. ``convert_unicode_data``;
    4. ``remove_urls``;
    5. ``expand_contractions``;
    6. ``remove_mentions``;
    7. ``removing_numbers``;
    8. ``remove_punctuation``;
    9. split on whitespace, drop every token found in ``STOPWORDS`` and
       re-join the remaining tokens with single spaces.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string, with tokens separated by single spaces.
    """
    text = text.lower()
    text = remove_html_tags(text)
    text = convert_unicode_data(text)
    text = remove_urls(text)
    text = expand_contractions(text)
    text = remove_mentions(text)
    text = removing_numbers(text)
    text = remove_punctuation(text)
    # split()/join also collapses the runs of spaces left by the earlier
    # substitution steps.
    text = " ".join(w for w in text.split() if w not in STOPWORDS)
    return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment