Rishabh-creator601 · November 7, 2025 02:10
diff --git a/clean_nlp.py b/clean_nlp.py
 from bs4 import BeautifulSoup 
 import string, unicodedata , contractions , re
 from nltk.corpus import stopwords 

 def remove_html_tags(text):
    """Strip HTML markup from ``text`` and return only its text content.

    The input is parsed with BeautifulSoup using the ``html.parser`` backend.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

 def convert_unicode_data(text):
    """Fold ``text`` down to plain ASCII.

    The string is NFKD-normalised first, then every code point that cannot
    be encoded as ASCII is silently dropped.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8", "ignore")

 def remove_urls(text):
    """Replace URLs in ``text`` with a single space each.

    Two forms are removed: anything starting with ``http:`` or ``https:``,
    and anything starting with ``www.``; each match runs up to the next
    whitespace character.
    """
    # Raw strings keep \S a regex escape instead of an invalid string escape.
    text = re.sub(r'https?:\S*', ' ', text)
    # The dot is escaped and the stray "?" dropped: the old "www?." pattern
    # matched any "ww" followed by one arbitrary character, so it also ate
    # ordinary words such as "powwow".
    text = re.sub(r'www\.\S*', ' ', text)
    return text

 def expand_contractions(text):
    """Expand contractions in ``text`` using the ``contractions`` package.

    Returns the string produced by ``contractions.fix`` for the whole input.
    """
    # The previous implementation looped with ``for word in text``, which
    # iterates over single characters, so ``contractions.fix`` never saw a
    # complete contraction. Passing the full string lets it match real words.
    return contractions.fix(text)

 def remove_mentions(text):
    """Blank out @-mentions and #-hashtags in ``text``.

    Runs two passes: first every token starting at ``@``, then every token
    starting at ``#``. Each match extends to the next whitespace and is
    replaced by one space.
    """
    # Keep the passes separate and in this order; a single merged pattern
    # gives different output when "#" and "@" appear in the same token.
    for token_pattern in (r'@\S*', r'#\S*'):
        text = re.sub(token_pattern, ' ', text)
    return text

 def removing_special_chars(text):
    """Replace special characters in ``text`` with spaces.

    ASCII letters, digits, whitespace and basic punctuation
    (. , ! ? / : ; and both quote characters) are kept; every other
    character becomes a single space.
    """
    # A-Z, not A-z: the old range also covered [ \ ] ^ _ and the backtick,
    # which sit between "Z" and "a" in ASCII, so they were wrongly kept.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, ' ', text)

 def removing_numbers(text):
    """Replace digits (and other non-letter symbols) in ``text`` with spaces.

    Only ASCII letters, whitespace and basic punctuation
    (. , ! ? / : ; and both quote characters) are kept; every other
    character, digits included, becomes a single space.
    """
    # A-Z, not A-z: the old range also covered [ \ ] ^ _ and the backtick,
    # which sit between "Z" and "a" in ASCII, so they were wrongly kept.
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, ' ', text)

 def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted."""
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)


 ## final function 
 def clean_text(text):
    """Run the full cleaning pipeline on ``text`` and return the result.

    Steps, in order: lowercase, strip HTML, fold to ASCII, drop URLs, expand
    contractions, drop @-mentions and #-hashtags, blank out digits and other
    non-letter symbols, strip punctuation, then remove stop words. The last
    step splits on whitespace and re-joins with single spaces.
    """
    # NOTE(review): STOPWORDS is not defined in the visible part of this
    # file, while nltk's ``stopwords`` corpus is imported but otherwise
    # unused. Reuse STOPWORDS when it exists; otherwise build it once from
    # the English corpus (needs the nltk "stopwords" data to be installed)
    # instead of failing with NameError on every call.
    global STOPWORDS
    try:
        stop_words = STOPWORDS
    except NameError:
        stop_words = STOPWORDS = frozenset(stopwords.words("english"))

    text = text.lower()
    # Order matters: URLs and mentions must go before punctuation is stripped.
    pipeline = (
        remove_html_tags,
        convert_unicode_data,
        remove_urls,
        expand_contractions,
        remove_mentions,
        removing_numbers,
        remove_punctuation,
    )
    for step in pipeline:
        text = step(text)
    return " ".join(w for w in text.split() if w not in stop_words)
	from bs4 import BeautifulSoup
	import string, unicodedata , contractions , re
	from nltk.corpus import stopwords

	def remove_html_tags(text):
		"""Strip HTML markup from ``text`` and return only its text content.

		The input is parsed with BeautifulSoup using the ``html.parser`` backend.
		"""
		soup = BeautifulSoup(text, "html.parser")
		return soup.get_text()

	def convert_unicode_data(text):
		"""Fold ``text`` down to plain ASCII.

		The string is NFKD-normalised first, then every code point that cannot
		be encoded as ASCII is silently dropped.
		"""
		decomposed = unicodedata.normalize("NFKD", text)
		ascii_bytes = decomposed.encode("ascii", "ignore")
		return ascii_bytes.decode("utf-8", "ignore")

	def remove_urls(text):
		"""Replace URLs in ``text`` with a single space each.

		Two forms are removed: anything starting with ``http:`` or ``https:``,
		and anything starting with ``www.``; each match runs up to the next
		whitespace character.
		"""
		# Raw strings keep \S a regex escape instead of an invalid string escape.
		text = re.sub(r'https?:\S*', ' ', text)
		# The dot is escaped and the stray "?" dropped: the old "www?." pattern
		# matched any "ww" followed by one arbitrary character, so it also ate
		# ordinary words such as "powwow".
		text = re.sub(r'www\.\S*', ' ', text)
		return text

	def expand_contractions(text):
		"""Expand contractions in ``text`` using the ``contractions`` package.

		Returns the string produced by ``contractions.fix`` for the whole input.
		"""
		# The previous implementation looped with ``for word in text``, which
		# iterates over single characters, so ``contractions.fix`` never saw a
		# complete contraction. Passing the full string lets it match real words.
		return contractions.fix(text)

	def remove_mentions(text):
		"""Blank out @-mentions and #-hashtags in ``text``.

		Runs two passes: first every token starting at ``@``, then every token
		starting at ``#``. Each match extends to the next whitespace and is
		replaced by one space.
		"""
		# Keep the passes separate and in this order; a single merged pattern
		# gives different output when "#" and "@" appear in the same token.
		for token_pattern in (r'@\S*', r'#\S*'):
			text = re.sub(token_pattern, ' ', text)
		return text

	def removing_special_chars(text):
		"""Replace special characters in ``text`` with spaces.

		ASCII letters, digits, whitespace and basic punctuation
		(. , ! ? / : ; and both quote characters) are kept; every other
		character becomes a single space.
		"""
		# A-Z, not A-z: the old range also covered [ \ ] ^ _ and the backtick,
		# which sit between "Z" and "a" in ASCII, so they were wrongly kept.
		pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
		return re.sub(pat, ' ', text)

	def removing_numbers(text):
		"""Replace digits (and other non-letter symbols) in ``text`` with spaces.

		Only ASCII letters, whitespace and basic punctuation
		(. , ! ? / : ; and both quote characters) are kept; every other
		character, digits included, becomes a single space.
		"""
		# A-Z, not A-z: the old range also covered [ \ ] ^ _ and the backtick,
		# which sit between "Z" and "a" in ASCII, so they were wrongly kept.
		pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
		return re.sub(pattern, ' ', text)

	def remove_punctuation(text):
		"""Return ``text`` with every ``string.punctuation`` character deleted."""
		drop_table = str.maketrans('', '', string.punctuation)
		return text.translate(drop_table)


	## final function
	def clean_text(text):
		"""Run the full cleaning pipeline on ``text`` and return the result.

		Steps, in order: lowercase, strip HTML, fold to ASCII, drop URLs, expand
		contractions, drop @-mentions and #-hashtags, blank out digits and other
		non-letter symbols, strip punctuation, then remove stop words. The last
		step splits on whitespace and re-joins with single spaces.
		"""
		# NOTE(review): STOPWORDS is not defined in the visible part of this
		# file, while nltk's ``stopwords`` corpus is imported but otherwise
		# unused. Reuse STOPWORDS when it exists; otherwise build it once from
		# the English corpus (needs the nltk "stopwords" data to be installed)
		# instead of failing with NameError on every call.
		global STOPWORDS
		try:
			stop_words = STOPWORDS
		except NameError:
			stop_words = STOPWORDS = frozenset(stopwords.words("english"))

		text = text.lower()
		# Order matters: URLs and mentions must go before punctuation is stripped.
		pipeline = (
			remove_html_tags,
			convert_unicode_data,
			remove_urls,
			expand_contractions,
			remove_mentions,
			removing_numbers,
			remove_punctuation,
		)
		for step in pipeline:
			text = step(text)
		return " ".join(w for w in text.split() if w not in stop_words)
No results found