Skip to content

Instantly share code, notes, and snippets.

View Rishabh-creator601's full-sized avatar
🎯
Focusing

Rishabh Rishabh-creator601

🎯
Focusing
View GitHub Profile
@Rishabh-creator601
Rishabh-creator601 / clean_nlp.py
Created November 7, 2025 02:10
Regular-expression text cleaning for NLP (natural language processing)
from bs4 import BeautifulSoup
import string, unicodedata , contractions , re
from nltk.corpus import stopwords
def remove_html_tags(text):
    """Return the text content of *text* with HTML markup removed.

    Args:
        text: A string that may contain HTML markup.

    Returns:
        Whatever BeautifulSoup's ``get_text()`` yields for the parsed input.
    """
    # "html.parser" is passed as the parser name for BeautifulSoup to use.
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
def convert_unicode_data(text):
    """Return *text* reduced to plain ASCII characters.

    The input is NFKD-normalized first, which decomposes accented and
    compatibility characters into a base character followed by combining
    marks. Encoding to ASCII with ``errors="ignore"`` then drops every
    code point that has no ASCII form, including those combining marks.

    Args:
        text: The string to convert.

    Returns:
        A string containing only ASCII characters; characters without an
        ASCII decomposition are removed rather than replaced.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    # The bytes are ASCII-only at this point, so this decode cannot fail.
    return ascii_bytes.decode("ascii")
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Newsgroup name -> short display label.
# NOTE(review): the keys look like 20 Newsgroups category names
# (fetch_20newsgroups is imported in this file) — confirm against the caller.
categories = {
    "comp.graphics": "Graphics",
    "rec.autos": "Auto",
    "rec.motorcycles": "MotorCycle",
    "rec.sport.baseball": "Baseball",
    "rec.sport.hockey": "Hockey",
    "sci.space": "Space",
    "talk.religion.misc": "Religion",
}