This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from bs4 import BeautifulSoup | |
| import string, unicodedata , contractions , re | |
| from nltk.corpus import stopwords | |
def remove_html_tags(text):
    """Return the text content of *text* with HTML markup stripped.

    The input is parsed with BeautifulSoup using Python's built-in
    ``"html.parser"`` backend, and the text of the parsed document is
    returned via ``get_text()``.

    Args:
        text: Markup to parse (presumably an HTML string -- confirm
            against callers).

    Returns:
        The text extracted from the parsed document.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
def convert_unicode_data(text):
    """Return an ASCII-only approximation of *text*.

    The string is NFKD-normalized, which splits composed characters into
    a base character plus combining marks. It is then encoded to ASCII
    with errors ignored, so every character that has no ASCII
    representation (including the split-off combining marks) is dropped.

    Args:
        text: The Unicode string to convert.

    Returns:
        A ``str`` containing only ASCII characters.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # "ignore" silently discards anything that cannot be encoded as ASCII.
    ascii_bytes = decomposed.encode("ascii", "ignore")
    # The bytes are pure ASCII at this point, so decoding cannot fail.
    return ascii_bytes.decode("ascii")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sklearn.datasets import fetch_20newsgroups | |
| from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer | |
| from sklearn.naive_bayes import MultinomialNB | |
# Maps newsgroup category identifiers to short human-readable labels.
# NOTE(review): the keys look like 20 Newsgroups category names, presumably
# passed to fetch_20newsgroups -- confirm against the code that uses this.
categories = {
    'comp.graphics': 'Graphics',
    'rec.autos': 'Auto',
    'rec.motorcycles': 'MotorCycle',
    'rec.sport.baseball': 'Baseball',
    'rec.sport.hockey': 'Hockey',
    'sci.space': 'Space',
    'talk.religion.misc': 'Religion',
}