This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| train, test = train_test_split(data, test_size = 0.3) | |
| cols = train.columns[:-1] | |
| gnb = MultinomialNB() | |
| gnb.fit(train[cols], train['sentiment']) | |
| y_pred = gnb.predict(test[cols]) | |
| print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%" | |
| .format( |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| pos_reviews = data[data['sentiment'] == 1] | |
| neg_reviews = data[data['sentiment'] == 0] | |
| pnum = np.array(pos_reviews[pos_reviews.columns].sum()) | |
| nnum = np.array(neg_reviews[ntg_reviews.columns].sum()) | |
| dif = pnum > nnum |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| word_matrix = [] | |
| for i in lemmatized: word_matrix.append([1 if j in i else 0 for j in top5000]) | |
| features = pd.DataFrame(word_matrix, columns = top5000, index = pd.DataFrame(filtered_tokens)) | |
| features['sentiment'] = data['sentiment'].values |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from operator import itemgetter | |
| from collections import Counter | |
| flat_list = [i for sublist in filtered_tokens for i in sublist] | |
| # Count how many times each word appears | |
| count = Counter(flat_list).items() | |
| sorted_count = sorted(count, key = itemgetter(1)) | |
| sorted_count.reverse() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| en_stopwords = list(set(nltk.corpus.stopwords.words('english'))) | |
| # remove punctuation from data | |
| clean = [re.sub(r'[^\w\s]','',i).lower() for i in data] | |
| tokens = [word_tokenize(x) for x in data['text']] | |
| filtered_tokens = [] | |
| # tokens that are not stopwords collected here | |
| for i in tokens: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| data = pd.DataFrame(columns=['text', 'sentiment']) | |
| for id in movie_reviews.fileids(): | |
| text = ' '.join(movie_reviews.words(id)) | |
| sentiment = 1 if movie_reviews.categories(id) == 'pos' else 0 | |
| data = data.append(pd.DataFrame({'text': text,'sentiment': sentiment}, index=[0])) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import nltk | |
| nltk.download('all') | |
| import regex as re | |
| import pandas as pd | |
| from sklearn.utils import shuffle | |
| from nltk import LancasterStemmer | |
| from nltk.tokenize import word_tokenize | |
| from nltk.corpus import movie_reviews, stopwords | |
| from sklearn.naive_bayes import MultinomialNB |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| from matplotlib.animation import FuncAnimation | |
| p = np.linspace(-np.pi/2,np.pi/2,10) | |
| x = np.sin(p) | |
| v = np.column_stack((np.concatenate((x,x)),np.concatenate((np.cos(p),-np.cos(p))),[1]*len(p)*2)) |