Skip to content

Instantly share code, notes, and snippets.

@maxkleiner
Created April 28, 2020 17:57
Show Gist options
  • Select an option

  • Save maxkleiner/81077421e1e26e7036034dbfc5d8abef to your computer and use it in GitHub Desktop.

Select an option

Save maxkleiner/81077421e1e26e7036034dbfc5d8abef to your computer and use it in GitHub Desktop.
nltk4.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "nltk4.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNrx2irjCKrYGd1b4tiEqaj",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/maxkleiner/81077421e1e26e7036034dbfc5d8abef/nltk4.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "NE9-2uRfoemj",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 110
},
"outputId": "4e5dad3f-145f-4b71-e492-9e9debe5cb03"
},
"source": [
"import string\n",
"\n",
"from nltk.corpus import movie_reviews as mr\n",
"from nltk.probability import FreqDist\n",
"from nltk.corpus import stopwords\n",
"import nltk\n",
"nltk.download('stopwords')\n",
"nltk.download('movie_reviews')\n"
],
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package movie_reviews to /root/nltk_data...\n",
"[nltk_data] Package movie_reviews is already up-to-date!\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "t5083YQoort_",
"colab_type": "text"
},
"source": [
"The number of unique words in a corpus can be huge, so we could restrict our model to the most salient features. To do so, we first need to know which are the top N most frequent words in the corpus.\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "FmHyiSbioxl5",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 111
},
"outputId": "9dacad1e-fc15-43de-92a7-f07a29f26a52"
},
"source": [
"# nltk.download('stopwords')\n",
"# nltk.download('movie_reviews')\n",
"stop = stopwords.words('english') + list(string.punctuation)\n",
"# This will extract key-value pairs, \n",
"# where the keys are words and values are its count in the corpus.\n",
"vocabulary = FreqDist(w.lower() for w in mr.words() if w.lower() not in stop)\n",
"print(vocabulary.most_common(10))\n",
"#print(list(vocabulary.keys())[:10])\n",
"# the least uses words\n",
"print(vocabulary.most_common()[-10:])\n",
"\n",
"#word_features = FreqDist(chain(*[i for i,j in documents]))\n",
"word_features = list(vocabulary.keys())[:10]\n",
"print(word_features)\n",
"\n",
"# vocabulary[1:50]\n",
"#vocabulary\n",
"\n",
"documents = [([w for w in mr.words(i) if w.lower() not in stop], # Words in document.\n",
" i.split('/')[0]) # Tag.\n",
" for i in mr.fileids()]\n",
"print(documents[0][:10]) # First document"
],
"execution_count": 38,
"outputs": [
{
"output_type": "stream",
"text": [
"[('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049)]\n",
"[('paneled', 1), ('vainly', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1)]\n",
"['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get']\n",
"(['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend', 'continues', 'see', 'life', 'nightmares', 'deal', 'watch', 'movie', 'sorta', 'find', 'critique', 'mind', 'fuck', 'movie', 'teen', 'generation', 'touches', 'cool', 'idea', 'presents', 'bad', 'package', 'makes', 'review', 'even', 'harder', 'one', 'write', 'since', 'generally', 'applaud', 'films', 'attempt', 'break', 'mold', 'mess', 'head', 'lost', 'highway', 'memento', 'good', 'bad', 'ways', 'making', 'types', 'films', 'folks', 'snag', 'one', 'correctly', 'seem', 'taken', 'pretty', 'neat', 'concept', 'executed', 'terribly', 'problems', 'movie', 'well', 'main', 'problem', 'simply', 'jumbled', 'starts', 'normal', 'downshifts', 'fantasy', 'world', 'audience', 'member', 'idea', 'going', 'dreams', 'characters', 'coming', 'back', 'dead', 'others', 'look', 'like', 'dead', 'strange', 'apparitions', 'disappearances', 'looooot', 'chase', 'scenes', 'tons', 'weird', 'things', 'happen', 'simply', 'explained', 'personally', 'mind', 'trying', 'unravel', 'film', 'every', 'give', 'clue', 'get', 'kind', 'fed', 'film', 'biggest', 'problem', 'obviously', 'got', 'big', 'secret', 'hide', 'seems', 'want', 'hide', 'completely', 'final', 'five', 'minutes', 'make', 'things', 'entertaining', 'thrilling', 'even', 'engaging', 'meantime', 'really', 'sad', 'part', 'arrow', 'dig', 'flicks', 'like', 'actually', 'figured', 'half', 'way', 'point', 'strangeness', 'start', 'make', 'little', 'bit', 'sense', 'still', 'make', 'film', 'entertaining', 'guess', 'bottom', 'line', 'movies', 'like', 'always', 'make', 'sure', 'audience', 'even', 'given', 'secret', 'password', 'enter', 'world', 'understanding', 'mean', 'showing', 'melissa', 'sagemiller', 'running', 'away', 'visions', '20', 'minutes', 'throughout', 'movie', 'plain', 'lazy', 'okay', 'get', 'people', 'chasing', 'know', 'really', 'need', 'see', 'giving', 'us', 'different', 'scenes', 'offering', 'insight', 
'strangeness', 'going', 'movie', 'apparently', 'studio', 'took', 'film', 'away', 'director', 'chopped', 'shows', 'might', 'pretty', 'decent', 'teen', 'mind', 'fuck', 'movie', 'somewhere', 'guess', 'suits', 'decided', 'turning', 'music', 'video', 'little', 'edge', 'would', 'make', 'sense', 'actors', 'pretty', 'good', 'part', 'although', 'wes', 'bentley', 'seemed', 'playing', 'exact', 'character', 'american', 'beauty', 'new', 'neighborhood', 'biggest', 'kudos', 'go', 'sagemiller', 'holds', 'throughout', 'entire', 'film', 'actually', 'feeling', 'character', 'unraveling', 'overall', 'film', 'stick', 'entertain', 'confusing', 'rarely', 'excites', 'feels', 'pretty', 'redundant', 'runtime', 'despite', 'pretty', 'cool', 'ending', 'explanation', 'craziness', 'came', 'oh', 'way', 'horror', 'teen', 'slasher', 'flick', 'packaged', 'look', 'way', 'someone', 'apparently', 'assuming', 'genre', 'still', 'hot', 'kids', 'also', 'wrapped', 'production', 'two', 'years', 'ago', 'sitting', 'shelves', 'ever', 'since', 'whatever', 'skip', 'joblo', 'coming', 'nightmare', 'elm', 'street', '3', '7', '10', 'blair', 'witch', '2', '7', '10', 'crow', '9', '10', 'crow', 'salvation', '4', '10', 'lost', 'highway', '10', '10', 'memento', '10', '10', 'others', '9', '10', 'stir', 'echoes', '8', '10'], 'neg')\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2EVJCUKV1WTT",
"colab_type": "text"
},
"source": [
"Classify the sentiment: from NLTK to scikit-learn\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "PXWtb4WP1lGJ",
"colab_type": "code",
"colab": {}
},
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import BernoulliNB\n",
"from sklearn import svm, datasets\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"#X = mr.words\n",
"#y = mr.tag"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "OeIdxog_6u6F",
"colab_type": "text"
},
"source": [
"An NLTK `CategorizedPlaintextCorpusReader` object is not a data type that pandas understands.\n",
"\n",
"That being said, you can convert the movie reviews into a list of tuples and then populate a DataFrame like this:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "FvSNA4WP6yk3",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 269
},
"outputId": "f64db87e-d8ec-420b-86fc-2d2bbd0ffa0e"
},
"source": [
"import pandas as pd\n",
"from nltk.corpus import movie_reviews as mr\n",
"\n",
"reviews = []\n",
"for fileid in mr.fileids():\n",
" tag, filename = fileid.split('/')\n",
" reviews.append((filename, tag, mr.raw(fileid)))\n",
"\n",
"df = pd.DataFrame(reviews, columns=['filename', 'tag', 'text'])\n",
"\n",
"df.head(7)"
],
"execution_count": 45,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>tag</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>cv000_29416.txt</td>\n",
" <td>neg</td>\n",
" <td>plot : two teen couples go to a church party ,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>cv001_19502.txt</td>\n",
" <td>neg</td>\n",
" <td>the happy bastard's quick movie review \\ndamn ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>cv002_17424.txt</td>\n",
" <td>neg</td>\n",
" <td>it is movies like these that make a jaded movi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>cv003_12683.txt</td>\n",
" <td>neg</td>\n",
" <td>\" quest for camelot \" is warner bros . ' firs...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>cv004_12641.txt</td>\n",
" <td>neg</td>\n",
" <td>synopsis : a mentally unstable man undergoing ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>cv005_29357.txt</td>\n",
" <td>neg</td>\n",
" <td>capsule : in 2176 on the planet mars police ta...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>cv006_17022.txt</td>\n",
" <td>neg</td>\n",
" <td>so ask yourself what \" 8mm \" ( \" eight millime...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filename tag text\n",
"0 cv000_29416.txt neg plot : two teen couples go to a church party ,...\n",
"1 cv001_19502.txt neg the happy bastard's quick movie review \\ndamn ...\n",
"2 cv002_17424.txt neg it is movies like these that make a jaded movi...\n",
"3 cv003_12683.txt neg \" quest for camelot \" is warner bros . ' firs...\n",
"4 cv004_12641.txt neg synopsis : a mentally unstable man undergoing ...\n",
"5 cv005_29357.txt neg capsule : in 2176 on the planet mars police ta...\n",
"6 cv006_17022.txt neg so ask yourself what \" 8mm \" ( \" eight millime..."
]
},
"metadata": {
"tags": []
},
"execution_count": 45
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZVs4hKKy7tH_",
"colab_type": "text"
},
"source": [
"We are now ready to classify.\n",
"The purpose is to use this data for sentiment analysis, now that it has been converted into a glorious pandas DataFrame.\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4bVJAex68By_",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 296
},
"outputId": "2b4ed716-57dc-4607-d288-2b7dc16be6bb"
},
"source": [
"X = df['text']\n",
"y = df['tag']\n",
"\n",
"# df= pd.read_csv('../input/movie-review/movie_review.csv') no need cause of df\n",
"\n",
"vect = CountVectorizer(ngram_range=(1, 2))\n",
"\n",
"X = vect.fit_transform(X)\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
"\n",
"model = BernoulliNB(alpha=1.0)\n",
"# model = svm.SVC(random_state=0)\n",
"\n",
"model.fit(X_train, y_train)\n",
"\n",
"p_train = model.predict(X_train)\n",
"p_test = model.predict(X_test)\n",
"\n",
"acc_train = accuracy_score(y_train, p_train)\n",
"acc_test = accuracy_score(y_test, p_test)\n",
"\n",
"print(f'Train ACC: {acc_train}, Test ACC: {acc_test}')\n",
"\n",
"from sklearn.metrics import confusion_matrix\n",
"\n",
"print(confusion_matrix(y_test, p_test, labels=['neg', 'pos']),'\\n')\n",
"\n",
"print(df.info())\n",
"\n"
],
"execution_count": 137,
"outputs": [
{
"output_type": "stream",
"text": [
"Train ACC: 0.9986666666666667, Test ACC: 0.748\n",
"[[254 10]\n",
" [116 120]] \n",
"\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 2000 entries, 0 to 1999\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 filename 2000 non-null object\n",
" 1 tag 2000 non-null object\n",
" 2 text 2000 non-null object\n",
"dtypes: object(3)\n",
"memory usage: 47.0+ KB\n",
"None\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "JtHwyH4JPmMI",
"colab_type": "text"
},
"source": [
"Finally, we check the distribution of the words that contribute to the negative and to the positive reviews."
]
},
{
"cell_type": "code",
"metadata": {
"id": "DRcCpuiFP0v5",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 241
},
"outputId": "bb481593-8429-40f8-bba6-2d1206950ae4"
},
"source": [
"from nltk.probability import FreqDist, ConditionalFreqDist\n",
"word_fd = FreqDist()\n",
"label_word_fd = ConditionalFreqDist()\n",
"stops = stopwords.words('english') + list(string.punctuation)\n",
"\n",
"#polar = [w.lower() for w in mr.words() if w.lower() not in stops]\n",
"\n",
"def best_word_feats(words):\n",
" return dict([(word, True) for word in words if word not in stops])\n",
"\n",
"all_words = [word.lower() for word in mr.words()]\n",
"# print first 10 words\n",
"print (all_words[:10])\n",
"\n",
"all_words_clean = []\n",
"for word in all_words:\n",
" if word not in stops:\n",
" all_words_clean.append(word)\n",
" \n",
"print (all_words_clean[:10])\n",
"print ('distinctive words: ',len(set(all_words_clean)))\n",
"\n",
"all_words_freq = FreqDist(all_words_clean)\n",
"print (all_words_freq)\n",
"\n",
"# for word in best_word_feats(mr.words(categories=['neg'])):\n",
"for word in mr.words(categories=['neg']):\n",
" #word_fd.inc(word.lower())\n",
" all_words_freq[word.lower()] +=1 \n",
" label_word_fd['neg'][word.lower()] +=1\n",
"print(all_words_freq.most_common()[:20])\n",
"print(all_words_freq.most_common()[-20:])\n",
" \n",
"for word in mr.words(categories=['pos']):\n",
" all_words_freq[word.lower()] += 1\n",
" label_word_fd['pos'][word.lower()] += 1 \n",
"print(label_word_fd['pos'].most_common()[:20])\n",
"print(label_word_fd['pos'].most_common()[-20:]) \n",
"\n",
"neg_word_count = label_word_fd['neg'].N()\n",
"pos_word_count = label_word_fd['pos'].N()\n",
"\n",
"# print 10 most frequently occurring words\n",
"print (all_words_frequency.most_common(20))\n",
"\n",
"print(neg_word_count, pos_word_count)\n",
"\n",
"from matplotlib import pyplot as plt\n",
"\n",
"def feat_importances(coef, names):\n",
" imp = coef\n",
" imp,names = zip(*sorted(zip(imp,names)))\n",
" plt.barh(range(len(names)), imp, align='center')\n",
" plt.yticks(range(len(names)), names)\n",
" plt.show()\n",
"\n",
"#feat_importances(model.coef_, features_names)\n",
"#importances = model.feature_importances_\n",
"print(vect.get_feature_names()[0:15])\n",
"# pd.Series(abs(model.coef_[0]), index=vect.get_feature_names).nlargest(10).plot(kind='barh')\n",
" "
],
"execution_count": 143,
"outputs": [
{
"output_type": "stream",
"text": [
"['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']\n",
"['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get']\n",
"distinctive words: 39587\n",
"<FreqDist with 39587 samples and 710579 outcomes>\n",
"[(',', 35269), ('the', 35058), ('.', 32162), ('a', 17910), ('and', 15680), ('of', 15487), ('to', 15420), (\"'\", 15317), ('film', 13804), ('is', 11136), ('in', 10097), ('\"', 9120), ('movie', 9017), ('s', 8854), ('one', 8652), ('that', 7803), ('it', 7756), ('-', 7664), (')', 5742), ('(', 5650)]\n",
"[('rattle', 1), ('unquestioned', 1), ('grittiest', 1), ('170', 1), ('discharge', 1), ('countrysides', 1), ('stafff', 1), ('downgrade', 1), ('overflying', 1), ('paneled', 1), ('vainly', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1), ('^', 1)]\n",
"[(',', 42448), ('the', 41471), ('.', 33714), ('a', 20196), ('and', 19896), ('of', 18636), ('to', 16517), (\"'\", 15268), ('is', 14059), ('in', 11725), ('s', 9659), ('\"', 8492), ('it', 8351), ('that', 8121), ('-', 7931), ('as', 6478), (')', 6039), ('(', 6014), ('with', 5851), ('his', 5588)]\n",
"[('discharge', 1), ('countrysides', 1), ('stafff', 1), ('downgrade', 1), ('persists', 1), ('communities', 1), ('overflying', 1), ('jams', 1), ('paneled', 1), ('vainly', 1), ('westworld', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1), ('clicked', 1)]\n",
"[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), (\"'\", 30585), ('is', 25195), ('in', 21822), ('film', 19034), ('s', 18513), ('\"', 17612), ('it', 16107), ('that', 15924), ('-', 15595), (')', 11781), ('one', 11704), ('(', 11664), ('movie', 11542)]\n",
"751256 832564\n",
"['00', '00 am', '00 feet', '00 for', '00 if', '00 showing', '00 sunday', '00 wasn', '000', '000 000', '000 acre', '000 and', '000 at', '000 bail', '000 before']\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment