nltk4.ipynb
Created April 28, 2020 17:57
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "nltk4.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyNrx2irjCKrYGd1b4tiEqaj",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/maxkleiner/81077421e1e26e7036034dbfc5d8abef/nltk4.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NE9-2uRfoemj",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 110
        },
        "outputId": "4e5dad3f-145f-4b71-e492-9e9debe5cb03"
      },
      "source": [
        "import string\n",
        "\n",
        "from nltk.corpus import movie_reviews as mr\n",
        "from nltk.probability import FreqDist\n",
        "from nltk.corpus import stopwords\n",
        "import nltk\n",
        "nltk.download('stopwords')\n",
        "nltk.download('movie_reviews')\n"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n",
            "[nltk_data] Downloading package movie_reviews to /root/nltk_data...\n",
            "[nltk_data]   Package movie_reviews is already up-to-date!\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "t5083YQoort_",
        "colab_type": "text"
      },
      "source": [
| "But the no. of unique words in a corpus can be very huge. We could restrict our model to extract features that are the most salient. But to do so we need to know what is the top N most frequent words in the corpus. \n" | |
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FmHyiSbioxl5",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 111
        },
        "outputId": "9dacad1e-fc15-43de-92a7-f07a29f26a52"
      },
      "source": [
| "# nltk.download('stopwords')\n", | |
| "# nltk.download('movie_reviews')\n", | |
| "stop = stopwords.words('english') + list(string.punctuation)\n", | |
| "# This will extract key-value pairs, \n", | |
| "# where the keys are words and values are its count in the corpus.\n", | |
| "vocabulary = FreqDist(w.lower() for w in mr.words() if w.lower() not in stop)\n", | |
| "print(vocabulary.most_common(10))\n", | |
| "#print(list(vocabulary.keys())[:10])\n", | |
| "# the least uses words\n", | |
| "print(vocabulary.most_common()[-10:])\n", | |
| "\n", | |
| "#word_features = FreqDist(chain(*[i for i,j in documents]))\n", | |
| "word_features = list(vocabulary.keys())[:10]\n", | |
| "print(word_features)\n", | |
| "\n", | |
| "# vocabulary[1:50]\n", | |
| "#vocabulary\n", | |
| "\n", | |
| "documents = [([w for w in mr.words(i) if w.lower() not in stop], # Words in document.\n", | |
| " i.split('/')[0]) # Tag.\n", | |
| " for i in mr.fileids()]\n", | |
| "print(documents[0][:10]) # First document" | |
      ],
      "execution_count": 38,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049)]\n",
            "[('paneled', 1), ('vainly', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1)]\n",
            "['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get']\n",
            "(['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend', 'continues', 'see', 'life', 'nightmares', 'deal', 'watch', 'movie', 'sorta', 'find', 'critique', 'mind', 'fuck', 'movie', 'teen', 'generation', 'touches', 'cool', 'idea', 'presents', 'bad', 'package', 'makes', 'review', 'even', 'harder', 'one', 'write', 'since', 'generally', 'applaud', 'films', 'attempt', 'break', 'mold', 'mess', 'head', 'lost', 'highway', 'memento', 'good', 'bad', 'ways', 'making', 'types', 'films', 'folks', 'snag', 'one', 'correctly', 'seem', 'taken', 'pretty', 'neat', 'concept', 'executed', 'terribly', 'problems', 'movie', 'well', 'main', 'problem', 'simply', 'jumbled', 'starts', 'normal', 'downshifts', 'fantasy', 'world', 'audience', 'member', 'idea', 'going', 'dreams', 'characters', 'coming', 'back', 'dead', 'others', 'look', 'like', 'dead', 'strange', 'apparitions', 'disappearances', 'looooot', 'chase', 'scenes', 'tons', 'weird', 'things', 'happen', 'simply', 'explained', 'personally', 'mind', 'trying', 'unravel', 'film', 'every', 'give', 'clue', 'get', 'kind', 'fed', 'film', 'biggest', 'problem', 'obviously', 'got', 'big', 'secret', 'hide', 'seems', 'want', 'hide', 'completely', 'final', 'five', 'minutes', 'make', 'things', 'entertaining', 'thrilling', 'even', 'engaging', 'meantime', 'really', 'sad', 'part', 'arrow', 'dig', 'flicks', 'like', 'actually', 'figured', 'half', 'way', 'point', 'strangeness', 'start', 'make', 'little', 'bit', 'sense', 'still', 'make', 'film', 'entertaining', 'guess', 'bottom', 'line', 'movies', 'like', 'always', 'make', 'sure', 'audience', 'even', 'given', 'secret', 'password', 'enter', 'world', 'understanding', 'mean', 'showing', 'melissa', 'sagemiller', 'running', 'away', 'visions', '20', 'minutes', 'throughout', 'movie', 'plain', 'lazy', 'okay', 'get', 'people', 'chasing', 'know', 'really', 'need', 'see', 'giving', 'us', 'different', 'scenes', 'offering', 'insight', 'strangeness', 'going', 'movie', 'apparently', 'studio', 'took', 'film', 'away', 'director', 'chopped', 'shows', 'might', 'pretty', 'decent', 'teen', 'mind', 'fuck', 'movie', 'somewhere', 'guess', 'suits', 'decided', 'turning', 'music', 'video', 'little', 'edge', 'would', 'make', 'sense', 'actors', 'pretty', 'good', 'part', 'although', 'wes', 'bentley', 'seemed', 'playing', 'exact', 'character', 'american', 'beauty', 'new', 'neighborhood', 'biggest', 'kudos', 'go', 'sagemiller', 'holds', 'throughout', 'entire', 'film', 'actually', 'feeling', 'character', 'unraveling', 'overall', 'film', 'stick', 'entertain', 'confusing', 'rarely', 'excites', 'feels', 'pretty', 'redundant', 'runtime', 'despite', 'pretty', 'cool', 'ending', 'explanation', 'craziness', 'came', 'oh', 'way', 'horror', 'teen', 'slasher', 'flick', 'packaged', 'look', 'way', 'someone', 'apparently', 'assuming', 'genre', 'still', 'hot', 'kids', 'also', 'wrapped', 'production', 'two', 'years', 'ago', 'sitting', 'shelves', 'ever', 'since', 'whatever', 'skip', 'joblo', 'coming', 'nightmare', 'elm', 'street', '3', '7', '10', 'blair', 'witch', '2', '7', '10', 'crow', '9', '10', 'crow', 'salvation', '4', '10', 'lost', 'highway', '10', '10', 'memento', '10', '10', 'others', '9', '10', 'stir', 'echoes', '8', '10'], 'neg')\n"
          ],
          "name": "stdout"
        }
      ]
    },
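    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Before switching to scikit-learn, here is a minimal sketch (added for illustration, not part of the original run) of the classic pure-NLTK route: restrict the features to the top-N vocabulary words and train an `nltk.NaiveBayesClassifier` on the `documents` list built above. The helper name `document_features` and the cutoff `N = 2000` are assumptions, not choices from the original notebook."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Sketch: bag-of-words features over the top-N vocabulary, NLTK Naive Bayes.\n",
        "# `document_features` and N are illustrative choices, not from the original notebook.\n",
        "import random\n",
        "\n",
        "N = 2000  # assumed vocabulary cutoff\n",
        "top_words = [w for w, _ in vocabulary.most_common(N)]\n",
        "\n",
        "def document_features(words):\n",
        "    words = set(words)\n",
        "    return {'contains(%s)' % w: (w in words) for w in top_words}\n",
        "\n",
        "featuresets = [(document_features(words), tag) for words, tag in documents]\n",
        "random.shuffle(featuresets)\n",
        "train_set, test_set = featuresets[200:], featuresets[:200]\n",
        "\n",
        "classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
        "print(nltk.classify.accuracy(classifier, test_set))"
      ],
      "execution_count": null,
      "outputs": []
    },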
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2EVJCUKV1WTT",
        "colab_type": "text"
      },
      "source": [
| "Classify the sentiment from NLTK to Scikit\n" | |
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "PXWtb4WP1lGJ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import pandas as pd\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.naive_bayes import BernoulliNB\n",
        "from sklearn import svm, datasets\n",
        "from sklearn.metrics import accuracy_score\n",
        "\n",
        "#X = mr.words\n",
        "#y = mr.tag"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OeIdxog_6u6F",
        "colab_type": "text"
      },
      "source": [
| "An NLTK's CategorizedPlaintextCorpusReader object isn't a dtype for pandas.\n", | |
| "\n", | |
| "That being said, you can convert the movie reviews into list of tuples and then populate a dataframe as such:" | |
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FvSNA4WP6yk3",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 269
        },
        "outputId": "f64db87e-d8ec-420b-86fc-2d2bbd0ffa0e"
      },
      "source": [
        "import pandas as pd\n",
        "from nltk.corpus import movie_reviews as mr\n",
        "\n",
        "reviews = []\n",
        "for fileid in mr.fileids():\n",
        "    tag, filename = fileid.split('/')\n",
        "    reviews.append((filename, tag, mr.raw(fileid)))\n",
        "\n",
        "df = pd.DataFrame(reviews, columns=['filename', 'tag', 'text'])\n",
        "\n",
        "df.head(7)"
      ],
      "execution_count": 45,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>filename</th>\n",
              "      <th>tag</th>\n",
              "      <th>text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>cv000_29416.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>plot : two teen couples go to a church party ,...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>cv001_19502.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>the happy bastard's quick movie review \\ndamn ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>cv002_17424.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>it is movies like these that make a jaded movi...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>cv003_12683.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>\" quest for camelot \" is warner bros . ' firs...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>cv004_12641.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>synopsis : a mentally unstable man undergoing ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>cv005_29357.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>capsule : in 2176 on the planet mars police ta...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>cv006_17022.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>so ask yourself what \" 8mm \" ( \" eight millime...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "          filename  tag                                               text\n",
              "0  cv000_29416.txt  neg  plot : two teen couples go to a church party ,...\n",
              "1  cv001_19502.txt  neg  the happy bastard's quick movie review \\ndamn ...\n",
              "2  cv002_17424.txt  neg  \" quest for camelot \" is warner bros . ' firs...\n",
              "3  cv003_12683.txt  neg  \" quest for camelot \" is warner bros . ' firs...\n",
              "4  cv004_12641.txt  neg  synopsis : a mentally unstable man undergoing ...\n",
              "5  cv005_29357.txt  neg  capsule : in 2176 on the planet mars police ta...\n",
              "6  cv006_17022.txt  neg  so ask yourself what \" 8mm \" ( \" eight millime...\n"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 45
        }
      ]
    },
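    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A quick sanity check (added for illustration, not part of the original run): the movie_reviews corpus contains 1000 `neg` and 1000 `pos` reviews, and `value_counts` confirms the label balance before we train anything."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Sketch: verify the label distribution of the freshly built DataFrame.\n",
        "print(df['tag'].value_counts())\n",
        "print(df.shape)"
      ],
      "execution_count": null,
      "outputs": []
    },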
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZVs4hKKy7tH_",
        "colab_type": "text"
      },
      "source": [
| "We are now ready to classify\n", | |
| "The purpose is to use this data for sentiment analysis. while converting the data using pandas, glorious dataframe\n" | |
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4bVJAex68By_",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 296
        },
        "outputId": "2b4ed716-57dc-4607-d288-2b7dc16be6bb"
      },
      "source": [
        "X = df['text']\n",
        "y = df['tag']\n",
        "\n",
| "# df= pd.read_csv('../input/movie-review/movie_review.csv') no need cause of df\n", | |
| "\n", | |
| "vect = CountVectorizer(ngram_range=(1, 2))\n", | |
| "\n", | |
| "X = vect.fit_transform(X)\n", | |
| "\n", | |
| "X_train, X_test, y_train, y_test = train_test_split(X, y)\n", | |
| "\n", | |
| "model = BernoulliNB(alpha=1.0)\n", | |
| "# model = svm.SVC(random_state=0)\n", | |
| "\n", | |
| "model.fit(X_train, y_train)\n", | |
| "\n", | |
| "p_train = model.predict(X_train)\n", | |
| "p_test = model.predict(X_test)\n", | |
| "\n", | |
| "acc_train = accuracy_score(y_train, p_train)\n", | |
| "acc_test = accuracy_score(y_test, p_test)\n", | |
| "\n", | |
| "print(f'Train ACC: {acc_train}, Test ACC: {acc_test}')\n", | |
| "\n", | |
| "from sklearn.metrics import confusion_matrix\n", | |
| "\n", | |
| "print(confusion_matrix(y_test, p_test, labels=['neg', 'pos']),'\\n')\n", | |
| "\n", | |
| "print(df.info())\n", | |
| "\n" | |
| ], | |
| "execution_count": 137, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Train ACC: 0.9986666666666667, Test ACC: 0.748\n", | |
| "[[254 10]\n", | |
| " [116 120]] \n", | |
| "\n", | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "RangeIndex: 2000 entries, 0 to 1999\n", | |
| "Data columns (total 3 columns):\n", | |
| " # Column Non-Null Count Dtype \n", | |
| "--- ------ -------------- ----- \n", | |
| " 0 filename 2000 non-null object\n", | |
| " 1 tag 2000 non-null object\n", | |
| " 2 text 2000 non-null object\n", | |
| "dtypes: object(3)\n", | |
| "memory usage: 47.0+ KB\n", | |
| "None\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
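    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Note that above, `CountVectorizer` is fitted on the full corpus *before* `train_test_split`, so the test-set vocabulary leaks into training. A leak-free variant (a sketch added for illustration, not part of the original run) splits the raw texts first and fits the vectorizer inside a scikit-learn `Pipeline` on the training fold only:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Sketch: fit the vectorizer on the training texts only via a Pipeline.\n",
        "from sklearn.pipeline import Pipeline\n",
        "\n",
        "X_raw, y_raw = df['text'], df['tag']\n",
        "Xtr, Xte, ytr, yte = train_test_split(X_raw, y_raw, random_state=0)\n",
        "\n",
        "pipe = Pipeline([\n",
        "    ('vect', CountVectorizer(ngram_range=(1, 2))),\n",
        "    ('clf', BernoulliNB(alpha=1.0)),\n",
        "])\n",
        "pipe.fit(Xtr, ytr)\n",
        "print('Test ACC:', pipe.score(Xte, yte))"
      ],
      "execution_count": null,
      "outputs": []
    },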
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JtHwyH4JPmMI",
        "colab_type": "text"
      },
      "source": [
| "In the end we check the distribution of negative and positive contribution words" | |
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DRcCpuiFP0v5",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 241
        },
        "outputId": "bb481593-8429-40f8-bba6-2d1206950ae4"
      },
      "source": [
        "from nltk.probability import FreqDist, ConditionalFreqDist\n",
        "word_fd = FreqDist()\n",
        "label_word_fd = ConditionalFreqDist()\n",
        "stops = stopwords.words('english') + list(string.punctuation)\n",
        "\n",
        "#polar = [w.lower() for w in mr.words() if w.lower() not in stops]\n",
        "\n",
        "def best_word_feats(words):\n",
        "    return dict([(word, True) for word in words if word not in stops])\n",
        "\n",
        "all_words = [word.lower() for word in mr.words()]\n",
        "# print first 10 words\n",
        "print(all_words[:10])\n",
        "\n",
        "all_words_clean = []\n",
        "for word in all_words:\n",
        "    if word not in stops:\n",
        "        all_words_clean.append(word)\n",
        "\n",
        "print(all_words_clean[:10])\n",
| "print ('distinctive words: ',len(set(all_words_clean)))\n", | |
| "\n", | |
| "all_words_freq = FreqDist(all_words_clean)\n", | |
| "print (all_words_freq)\n", | |
| "\n", | |
| "# for word in best_word_feats(mr.words(categories=['neg'])):\n", | |
| "for word in mr.words(categories=['neg']):\n", | |
| " #word_fd.inc(word.lower())\n", | |
| " all_words_freq[word.lower()] +=1 \n", | |
| " label_word_fd['neg'][word.lower()] +=1\n", | |
| "print(all_words_freq.most_common()[:20])\n", | |
| "print(all_words_freq.most_common()[-20:])\n", | |
| " \n", | |
| "for word in mr.words(categories=['pos']):\n", | |
| " all_words_freq[word.lower()] += 1\n", | |
| " label_word_fd['pos'][word.lower()] += 1 \n", | |
| "print(label_word_fd['pos'].most_common()[:20])\n", | |
| "print(label_word_fd['pos'].most_common()[-20:]) \n", | |
| "\n", | |
| "neg_word_count = label_word_fd['neg'].N()\n", | |
| "pos_word_count = label_word_fd['pos'].N()\n", | |
| "\n", | |
| "# print 10 most frequently occurring words\n", | |
| "print (all_words_frequency.most_common(20))\n", | |
| "\n", | |
| "print(neg_word_count, pos_word_count)\n", | |
| "\n", | |
| "from matplotlib import pyplot as plt\n", | |
| "\n", | |
| "def feat_importances(coef, names):\n", | |
| " imp = coef\n", | |
| " imp,names = zip(*sorted(zip(imp,names)))\n", | |
| " plt.barh(range(len(names)), imp, align='center')\n", | |
| " plt.yticks(range(len(names)), names)\n", | |
| " plt.show()\n", | |
| "\n", | |
| "#feat_importances(model.coef_, features_names)\n", | |
| "#importances = model.feature_importances_\n", | |
| "print(vect.get_feature_names()[0:15])\n", | |
| "# pd.Series(abs(model.coef_[0]), index=vect.get_feature_names).nlargest(10).plot(kind='barh')\n", | |
| " " | |
      ],
      "execution_count": 143,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']\n",
            "['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get']\n",
| "distinctive words: 39587\n", | |
| "<FreqDist with 39587 samples and 710579 outcomes>\n", | |
| "[(',', 35269), ('the', 35058), ('.', 32162), ('a', 17910), ('and', 15680), ('of', 15487), ('to', 15420), (\"'\", 15317), ('film', 13804), ('is', 11136), ('in', 10097), ('\"', 9120), ('movie', 9017), ('s', 8854), ('one', 8652), ('that', 7803), ('it', 7756), ('-', 7664), (')', 5742), ('(', 5650)]\n", | |
| "[('rattle', 1), ('unquestioned', 1), ('grittiest', 1), ('170', 1), ('discharge', 1), ('countrysides', 1), ('stafff', 1), ('downgrade', 1), ('overflying', 1), ('paneled', 1), ('vainly', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1), ('^', 1)]\n", | |
| "[(',', 42448), ('the', 41471), ('.', 33714), ('a', 20196), ('and', 19896), ('of', 18636), ('to', 16517), (\"'\", 15268), ('is', 14059), ('in', 11725), ('s', 9659), ('\"', 8492), ('it', 8351), ('that', 8121), ('-', 7931), ('as', 6478), (')', 6039), ('(', 6014), ('with', 5851), ('his', 5588)]\n", | |
| "[('discharge', 1), ('countrysides', 1), ('stafff', 1), ('downgrade', 1), ('persists', 1), ('communities', 1), ('overflying', 1), ('jams', 1), ('paneled', 1), ('vainly', 1), ('westworld', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1), ('clicked', 1)]\n", | |
| "[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), (\"'\", 30585), ('is', 25195), ('in', 21822), ('film', 19034), ('s', 18513), ('\"', 17612), ('it', 16107), ('that', 15924), ('-', 15595), (')', 11781), ('one', 11704), ('(', 11664), ('movie', 11542)]\n", | |
| "751256 832564\n", | |
| "['00', '00 am', '00 feet', '00 for', '00 if', '00 showing', '00 sunday', '00 wasn', '000', '000 000', '000 acre', '000 and', '000 at', '000 bail', '000 before']\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| } | |
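,
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The per-label frequency distributions and word counts collected above are exactly the input needed to score how strongly each word is associated with one class. A minimal sketch (added for illustration, not part of the original run) using NLTK's `BigramAssocMeasures.chi_sq`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Sketch: chi-square association of each word with the pos/neg labels,\n",
        "# built from label_word_fd, neg_word_count and pos_word_count above.\n",
        "from nltk.metrics import BigramAssocMeasures\n",
        "\n",
        "freq_all = label_word_fd['pos'] + label_word_fd['neg']  # combined per-word counts\n",
        "total_count = neg_word_count + pos_word_count\n",
        "\n",
        "word_scores = {}\n",
        "for word, freq in freq_all.items():\n",
        "    pos_score = BigramAssocMeasures.chi_sq(\n",
        "        label_word_fd['pos'][word], (freq, pos_word_count), total_count)\n",
        "    neg_score = BigramAssocMeasures.chi_sq(\n",
        "        label_word_fd['neg'][word], (freq, neg_word_count), total_count)\n",
        "    word_scores[word] = pos_score + neg_score\n",
        "\n",
        "print(sorted(word_scores.items(), key=lambda kv: kv[1], reverse=True)[:20])"
      ],
      "execution_count": null,
      "outputs": []
    }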
  ]
}