maxkleiner · April 28, 2020 17:57
diff --git a/nltk4.ipynb b/nltk4.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "nltk4.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyNrx2irjCKrYGd1b4tiEqaj",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/maxkleiner/81077421e1e26e7036034dbfc5d8abef/nltk4.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NE9-2uRfoemj",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 110
        },
        "outputId": "4e5dad3f-145f-4b71-e492-9e9debe5cb03"
      },
      "source": [
        "import string\n",
        "\n",
        "from nltk.corpus import movie_reviews as mr\n",
        "from nltk.probability import FreqDist\n",
        "from nltk.corpus import stopwords\n",
        "import nltk\n",
        "nltk.download('stopwords')\n",
        "nltk.download('movie_reviews')\n"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n",
            "[nltk_data] Downloading package movie_reviews to /root/nltk_data...\n",
            "[nltk_data]   Package movie_reviews is already up-to-date!\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "t5083YQoort_",
        "colab_type": "text"
      },
      "source": [
        "The number of unique words in a corpus can be very large. We could restrict our model to the features that are the most salient, but to do so we first need to know the top N most frequent words in the corpus.\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FmHyiSbioxl5",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 111
        },
        "outputId": "9dacad1e-fc15-43de-92a7-f07a29f26a52"
      },
      "source": [
        "# nltk.download('stopwords')\n",
        "# nltk.download('movie_reviews')\n",
        "stop = stopwords.words('english') + list(string.punctuation)\n",
        "# This will extract key-value pairs, \n",
        "# where the keys are words and values are its count in the corpus.\n",
        "vocabulary = FreqDist(w.lower() for w in mr.words() if w.lower() not in stop)\n",
        "print(vocabulary.most_common(10))\n",
        "#print(list(vocabulary.keys())[:10])\n",
        "# the least used words\n",
        "print(vocabulary.most_common()[-10:])\n",
        "\n",
        "#word_features = FreqDist(chain(*[i for i,j in documents]))\n",
        "word_features = list(vocabulary.keys())[:10]\n",
        "print(word_features)\n",
        "\n",
        "# vocabulary[1:50]\n",
        "#vocabulary\n",
        "\n",
        "documents = [([w for w in mr.words(i) if w.lower() not in stop], # Words in document.\n",
        "               i.split('/')[0]) # Tag.\n",
        "                for i in mr.fileids()]\n",
        "print(documents[0][:10]) # First document"
      ],
      "execution_count": 38,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049)]\n",
            "[('paneled', 1), ('vainly', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1)]\n",
            "['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get']\n",
            "(['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend', 'continues', 'see', 'life', 'nightmares', 'deal', 'watch', 'movie', 'sorta', 'find', 'critique', 'mind', 'fuck', 'movie', 'teen', 'generation', 'touches', 'cool', 'idea', 'presents', 'bad', 'package', 'makes', 'review', 'even', 'harder', 'one', 'write', 'since', 'generally', 'applaud', 'films', 'attempt', 'break', 'mold', 'mess', 'head', 'lost', 'highway', 'memento', 'good', 'bad', 'ways', 'making', 'types', 'films', 'folks', 'snag', 'one', 'correctly', 'seem', 'taken', 'pretty', 'neat', 'concept', 'executed', 'terribly', 'problems', 'movie', 'well', 'main', 'problem', 'simply', 'jumbled', 'starts', 'normal', 'downshifts', 'fantasy', 'world', 'audience', 'member', 'idea', 'going', 'dreams', 'characters', 'coming', 'back', 'dead', 'others', 'look', 'like', 'dead', 'strange', 'apparitions', 'disappearances', 'looooot', 'chase', 'scenes', 'tons', 'weird', 'things', 'happen', 'simply', 'explained', 'personally', 'mind', 'trying', 'unravel', 'film', 'every', 'give', 'clue', 'get', 'kind', 'fed', 'film', 'biggest', 'problem', 'obviously', 'got', 'big', 'secret', 'hide', 'seems', 'want', 'hide', 'completely', 'final', 'five', 'minutes', 'make', 'things', 'entertaining', 'thrilling', 'even', 'engaging', 'meantime', 'really', 'sad', 'part', 'arrow', 'dig', 'flicks', 'like', 'actually', 'figured', 'half', 'way', 'point', 'strangeness', 'start', 'make', 'little', 'bit', 'sense', 'still', 'make', 'film', 'entertaining', 'guess', 'bottom', 'line', 'movies', 'like', 'always', 'make', 'sure', 'audience', 'even', 'given', 'secret', 'password', 'enter', 'world', 'understanding', 'mean', 'showing', 'melissa', 'sagemiller', 'running', 'away', 'visions', '20', 'minutes', 'throughout', 'movie', 'plain', 'lazy', 'okay', 'get', 'people', 'chasing', 'know', 'really', 'need', 'see', 'giving', 'us', 'different', 'scenes', 'offering', 'insight', 
'strangeness', 'going', 'movie', 'apparently', 'studio', 'took', 'film', 'away', 'director', 'chopped', 'shows', 'might', 'pretty', 'decent', 'teen', 'mind', 'fuck', 'movie', 'somewhere', 'guess', 'suits', 'decided', 'turning', 'music', 'video', 'little', 'edge', 'would', 'make', 'sense', 'actors', 'pretty', 'good', 'part', 'although', 'wes', 'bentley', 'seemed', 'playing', 'exact', 'character', 'american', 'beauty', 'new', 'neighborhood', 'biggest', 'kudos', 'go', 'sagemiller', 'holds', 'throughout', 'entire', 'film', 'actually', 'feeling', 'character', 'unraveling', 'overall', 'film', 'stick', 'entertain', 'confusing', 'rarely', 'excites', 'feels', 'pretty', 'redundant', 'runtime', 'despite', 'pretty', 'cool', 'ending', 'explanation', 'craziness', 'came', 'oh', 'way', 'horror', 'teen', 'slasher', 'flick', 'packaged', 'look', 'way', 'someone', 'apparently', 'assuming', 'genre', 'still', 'hot', 'kids', 'also', 'wrapped', 'production', 'two', 'years', 'ago', 'sitting', 'shelves', 'ever', 'since', 'whatever', 'skip', 'joblo', 'coming', 'nightmare', 'elm', 'street', '3', '7', '10', 'blair', 'witch', '2', '7', '10', 'crow', '9', '10', 'crow', 'salvation', '4', '10', 'lost', 'highway', '10', '10', 'memento', '10', '10', 'others', '9', '10', 'stir', 'echoes', '8', '10'], 'neg')\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2EVJCUKV1WTT",
        "colab_type": "text"
      },
      "source": [
        "Classify the sentiment, moving the data from NLTK to scikit-learn\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "PXWtb4WP1lGJ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import pandas as pd\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.naive_bayes import BernoulliNB\n",
        "from sklearn import svm, datasets\n",
        "from sklearn.metrics import accuracy_score\n",
        "\n",
        "#X = mr.words\n",
        "#y = mr.tag"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OeIdxog_6u6F",
        "colab_type": "text"
      },
      "source": [
        "NLTK's `CategorizedPlaintextCorpusReader` object is not a data type that pandas can consume directly.\n",
        "\n",
        "That being said, you can convert the movie reviews into a list of tuples and then populate a DataFrame as follows:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FvSNA4WP6yk3",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 269
        },
        "outputId": "f64db87e-d8ec-420b-86fc-2d2bbd0ffa0e"
      },
      "source": [
        "import pandas as pd\n",
        "from nltk.corpus import movie_reviews as mr\n",
        "\n",
        "reviews = []\n",
        "for fileid in mr.fileids():\n",
        "    tag, filename = fileid.split('/')\n",
        "    reviews.append((filename, tag, mr.raw(fileid)))\n",
        "\n",
        "df = pd.DataFrame(reviews, columns=['filename', 'tag', 'text'])\n",
        "\n",
        "df.head(7)"
      ],
      "execution_count": 45,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>filename</th>\n",
              "      <th>tag</th>\n",
              "      <th>text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>cv000_29416.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>plot : two teen couples go to a church party ,...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>cv001_19502.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>the happy bastard's quick movie review \\ndamn ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>cv002_17424.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>it is movies like these that make a jaded movi...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>cv003_12683.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>\" quest for camelot \" is warner bros . ' firs...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>cv004_12641.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>synopsis : a mentally unstable man undergoing ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>cv005_29357.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>capsule : in 2176 on the planet mars police ta...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>cv006_17022.txt</td>\n",
              "      <td>neg</td>\n",
              "      <td>so ask yourself what \" 8mm \" ( \" eight millime...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "          filename  tag                                               text\n",
              "0  cv000_29416.txt  neg  plot : two teen couples go to a church party ,...\n",
              "1  cv001_19502.txt  neg  the happy bastard's quick movie review \\ndamn ...\n",
              "2  cv002_17424.txt  neg  it is movies like these that make a jaded movi...\n",
              "3  cv003_12683.txt  neg   \" quest for camelot \" is warner bros . ' firs...\n",
              "4  cv004_12641.txt  neg  synopsis : a mentally unstable man undergoing ...\n",
              "5  cv005_29357.txt  neg  capsule : in 2176 on the planet mars police ta...\n",
              "6  cv006_17022.txt  neg  so ask yourself what \" 8mm \" ( \" eight millime..."
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 45
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZVs4hKKy7tH_",
        "colab_type": "text"
      },
      "source": [
        "We are now ready to classify.\n",
        "The purpose is to use this data for sentiment analysis, now that it has been converted into a pandas DataFrame.\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4bVJAex68By_",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 296
        },
        "outputId": "2b4ed716-57dc-4607-d288-2b7dc16be6bb"
      },
      "source": [
        "X = df['text']\n",
        "y = df['tag']\n",
        "\n",
        "# df= pd.read_csv('../input/movie-review/movie_review.csv') no need cause of df\n",
        "\n",
        "vect = CountVectorizer(ngram_range=(1, 2))\n",
        "\n",
        "X = vect.fit_transform(X)\n",
        "\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
        "\n",
        "model = BernoulliNB(alpha=1.0)\n",
        "# model = svm.SVC(random_state=0)\n",
        "\n",
        "model.fit(X_train, y_train)\n",
        "\n",
        "p_train = model.predict(X_train)\n",
        "p_test = model.predict(X_test)\n",
        "\n",
        "acc_train = accuracy_score(y_train, p_train)\n",
        "acc_test = accuracy_score(y_test, p_test)\n",
        "\n",
        "print(f'Train ACC: {acc_train}, Test ACC: {acc_test}')\n",
        "\n",
        "from sklearn.metrics import confusion_matrix\n",
        "\n",
        "print(confusion_matrix(y_test, p_test, labels=['neg', 'pos']),'\\n')\n",
        "\n",
        "print(df.info())\n",
        "\n"
      ],
      "execution_count": 137,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Train ACC: 0.9986666666666667, Test ACC: 0.748\n",
            "[[254  10]\n",
            " [116 120]] \n",
            "\n",
            "<class 'pandas.core.frame.DataFrame'>\n",
            "RangeIndex: 2000 entries, 0 to 1999\n",
            "Data columns (total 3 columns):\n",
            " #   Column    Non-Null Count  Dtype \n",
            "---  ------    --------------  ----- \n",
            " 0   filename  2000 non-null   object\n",
            " 1   tag       2000 non-null   object\n",
            " 2   text      2000 non-null   object\n",
            "dtypes: object(3)\n",
            "memory usage: 47.0+ KB\n",
            "None\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JtHwyH4JPmMI",
        "colab_type": "text"
      },
      "source": [
        "Finally, we check how words are distributed across the negative and positive reviews."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DRcCpuiFP0v5",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 241
        },
        "outputId": "bb481593-8429-40f8-bba6-2d1206950ae4"
      },
      "source": [
        "from nltk.probability import FreqDist, ConditionalFreqDist\n",
        "word_fd = FreqDist()\n",
        "label_word_fd = ConditionalFreqDist()\n",
        "stops = stopwords.words('english') + list(string.punctuation)\n",
        "\n",
        "#polar = [w.lower() for w in mr.words() if w.lower() not in stops]\n",
        "\n",
        "def best_word_feats(words):\n",
        "    return dict([(word, True) for word in words if word not in stops])\n",
        "\n",
        "all_words = [word.lower() for word in mr.words()]\n",
        "# print first 10 words\n",
        "print (all_words[:10])\n",
        "\n",
        "all_words_clean = []\n",
        "for word in all_words:\n",
        "    if word not in stops:\n",
        "        all_words_clean.append(word)\n",
        " \n",
        "print (all_words_clean[:10])\n",
        "print ('distinctive words: ',len(set(all_words_clean)))\n",
        "\n",
        "all_words_freq = FreqDist(all_words_clean)\n",
        "print (all_words_freq)\n",
        "\n",
        "# for word in best_word_feats(mr.words(categories=['neg'])):\n",
        "for word in mr.words(categories=['neg']):\n",
        "    #word_fd.inc(word.lower())\n",
        "    all_words_freq[word.lower()] +=1 \n",
        "    label_word_fd['neg'][word.lower()] +=1\n",
        "print(all_words_freq.most_common()[:20])\n",
        "print(all_words_freq.most_common()[-20:])\n",
        " \n",
        "for word in mr.words(categories=['pos']):\n",
        "    all_words_freq[word.lower()] += 1\n",
        "    label_word_fd['pos'][word.lower()] += 1  \n",
        "print(label_word_fd['pos'].most_common()[:20])\n",
        "print(label_word_fd['pos'].most_common()[-20:]) \n",
        "\n",
        "neg_word_count = label_word_fd['neg'].N()\n",
        "pos_word_count = label_word_fd['pos'].N()\n",
        "\n",
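        "# print the 20 most frequently occurring words\n",
        "# NOTE(review): `all_words_frequency` is not defined in any cell of this notebook; `all_words_freq` is presumably intended - confirm\n",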
        "print (all_words_frequency.most_common(20))\n",
        "\n",
        "print(neg_word_count, pos_word_count)\n",
        "\n",
        "from matplotlib import pyplot as plt\n",
        "\n",
        "def feat_importances(coef, names):\n",
        "    imp = coef\n",
        "    imp,names = zip(*sorted(zip(imp,names)))\n",
        "    plt.barh(range(len(names)), imp, align='center')\n",
        "    plt.yticks(range(len(names)), names)\n",
        "    plt.show()\n",
        "\n",
        "#feat_importances(model.coef_, features_names)\n",
        "#importances = model.feature_importances_\n",
        "print(vect.get_feature_names()[0:15])\n",
        "# pd.Series(abs(model.coef_[0]), index=vect.get_feature_names).nlargest(10).plot(kind='barh')\n",
        " "
      ],
      "execution_count": 143,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']\n",
            "['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get']\n",
            "distinctive words:  39587\n",
            "<FreqDist with 39587 samples and 710579 outcomes>\n",
            "[(',', 35269), ('the', 35058), ('.', 32162), ('a', 17910), ('and', 15680), ('of', 15487), ('to', 15420), (\"'\", 15317), ('film', 13804), ('is', 11136), ('in', 10097), ('\"', 9120), ('movie', 9017), ('s', 8854), ('one', 8652), ('that', 7803), ('it', 7756), ('-', 7664), (')', 5742), ('(', 5650)]\n",
            "[('rattle', 1), ('unquestioned', 1), ('grittiest', 1), ('170', 1), ('discharge', 1), ('countrysides', 1), ('stafff', 1), ('downgrade', 1), ('overflying', 1), ('paneled', 1), ('vainly', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1), ('^', 1)]\n",
            "[(',', 42448), ('the', 41471), ('.', 33714), ('a', 20196), ('and', 19896), ('of', 18636), ('to', 16517), (\"'\", 15268), ('is', 14059), ('in', 11725), ('s', 9659), ('\"', 8492), ('it', 8351), ('that', 8121), ('-', 7931), ('as', 6478), (')', 6039), ('(', 6014), ('with', 5851), ('his', 5588)]\n",
            "[('discharge', 1), ('countrysides', 1), ('stafff', 1), ('downgrade', 1), ('persists', 1), ('communities', 1), ('overflying', 1), ('jams', 1), ('paneled', 1), ('vainly', 1), ('westworld', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1), ('clicked', 1)]\n",
            "[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), (\"'\", 30585), ('is', 25195), ('in', 21822), ('film', 19034), ('s', 18513), ('\"', 17612), ('it', 16107), ('that', 15924), ('-', 15595), (')', 11781), ('one', 11704), ('(', 11664), ('movie', 11542)]\n",
            "751256 832564\n",
            "['00', '00 am', '00 feet', '00 for', '00 if', '00 showing', '00 sunday', '00 wasn', '000', '000 000', '000 acre', '000 and', '000 at', '000 bail', '000 before']\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "nltk4.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"authorship_tag": "ABX9TyNrx2irjCKrYGd1b4tiEqaj",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/maxkleiner/81077421e1e26e7036034dbfc5d8abef/nltk4.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "NE9-2uRfoemj",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 110
	},
	"outputId": "4e5dad3f-145f-4b71-e492-9e9debe5cb03"
	},
	"source": [
	"import string\n",
	"\n",
	"from nltk.corpus import movie_reviews as mr\n",
	"from nltk.probability import FreqDist\n",
	"from nltk.corpus import stopwords\n",
	"import nltk\n",
	"nltk.download('stopwords')\n",
	"nltk.download('movie_reviews')\n"
	],
	"execution_count": 8,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
	"[nltk_data] Package stopwords is already up-to-date!\n",
	"[nltk_data] Downloading package movie_reviews to /root/nltk_data...\n",
	"[nltk_data] Package movie_reviews is already up-to-date!\n"
	],
	"name": "stdout"
	},
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"True"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 8
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "t5083YQoort_",
	"colab_type": "text"
	},
	"source": [
	"The number of unique words in a corpus can be very large. We could restrict our model to the features that are the most salient, but to do so we first need to know the top N most frequent words in the corpus.\n"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "FmHyiSbioxl5",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 111
	},
	"outputId": "9dacad1e-fc15-43de-92a7-f07a29f26a52"
	},
	"source": [
	"# nltk.download('stopwords')\n",
	"# nltk.download('movie_reviews')\n",
	"stop = stopwords.words('english') + list(string.punctuation)\n",
	"# This will extract key-value pairs, \n",
	"# where the keys are words and values are its count in the corpus.\n",
	"vocabulary = FreqDist(w.lower() for w in mr.words() if w.lower() not in stop)\n",
	"print(vocabulary.most_common(10))\n",
	"#print(list(vocabulary.keys())[:10])\n",
	"# the least used words\n",
	"print(vocabulary.most_common()[-10:])\n",
	"\n",
	"#word_features = FreqDist(chain(*[i for i,j in documents]))\n",
	"word_features = list(vocabulary.keys())[:10]\n",
	"print(word_features)\n",
	"\n",
	"# vocabulary[1:50]\n",
	"#vocabulary\n",
	"\n",
	"documents = [([w for w in mr.words(i) if w.lower() not in stop], # Words in document.\n",
	" i.split('/')[0]) # Tag.\n",
	" for i in mr.fileids()]\n",
	"print(documents[0][:10]) # First document"
	],
	"execution_count": 38,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"[('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049)]\n",
	"[('paneled', 1), ('vainly', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1)]\n",
	"['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get']\n",
	"(['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend', 'continues', 'see', 'life', 'nightmares', 'deal', 'watch', 'movie', 'sorta', 'find', 'critique', 'mind', 'fuck', 'movie', 'teen', 'generation', 'touches', 'cool', 'idea', 'presents', 'bad', 'package', 'makes', 'review', 'even', 'harder', 'one', 'write', 'since', 'generally', 'applaud', 'films', 'attempt', 'break', 'mold', 'mess', 'head', 'lost', 'highway', 'memento', 'good', 'bad', 'ways', 'making', 'types', 'films', 'folks', 'snag', 'one', 'correctly', 'seem', 'taken', 'pretty', 'neat', 'concept', 'executed', 'terribly', 'problems', 'movie', 'well', 'main', 'problem', 'simply', 'jumbled', 'starts', 'normal', 'downshifts', 'fantasy', 'world', 'audience', 'member', 'idea', 'going', 'dreams', 'characters', 'coming', 'back', 'dead', 'others', 'look', 'like', 'dead', 'strange', 'apparitions', 'disappearances', 'looooot', 'chase', 'scenes', 'tons', 'weird', 'things', 'happen', 'simply', 'explained', 'personally', 'mind', 'trying', 'unravel', 'film', 'every', 'give', 'clue', 'get', 'kind', 'fed', 'film', 'biggest', 'problem', 'obviously', 'got', 'big', 'secret', 'hide', 'seems', 'want', 'hide', 'completely', 'final', 'five', 'minutes', 'make', 'things', 'entertaining', 'thrilling', 'even', 'engaging', 'meantime', 'really', 'sad', 'part', 'arrow', 'dig', 'flicks', 'like', 'actually', 'figured', 'half', 'way', 'point', 'strangeness', 'start', 'make', 'little', 'bit', 'sense', 'still', 'make', 'film', 'entertaining', 'guess', 'bottom', 'line', 'movies', 'like', 'always', 'make', 'sure', 'audience', 'even', 'given', 'secret', 'password', 'enter', 'world', 'understanding', 'mean', 'showing', 'melissa', 'sagemiller', 'running', 'away', 'visions', '20', 'minutes', 'throughout', 'movie', 'plain', 'lazy', 'okay', 'get', 'people', 'chasing', 'know', 'really', 'need', 'see', 'giving', 'us', 'different', 'scenes', 'offering', 'insight', 
'strangeness', 'going', 'movie', 'apparently', 'studio', 'took', 'film', 'away', 'director', 'chopped', 'shows', 'might', 'pretty', 'decent', 'teen', 'mind', 'fuck', 'movie', 'somewhere', 'guess', 'suits', 'decided', 'turning', 'music', 'video', 'little', 'edge', 'would', 'make', 'sense', 'actors', 'pretty', 'good', 'part', 'although', 'wes', 'bentley', 'seemed', 'playing', 'exact', 'character', 'american', 'beauty', 'new', 'neighborhood', 'biggest', 'kudos', 'go', 'sagemiller', 'holds', 'throughout', 'entire', 'film', 'actually', 'feeling', 'character', 'unraveling', 'overall', 'film', 'stick', 'entertain', 'confusing', 'rarely', 'excites', 'feels', 'pretty', 'redundant', 'runtime', 'despite', 'pretty', 'cool', 'ending', 'explanation', 'craziness', 'came', 'oh', 'way', 'horror', 'teen', 'slasher', 'flick', 'packaged', 'look', 'way', 'someone', 'apparently', 'assuming', 'genre', 'still', 'hot', 'kids', 'also', 'wrapped', 'production', 'two', 'years', 'ago', 'sitting', 'shelves', 'ever', 'since', 'whatever', 'skip', 'joblo', 'coming', 'nightmare', 'elm', 'street', '3', '7', '10', 'blair', 'witch', '2', '7', '10', 'crow', '9', '10', 'crow', 'salvation', '4', '10', 'lost', 'highway', '10', '10', 'memento', '10', '10', 'others', '9', '10', 'stir', 'echoes', '8', '10'], 'neg')\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "2EVJCUKV1WTT",
	"colab_type": "text"
	},
	"source": [
	"Classify the sentiment, moving the data from NLTK to scikit-learn\n"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "PXWtb4WP1lGJ",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"import pandas as pd\n",
	"from sklearn.feature_extraction.text import CountVectorizer\n",
	"from sklearn.model_selection import train_test_split\n",
	"from sklearn.naive_bayes import BernoulliNB\n",
	"from sklearn import svm, datasets\n",
	"from sklearn.metrics import accuracy_score\n",
	"\n",
	"#X = mr.words\n",
	"#y = mr.tag"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "OeIdxog_6u6F",
	"colab_type": "text"
	},
	"source": [
	"NLTK's `CategorizedPlaintextCorpusReader` object is not a data type that pandas can consume directly.\n",
	"\n",
	"That being said, you can convert the movie reviews into a list of tuples and then populate a DataFrame as follows:"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "FvSNA4WP6yk3",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 269
	},
	"outputId": "f64db87e-d8ec-420b-86fc-2d2bbd0ffa0e"
	},
	"source": [
	"import pandas as pd\n",
	"from nltk.corpus import movie_reviews as mr\n",
	"\n",
	"reviews = []\n",
	"for fileid in mr.fileids():\n",
	" tag, filename = fileid.split('/')\n",
	" reviews.append((filename, tag, mr.raw(fileid)))\n",
	"\n",
	"df = pd.DataFrame(reviews, columns=['filename', 'tag', 'text'])\n",
	"\n",
	"df.head(7)"
	],
	"execution_count": 45,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>filename</th>\n",
	" <th>tag</th>\n",
	" <th>text</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>cv000_29416.txt</td>\n",
	" <td>neg</td>\n",
	" <td>plot : two teen couples go to a church party ,...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>cv001_19502.txt</td>\n",
	" <td>neg</td>\n",
	" <td>the happy bastard's quick movie review \\ndamn ...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>cv002_17424.txt</td>\n",
	" <td>neg</td>\n",
	" <td>it is movies like these that make a jaded movi...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>cv003_12683.txt</td>\n",
	" <td>neg</td>\n",
	" <td>\" quest for camelot \" is warner bros . ' firs...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>cv004_12641.txt</td>\n",
	" <td>neg</td>\n",
	" <td>synopsis : a mentally unstable man undergoing ...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>cv005_29357.txt</td>\n",
	" <td>neg</td>\n",
	" <td>capsule : in 2176 on the planet mars police ta...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>cv006_17022.txt</td>\n",
	" <td>neg</td>\n",
	" <td>so ask yourself what \" 8mm \" ( \" eight millime...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" filename tag text\n",
	"0 cv000_29416.txt neg plot : two teen couples go to a church party ,...\n",
	"1 cv001_19502.txt neg the happy bastard's quick movie review \\ndamn ...\n",
	"2 cv002_17424.txt neg it is movies like these that make a jaded movi...\n",
	"3 cv003_12683.txt neg \" quest for camelot \" is warner bros . ' firs...\n",
	"4 cv004_12641.txt neg synopsis : a mentally unstable man undergoing ...\n",
	"5 cv005_29357.txt neg capsule : in 2176 on the planet mars police ta...\n",
	"6 cv006_17022.txt neg so ask yourself what \" 8mm \" ( \" eight millime..."
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 45
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "ZVs4hKKy7tH_",
	"colab_type": "text"
	},
	"source": [
	"We are now ready to classify.\n",
	"The purpose is to use this data for sentiment analysis, now that the reviews have been converted into a glorious pandas DataFrame.\n"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "4bVJAex68By_",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 296
	},
	"outputId": "2b4ed716-57dc-4607-d288-2b7dc16be6bb"
	},
	"source": [
	"from sklearn.metrics import confusion_matrix\n",
	"\n",
	"# Fixed seed so the train/test split, and therefore the scores printed\n",
	"# below, are the same on every Restart & Run All.\n",
	"RANDOM_STATE = 42\n",
	"\n",
	"y = df['tag']\n",
	"\n",
	"# Bag-of-words counts over the unigrams and bigrams of the raw review text.\n",
	"vect = CountVectorizer(ngram_range=(1, 2))\n",
	"X = vect.fit_transform(df['text'])\n",
	"\n",
	"X_train, X_test, y_train, y_test = train_test_split(\n",
	"    X, y, random_state=RANDOM_STATE\n",
	")\n",
	"\n",
	"model = BernoulliNB(alpha=1.0)\n",
	"model.fit(X_train, y_train)\n",
	"\n",
	"p_train = model.predict(X_train)\n",
	"p_test = model.predict(X_test)\n",
	"\n",
	"acc_train = accuracy_score(y_train, p_train)\n",
	"acc_test = accuracy_score(y_test, p_test)\n",
	"print(f'Train ACC: {acc_train}, Test ACC: {acc_test}')\n",
	"\n",
	"# Rows are the true labels and columns the predicted labels, both in the\n",
	"# order given by `labels`.\n",
	"print(confusion_matrix(y_test, p_test, labels=['neg', 'pos']), '\\n')\n",
	"\n",
	"# DataFrame.info() prints its report itself and returns None, so it is\n",
	"# called bare: wrapping it in print() appended a stray 'None' line.\n",
	"df.info()\n"
	],
	"execution_count": 137,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Train ACC: 0.9986666666666667, Test ACC: 0.748\n",
	"[[254 10]\n",
	" [116 120]] \n",
	"\n",
	"<class 'pandas.core.frame.DataFrame'>\n",
	"RangeIndex: 2000 entries, 0 to 1999\n",
	"Data columns (total 3 columns):\n",
	" # Column Non-Null Count Dtype \n",
	"--- ------ -------------- ----- \n",
	" 0 filename 2000 non-null object\n",
	" 1 tag 2000 non-null object\n",
	" 2 text 2000 non-null object\n",
	"dtypes: object(3)\n",
	"memory usage: 47.0+ KB\n",
	"None\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "JtHwyH4JPmMI",
	"colab_type": "text"
	},
	"source": [
	"Finally, we check the distribution of the words that contribute to negative and positive reviews."
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "DRcCpuiFP0v5",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 241
	},
	"outputId": "bb481593-8429-40f8-bba6-2d1206950ae4"
	},
	"source": [
	"from matplotlib import pyplot as plt\n",
	"from nltk.probability import FreqDist, ConditionalFreqDist\n",
	"\n",
	"# Re-created on every run so that re-executing this cell never accumulates\n",
	"# counts on top of a previous execution.\n",
	"word_fd = FreqDist()\n",
	"label_word_fd = ConditionalFreqDist()\n",
	"stops = stopwords.words('english') + list(string.punctuation)\n",
	"# Hashed copy of the stop list: membership is tested once per corpus token.\n",
	"stop_set = frozenset(stops)\n",
	"\n",
	"\n",
	"def best_word_feats(words):\n",
	"    \"\"\"Map every word of `words` that is not a stop word to True.\"\"\"\n",
	"    return {word: True for word in words if word not in stop_set}\n",
	"\n",
	"\n",
	"def feat_importances(coef, names):\n",
	"    \"\"\"Draw a horizontal bar chart of `coef`, sorted ascending.\n",
	"\n",
	"    coef  -- iterable of importances, one per feature\n",
	"    names -- iterable of feature labels, paired with `coef` by position\n",
	"    \"\"\"\n",
	"    pairs = sorted(zip(coef, names))\n",
	"    if not pairs:\n",
	"        # Nothing to plot; an empty zip() could not be unpacked below.\n",
	"        return\n",
	"    imp, names = zip(*pairs)\n",
	"    positions = range(len(names))\n",
	"    plt.barh(positions, imp, align='center')\n",
	"    plt.yticks(positions, names)\n",
	"    plt.show()\n",
	"\n",
	"\n",
	"all_words = [word.lower() for word in mr.words()]\n",
	"# print first 10 words\n",
	"print(all_words[:10])\n",
	"\n",
	"all_words_clean = [word for word in all_words if word not in stop_set]\n",
	"print(all_words_clean[:10])\n",
	"print('distinctive words: ', len(set(all_words_clean)))\n",
	"\n",
	"# Frequencies of the cleaned vocabulary (stop words and punctuation removed).\n",
	"# It is left untouched by the per-label loop below; incrementing it there\n",
	"# would count every non-stop word a second time.\n",
	"all_words_freq = FreqDist(all_words_clean)\n",
	"print(all_words_freq)\n",
	"\n",
	"# Count every raw token once overall (word_fd) and once under the label of\n",
	"# the review it came from (label_word_fd).\n",
	"for label in ('neg', 'pos'):\n",
	"    for word in mr.words(categories=[label]):\n",
	"        token = word.lower()\n",
	"        word_fd[token] += 1\n",
	"        label_word_fd[label][token] += 1\n",
	"    ranked = label_word_fd[label].most_common()\n",
	"    print(ranked[:20])\n",
	"    print(ranked[-20:])\n",
	"\n",
	"neg_word_count = label_word_fd['neg'].N()\n",
	"pos_word_count = label_word_fd['pos'].N()\n",
	"\n",
	"# print the 20 most frequently occurring words of the cleaned vocabulary\n",
	"print(all_words_freq.most_common(20))\n",
	"\n",
	"print(neg_word_count, pos_word_count)\n",
	"\n",
	"# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;\n",
	"# use its replacement when the installed version provides it.\n",
	"get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names\n",
	"print(list(get_names()[:15]))\n"
	],
	"execution_count": 143,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']\n",
	"['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get']\n",
	"distinctive words: 39587\n",
	"<FreqDist with 39587 samples and 710579 outcomes>\n",
	"[(',', 35269), ('the', 35058), ('.', 32162), ('a', 17910), ('and', 15680), ('of', 15487), ('to', 15420), (\"'\", 15317), ('film', 13804), ('is', 11136), ('in', 10097), ('\"', 9120), ('movie', 9017), ('s', 8854), ('one', 8652), ('that', 7803), ('it', 7756), ('-', 7664), (')', 5742), ('(', 5650)]\n",
	"[('rattle', 1), ('unquestioned', 1), ('grittiest', 1), ('170', 1), ('discharge', 1), ('countrysides', 1), ('stafff', 1), ('downgrade', 1), ('overflying', 1), ('paneled', 1), ('vainly', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1), ('^', 1)]\n",
	"[(',', 42448), ('the', 41471), ('.', 33714), ('a', 20196), ('and', 19896), ('of', 18636), ('to', 16517), (\"'\", 15268), ('is', 14059), ('in', 11725), ('s', 9659), ('\"', 8492), ('it', 8351), ('that', 8121), ('-', 7931), ('as', 6478), (')', 6039), ('(', 6014), ('with', 5851), ('his', 5588)]\n",
	"[('discharge', 1), ('countrysides', 1), ('stafff', 1), ('downgrade', 1), ('persists', 1), ('communities', 1), ('overflying', 1), ('jams', 1), ('paneled', 1), ('vainly', 1), ('westworld', 1), ('snoots', 1), ('obstructions', 1), ('obscuring', 1), ('tangerine', 1), ('timbre', 1), ('powaqqatsi', 1), ('keyboardist', 1), ('capitalized', 1), ('clicked', 1)]\n",
	"[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), (\"'\", 30585), ('is', 25195), ('in', 21822), ('film', 19034), ('s', 18513), ('\"', 17612), ('it', 16107), ('that', 15924), ('-', 15595), (')', 11781), ('one', 11704), ('(', 11664), ('movie', 11542)]\n",
	"751256 832564\n",
	"['00', '00 am', '00 feet', '00 for', '00 if', '00 showing', '00 sunday', '00 wasn', '000', '000 000', '000 acre', '000 and', '000 at', '000 bail', '000 before']\n"
	],
	"name": "stdout"
	}
	]
	}
	]
	}
No results found