Created
April 10, 2017 16:01
-
-
Save dspp779/5a9597e2d8a2518b80fb0ad191ea8463 to your computer and use it in GitHub Desktop.
Text classification with SVM example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Lab 7: Text Classification with SVM" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.datasets import load_files\n", | |
| "movie_reviews_data_folder = 'movie_reviews/txt_sentoken'\n", | |
| "dataset = load_files(movie_reviews_data_folder, shuffle=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Target class labels (`neg` = negative review, `pos` = positive review)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['neg', 'pos']" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dataset.target_names" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Training data: file paths of the loaded review documents" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array(['movie_reviews/txt_sentoken/neg/cv000_29416.txt',\n", | |
| " 'movie_reviews/txt_sentoken/neg/cv001_19502.txt',\n", | |
| " 'movie_reviews/txt_sentoken/neg/cv002_17424.txt', ...,\n", | |
| " 'movie_reviews/txt_sentoken/pos/cv997_5046.txt',\n", | |
| " 'movie_reviews/txt_sentoken/pos/cv998_14111.txt',\n", | |
| " 'movie_reviews/txt_sentoken/pos/cv999_13106.txt'], \n", | |
| " dtype='<U46')" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dataset.filenames" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "2000" | |
| ] | |
| }, | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(dataset.data)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Split data into train data and test data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.model_selection import train_test_split\n", | |
| "docs_train, docs_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.25, random_state=42)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "TASK: Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.pipeline import Pipeline\n", | |
| "from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
| "from sklearn.svm import SVC, LinearSVC, NuSVC\n", | |
| "\n", | |
| "# your code here ..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "pipeline = Pipeline([\n", | |
| " ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),\n", | |
| " ('clf', LinearSVC(C=1000)),\n", | |
| "])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "TASK: Build a grid search to find out whether unigrams or bigrams are more useful.\n", | |
| "\n", | |
| "Fit the pipeline on the training set using grid search for the parameters" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.model_selection import GridSearchCV\n", | |
| "\n", | |
| "# your code here ..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "GridSearchCV(cv=None, error_score='raise',\n", | |
| " estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", | |
| " dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n", | |
| " lowercase=True, max_df=0.95, max_features=None, min_df=3,\n", | |
| " ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,\n", | |
| " ...ax_iter=1000,\n", | |
| " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", | |
| " verbose=0))]),\n", | |
| " fit_params={}, iid=True, n_jobs=-1,\n", | |
| " param_grid={'vect__ngram_range': [(1, 1), (1, 2)]},\n", | |
| " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", | |
| " scoring=None, verbose=0)" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "parameters = {\n", | |
| " 'vect__ngram_range': [(1, 1), (1, 2)],\n", | |
| "}\n", | |
| "grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)\n", | |
| "grid_search.fit(docs_train, y_train)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Print the mean and std for each candidate along with the parameter settings for all the candidates explored by grid search." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "0 params - {'vect__ngram_range': (1, 1)}; mean - 0.84; std - 0.00\n", | |
| "1 params - {'vect__ngram_range': (1, 2)}; mean - 0.85; std - 0.02\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "n_candidates = len(grid_search.cv_results_['params'])\n", | |
| "for i in range(n_candidates):\n", | |
| " print(i, 'params - %s; mean - %0.2f; std - %0.2f'\n", | |
| " % (grid_search.cv_results_['params'][i],\n", | |
| " grid_search.cv_results_['mean_test_score'][i],\n", | |
| " grid_search.cv_results_['std_test_score'][i]))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Predict the outcome on the testing set and store it in a variable named y_predicted" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "y_predicted = grid_search.predict(docs_test)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Print the classification report" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " precision recall f1-score support\n", | |
| "\n", | |
| " neg 0.85 0.82 0.83 256\n", | |
| " pos 0.82 0.85 0.83 244\n", | |
| "\n", | |
| "avg / total 0.83 0.83 0.83 500\n", | |
| "\n", | |
| "[[210 46]\n", | |
| " [ 37 207]]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from sklearn import metrics\n", | |
| "print(metrics.classification_report(y_test, y_predicted, target_names=dataset.target_names))\n", | |
| "print(metrics.confusion_matrix(y_test, y_predicted))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Print and plot the confusion matrix" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "cm = metrics.confusion_matrix(y_test, y_predicted)\n", | |
| "print(cm)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.1" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment