Created
April 10, 2017 16:01
-
-
Save dspp779/5a9597e2d8a2518b80fb0ad191ea8463 to your computer and use it in GitHub Desktop.
Text classification with SVM example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Lab 7: Text Classification with SVM" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.datasets import load_files\n", | |
| "movie_reviews_data_folder = 'movie_reviews/txt_sentoken'\n", | |
| "dataset = load_files(movie_reviews_data_folder, shuffle=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Target class labels (`neg` = negative review, `pos` = positive review)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['neg', 'pos']" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dataset.target_names" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Training data: file paths of the loaded review documents" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array(['movie_reviews/txt_sentoken/neg/cv000_29416.txt',\n", | |
| " 'movie_reviews/txt_sentoken/neg/cv001_19502.txt',\n", | |
| " 'movie_reviews/txt_sentoken/neg/cv002_17424.txt', ...,\n", | |
| " 'movie_reviews/txt_sentoken/pos/cv997_5046.txt',\n", | |
| " 'movie_reviews/txt_sentoken/pos/cv998_14111.txt',\n", | |
| " 'movie_reviews/txt_sentoken/pos/cv999_13106.txt'], \n", | |
| " dtype='<U46')" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dataset.filenames" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "2000" | |
| ] | |
| }, | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(dataset.data)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Split data into train data and test data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.model_selection import train_test_split\n", | |
| "docs_train, docs_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.25, random_state=42)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "TASK: Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.pipeline import Pipeline\n", | |
| "from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
| "from sklearn.svm import SVC, LinearSVC, NuSVC\n", | |
| "\n", | |
| "# your code here ..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "pipeline = Pipeline([\n", | |
| " ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),\n", | |
| " ('clf', LinearSVC(C=1000)),\n", | |
| "])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "TASK: Build a grid search to find out whether unigrams or bigrams are more useful.\n", | |
| "\n", | |
| "Fit the pipeline on the training set using grid search for the parameters" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.model_selection import GridSearchCV\n", | |
| "\n", | |
| "# your code here ..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "GridSearchCV(cv=None, error_score='raise',\n", | |
| " estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", | |
| " dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n", | |
| " lowercase=True, max_df=0.95, max_features=None, min_df=3,\n", | |
| " ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,\n", | |
| " ...ax_iter=1000,\n", | |
| " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", | |
| " verbose=0))]),\n", | |
| " fit_params={}, iid=True, n_jobs=-1,\n", | |
| " param_grid={'vect__ngram_range': [(1, 1), (1, 2)]},\n", | |
| " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", | |
| " scoring=None, verbose=0)" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "parameters = {\n", | |
| " 'vect__ngram_range': [(1, 1), (1, 2)],\n", | |
| "}\n", | |
| "grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)\n", | |
| "grid_search.fit(docs_train, y_train)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Print the mean and std for each candidate along with the parameter settings for all the candidates explored by grid search." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "0 params - {'vect__ngram_range': (1, 1)}; mean - 0.84; std - 0.00\n", | |
| "1 params - {'vect__ngram_range': (1, 2)}; mean - 0.85; std - 0.02\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "n_candidates = len(grid_search.cv_results_['params'])\n", | |
| "for i in range(n_candidates):\n", | |
| " print(i, 'params - %s; mean - %0.2f; std - %0.2f'\n", | |
| " % (grid_search.cv_results_['params'][i],\n", | |
| " grid_search.cv_results_['mean_test_score'][i],\n", | |
| " grid_search.cv_results_['std_test_score'][i]))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Predict the outcome on the testing set and store it in a variable named y_predicted" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "y_predicted = grid_search.predict(docs_test)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Print the classification report" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " precision recall f1-score support\n", | |
| "\n", | |
| " neg 0.85 0.82 0.83 256\n", | |
| " pos 0.82 0.85 0.83 244\n", | |
| "\n", | |
| "avg / total 0.83 0.83 0.83 500\n", | |
| "\n", | |
| "[[210 46]\n", | |
| " [ 37 207]]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from sklearn import metrics\n", | |
| "print(metrics.classification_report(y_test, y_predicted, target_names=dataset.target_names))\n", | |
| "print(metrics.confusion_matrix(y_test, y_predicted))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Print and plot the confusion matrix" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "cm = metrics.confusion_matrix(y_test, y_predicted)\n", | |
| "print(cm)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.1" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment