Created
November 2, 2016 21:01
-
-
Save kanungo/a7abd140f972951564024db4dd68d87e to your computer and use it in GitHub Desktop.
wk-10b-Fall 2016-random-forest
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# http://www.analyticbridge.com/profiles/blogs/random-forest-in-python" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>x1</th>\n", | |
| " <th>x2</th>\n", | |
| " <th>x3</th>\n", | |
| " <th>x4</th>\n", | |
| " <th>x5</th>\n", | |
| " <th>x6</th>\n", | |
| " <th>x7</th>\n", | |
| " <th>Y</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>4.19</td>\n", | |
| " <td>16.15</td>\n", | |
| " <td>12.05</td>\n", | |
| " <td>32.62</td>\n", | |
| " <td>46.90</td>\n", | |
| " <td>62.87</td>\n", | |
| " <td>64.69</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>3.42</td>\n", | |
| " <td>11.03</td>\n", | |
| " <td>13.21</td>\n", | |
| " <td>13.81</td>\n", | |
| " <td>30.18</td>\n", | |
| " <td>55.04</td>\n", | |
| " <td>62.54</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>5.20</td>\n", | |
| " <td>6.22</td>\n", | |
| " <td>15.15</td>\n", | |
| " <td>35.29</td>\n", | |
| " <td>28.50</td>\n", | |
| " <td>36.53</td>\n", | |
| " <td>91.71</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4.31</td>\n", | |
| " <td>8.82</td>\n", | |
| " <td>16.89</td>\n", | |
| " <td>27.40</td>\n", | |
| " <td>43.41</td>\n", | |
| " <td>65.96</td>\n", | |
| " <td>78.77</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>4.32</td>\n", | |
| " <td>12.75</td>\n", | |
| " <td>18.66</td>\n", | |
| " <td>34.15</td>\n", | |
| " <td>13.97</td>\n", | |
| " <td>51.44</td>\n", | |
| " <td>50.80</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " x1 x2 x3 x4 x5 x6 x7 Y\n", | |
| "0 4.19 16.15 12.05 32.62 46.90 62.87 64.69 1\n", | |
| "1 3.42 11.03 13.21 13.81 30.18 55.04 62.54 1\n", | |
| "2 5.20 6.22 15.15 35.29 28.50 36.53 91.71 1\n", | |
| "3 4.31 8.82 16.89 27.40 43.41 65.96 78.77 1\n", | |
| "4 4.32 12.75 18.66 34.15 13.97 51.44 50.80 1" | |
| ] | |
| }, | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Import data\n", | |
| "import pandas as pd\n", | |
| "df=pd.read_csv(\"/home/drk/kanungo/DNSC6211/w10/wk10b-datav2.csv\")\n", | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Create a training and test set\n", | |
| "from sklearn.model_selection import train_test_split\n", | |
| "train, test = train_test_split(df, train_size=0.75, random_state=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "dfTrain = pd.DataFrame(train, columns=df.columns)\n", | |
| "dfTest = pd.DataFrame(test, columns=df.columns)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.ensemble import RandomForestClassifier" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# the data have to be in a numpy array in order for\n", | |
| "# the random forest algorithm to accept it!\n", | |
| "# Also, output must be separated.\n", | |
| "cols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'] \n", | |
| "colsRes = ['Y']\n", | |
| "cols" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([[ 3.77, 6.84, 21.83, ..., 59.28, 68.43, 82.4 ],\n", | |
| " [ 34.66, 29.6 , 52.32, ..., 95.03, 33.69, 93.63],\n", | |
| " [ 57.75, 16.83, 48.97, ..., 104.63, 36.01, 132.76],\n", | |
| " ..., \n", | |
| " [ 6.91, 12.24, 16.98, ..., 40.74, 49.66, 48.8 ],\n", | |
| " [ 25.8 , 19.43, 69.56, ..., 68.83, 86.62, 43.73],\n", | |
| " [ 4.88, 15.59, 16.9 , ..., 27.09, 99.51, 75.68]])" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "trainArr = dfTrain[cols].values #training array\n", | |
| "trainRes = dfTrain[colsRes].values # training results\n", | |
| "trainArr" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/drk/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", | |
| " app.launch_new_instance()\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", | |
| " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", | |
| " min_samples_leaf=1, min_samples_split=2,\n", | |
| " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", | |
| " oob_score=False, random_state=None, verbose=0,\n", | |
| " warm_start=False)" | |
| ] | |
| }, | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "## Training!\n", | |
| "rf = RandomForestClassifier() # initialize\n", | |
| "rf.fit(trainArr, trainRes.ravel()) # fit the data to the algorithm" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 35, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "## Testing!\n", | |
| "# put the test data in the same format!\n", | |
| "testArr = dfTest[cols].values\n", | |
| "results = rf.predict(testArr)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Add result back to the data frame, so I can compare side-by-side\n", | |
| "dfTest['predictions'] = results" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 37, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>x1</th>\n", | |
| " <th>x2</th>\n", | |
| " <th>x3</th>\n", | |
| " <th>x4</th>\n", | |
| " <th>x5</th>\n", | |
| " <th>x6</th>\n", | |
| " <th>x7</th>\n", | |
| " <th>Y</th>\n", | |
| " <th>predictions</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>529</th>\n", | |
| " <td>39.29</td>\n", | |
| " <td>17.12</td>\n", | |
| " <td>31.87</td>\n", | |
| " <td>23.34</td>\n", | |
| " <td>120.24</td>\n", | |
| " <td>18.47</td>\n", | |
| " <td>188.23</td>\n", | |
| " <td>3</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>403</th>\n", | |
| " <td>33.73</td>\n", | |
| " <td>18.52</td>\n", | |
| " <td>53.75</td>\n", | |
| " <td>24.39</td>\n", | |
| " <td>103.64</td>\n", | |
| " <td>34.97</td>\n", | |
| " <td>122.41</td>\n", | |
| " <td>3</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>464</th>\n", | |
| " <td>17.83</td>\n", | |
| " <td>24.59</td>\n", | |
| " <td>80.69</td>\n", | |
| " <td>15.04</td>\n", | |
| " <td>90.65</td>\n", | |
| " <td>21.46</td>\n", | |
| " <td>127.57</td>\n", | |
| " <td>3</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>142</th>\n", | |
| " <td>16.19</td>\n", | |
| " <td>38.34</td>\n", | |
| " <td>39.56</td>\n", | |
| " <td>44.62</td>\n", | |
| " <td>64.12</td>\n", | |
| " <td>55.20</td>\n", | |
| " <td>52.93</td>\n", | |
| " <td>2</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>521</th>\n", | |
| " <td>40.34</td>\n", | |
| " <td>18.47</td>\n", | |
| " <td>52.77</td>\n", | |
| " <td>7.86</td>\n", | |
| " <td>116.33</td>\n", | |
| " <td>49.84</td>\n", | |
| " <td>125.93</td>\n", | |
| " <td>3</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " x1 x2 x3 x4 x5 x6 x7 Y predictions\n", | |
| "529 39.29 17.12 31.87 23.34 120.24 18.47 188.23 3 3\n", | |
| "403 33.73 18.52 53.75 24.39 103.64 34.97 122.41 3 3\n", | |
| "464 17.83 24.59 80.69 15.04 90.65 21.46 127.57 3 3\n", | |
| "142 16.19 38.34 39.56 44.62 64.12 55.20 52.93 2 2\n", | |
| "521 40.34 18.47 52.77 7.86 116.33 49.84 125.93 3 3" | |
| ] | |
| }, | |
| "execution_count": 37, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dfTest.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " precision recall f1-score support\n", | |
| "\n", | |
| " 1 0.97 1.00 0.98 28\n", | |
| " 2 0.94 0.94 0.94 34\n", | |
| " 3 0.99 0.97 0.98 72\n", | |
| "\n", | |
| "avg / total 0.97 0.97 0.97 134\n", | |
| "\n", | |
| "[[28 0 0]\n", | |
| " [ 1 32 1]\n", | |
| " [ 0 2 70]]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from sklearn import metrics\n", | |
| "\n", | |
| "# summarize the fit of the model\n", | |
| "print(metrics.classification_report(dfTest.Y, dfTest.predictions))\n", | |
| "print(metrics.confusion_matrix(dfTest.Y, dfTest.predictions))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# http://stackoverflow.com/questions/29221849/splitting-data-into-test-and-train-making-a-logistic-regression-model-in-pandas\n", | |
| "import pandas as pd\n", | |
| "from sklearn.model_selection import train_test_split\n", | |
| "import statsmodels.api as sm\n", | |
| "\n", | |
| "quality = pd.read_csv(\"https://courses.edx.org/c4x/MITx/15.071x/asset/quality.csv\")\n", | |
| "train, test = train_test_split(quality, train_size=0.75, random_state=1)\n", | |
| "\n", | |
| "qualityTrain = pd.DataFrame(train, columns=quality.columns)\n", | |
| "qualityTest = pd.DataFrame(test, columns=quality.columns)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "anaconda-cloud": {}, | |
| "kernelspec": { | |
| "display_name": "Python [conda root]", | |
| "language": "python", | |
| "name": "conda-root-py" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment