Created
June 11, 2014 19:29
-
-
Save spartonia/fed93de37de80167a7f8 to your computer and use it in GitHub Desktop.
AsSquare Data Science Teaser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "DS-Teaser" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "import pandas as pd ", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 1 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "data = pd.read_json('adsquare_teaser_dataMay2014.json')\ndata.head()", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Earning</th>\n <th>Index</th>\n <th>X1</th>\n <th>X2</th>\n <th>X3</th>\n <th>X4</th>\n <th>X5</th>\n <th>Zeta</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0 </th>\n <td>-220.974098</td>\n <td> 0</td>\n <td> 11.255087</td>\n <td> 1.388020</td>\n <td> 2.944195</td>\n <td> 16.699407</td>\n <td> -4.294483</td>\n <td> NaN</td>\n </tr>\n <tr>\n <th>1 </th>\n <td> 221.004807</td>\n <td> 1</td>\n <td> 9.330658</td>\n <td> 15.812443</td>\n <td> 2.145769</td>\n <td> 4.270980</td>\n <td> 24.825712</td>\n <td> 2.603850</td>\n </tr>\n <tr>\n <th>10</th>\n <td> 141.888054</td>\n <td> 10</td>\n <td> 9.477666</td>\n <td> -5.995510</td>\n <td> 1.585362</td>\n <td> 5.228900</td>\n <td> 12.033579</td>\n <td> 1.004182</td>\n </tr>\n <tr>\n <th>11</th>\n <td> 33.492040</td>\n <td> 11</td>\n <td> 12.094787</td>\n <td> 14.534415</td>\n <td> 3.005121</td>\n <td> -8.742146</td>\n <td> -8.208530</td>\n <td> NaN</td>\n </tr>\n <tr>\n <th>12</th>\n <td> 360.774589</td>\n <td> 12</td>\n <td> 19.181625</td>\n <td> 13.264693</td>\n <td> 1.295070</td>\n <td>-11.264031</td>\n <td> 0.109002</td>\n <td> NaN</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows \u00d7 8 columns</p>\n</div>", | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 2, | |
| "text": " Earning Index X1 X2 X3 X4 X5 \\\n0 -220.974098 0 11.255087 1.388020 2.944195 16.699407 -4.294483 \n1 221.004807 1 9.330658 15.812443 2.145769 4.270980 24.825712 \n10 141.888054 10 9.477666 -5.995510 1.585362 5.228900 12.033579 \n11 33.492040 11 12.094787 14.534415 3.005121 -8.742146 -8.208530 \n12 360.774589 12 19.181625 13.264693 1.295070 -11.264031 0.109002 \n\n Zeta \n0 NaN \n1 2.603850 \n10 1.004182 \n11 NaN \n12 NaN \n\n[5 rows x 8 columns]" | |
| } | |
| ], | |
| "prompt_number": 2 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "from pylab import * \nimport numpy as np ", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 4 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "datacols = data.drop(['Index', 'Zeta'], axis=1)\nprint datacols.columns", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": "Index([u'Earning', u'X1', u'X2', u'X3', u'X4', u'X5'], dtype='object')\n" | |
| } | |
| ], | |
| "prompt_number": 6 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": "According to the correlation matrix, X4 has highest relationship(inversely) with Earnings(-8.02). X2 comes after X4 with direct relationships of 0.245. As a rule of thumb, combining two or more variables together to predict some criterion variable will be best or maximally beneficial when each of the predictors correlate with the criterion to a reasnable degree, but where the predictors correlate as little with each other. In our case, X4, X2 meet both requirements. \n" | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "print 'Correlation to Earnings:\\n'\ndatacols.corr()", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": "Correlation to Earnings:\n\n" | |
| }, | |
| { | |
| "html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Earning</th>\n <th>X1</th>\n <th>X2</th>\n <th>X3</th>\n <th>X4</th>\n <th>X5</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>Earning</th>\n <td> 1.000000</td>\n <td> 0.052315</td>\n <td> 0.244927</td>\n <td>-0.014819</td>\n <td>-0.802799</td>\n <td> 0.124557</td>\n </tr>\n <tr>\n <th>X1</th>\n <td> 0.052315</td>\n <td> 1.000000</td>\n <td>-0.108290</td>\n <td> 0.163613</td>\n <td>-0.057357</td>\n <td> 0.067809</td>\n </tr>\n <tr>\n <th>X2</th>\n <td> 0.244927</td>\n <td>-0.108290</td>\n <td> 1.000000</td>\n <td>-0.034498</td>\n <td>-0.054824</td>\n <td>-0.119888</td>\n </tr>\n <tr>\n <th>X3</th>\n <td>-0.014819</td>\n <td> 0.163613</td>\n <td>-0.034498</td>\n <td> 1.000000</td>\n <td> 0.070506</td>\n <td>-0.149118</td>\n </tr>\n <tr>\n <th>X4</th>\n <td>-0.802799</td>\n <td>-0.057357</td>\n <td>-0.054824</td>\n <td> 0.070506</td>\n <td> 1.000000</td>\n <td>-0.047036</td>\n </tr>\n <tr>\n <th>X5</th>\n <td> 0.124557</td>\n <td> 0.067809</td>\n <td>-0.119888</td>\n <td>-0.149118</td>\n <td>-0.047036</td>\n <td> 1.000000</td>\n </tr>\n </tbody>\n</table>\n<p>6 rows \u00d7 6 columns</p>\n</div>", | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 7, | |
| "text": " Earning X1 X2 X3 X4 X5\nEarning 1.000000 0.052315 0.244927 -0.014819 -0.802799 0.124557\nX1 0.052315 1.000000 -0.108290 0.163613 -0.057357 0.067809\nX2 0.244927 -0.108290 1.000000 -0.034498 -0.054824 -0.119888\nX3 -0.014819 0.163613 -0.034498 1.000000 0.070506 -0.149118\nX4 -0.802799 -0.057357 -0.054824 0.070506 1.000000 -0.047036\nX5 0.124557 0.067809 -0.119888 -0.149118 -0.047036 1.000000\n\n[6 rows x 6 columns]" | |
| } | |
| ], | |
| "prompt_number": 7 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "pd.tools.plotting.scatter_matrix(datacols)\nshow() ", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 8 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "subplot(2,1,1)\nscatter(data['X4'], data['Earning'])\ntitle('X4')\nsubplot(2,1,2)\nscatter(data['X2'], data['Earning'])\ntitle('X2')\nshow()\n", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 12 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "# drop null values\nnnData = data[~data['Zeta'].isnull()].drop(['Index'], axis=1)", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 13 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "nnData.corr()", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Earning</th>\n <th>X1</th>\n <th>X2</th>\n <th>X3</th>\n <th>X4</th>\n <th>X5</th>\n <th>Zeta</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>Earning</th>\n <td> 1.000000</td>\n <td>-0.122537</td>\n <td> 0.404612</td>\n <td> 0.129291</td>\n <td>-0.854859</td>\n <td> 0.079286</td>\n <td>-0.759428</td>\n </tr>\n <tr>\n <th>X1</th>\n <td>-0.122537</td>\n <td> 1.000000</td>\n <td>-0.207274</td>\n <td> 0.018572</td>\n <td> 0.265998</td>\n <td> 0.368986</td>\n <td> 0.284664</td>\n </tr>\n <tr>\n <th>X2</th>\n <td> 0.404612</td>\n <td>-0.207274</td>\n <td> 1.000000</td>\n <td> 0.186440</td>\n <td>-0.260368</td>\n <td> 0.002690</td>\n <td>-0.196859</td>\n </tr>\n <tr>\n <th>X3</th>\n <td> 0.129291</td>\n <td> 0.018572</td>\n <td> 0.186440</td>\n <td> 1.000000</td>\n <td>-0.009908</td>\n <td>-0.002038</td>\n <td>-0.082602</td>\n </tr>\n <tr>\n <th>X4</th>\n <td>-0.854859</td>\n <td> 0.265998</td>\n <td>-0.260368</td>\n <td>-0.009908</td>\n <td> 1.000000</td>\n <td> 0.047339</td>\n <td> 0.914270</td>\n </tr>\n <tr>\n <th>X5</th>\n <td> 0.079286</td>\n <td> 0.368986</td>\n <td> 0.002690</td>\n <td>-0.002038</td>\n <td> 0.047339</td>\n <td> 1.000000</td>\n <td> 0.216220</td>\n </tr>\n <tr>\n <th>Zeta</th>\n <td>-0.759428</td>\n <td> 0.284664</td>\n <td>-0.196859</td>\n <td>-0.082602</td>\n <td> 0.914270</td>\n <td> 0.216220</td>\n <td> 1.000000</td>\n </tr>\n </tbody>\n</table>\n<p>7 rows \u00d7 7 columns</p>\n</div>", | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 14, | |
| "text": " Earning X1 X2 X3 X4 X5 Zeta\nEarning 1.000000 -0.122537 0.404612 0.129291 -0.854859 0.079286 -0.759428\nX1 -0.122537 1.000000 -0.207274 0.018572 0.265998 0.368986 0.284664\nX2 0.404612 -0.207274 1.000000 0.186440 -0.260368 0.002690 -0.196859\nX3 0.129291 0.018572 0.186440 1.000000 -0.009908 -0.002038 -0.082602\nX4 -0.854859 0.265998 -0.260368 -0.009908 1.000000 0.047339 0.914270\nX5 0.079286 0.368986 0.002690 -0.002038 0.047339 1.000000 0.216220\nZeta -0.759428 0.284664 -0.196859 -0.082602 0.914270 0.216220 1.000000\n\n[7 rows x 7 columns]" | |
| } | |
| ], | |
| "prompt_number": 14 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": "Since Zeta is correlated with X4, we can fit a line Z = mx + c to estimate the missing values for Zeta, where x is in X4. " | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "nnData['ones'] = 1\neqCols = ['X4', 'ones']\n# nnData[eqCols]\nm, c = np.linalg.lstsq(nnData[eqCols], nnData['Zeta'])[0]\nprint m, c ", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": "0.2152890762 0.828497305179\n" | |
| } | |
| ], | |
| "prompt_number": 15 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "# plot fitting line \nfigure()\nscatter(data['X4'], data['Zeta'], label='Zeta Values')\nplot(nnData['X4'], m*nnData['X4'] + c, 'r', label='Fitted Curve')\ntitle('Fitting Zeta values to a line')\nlegend()\nshow()", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 16 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "# fill null values with new estimates \ndata['nZeta'] = data['Zeta']\ndata['nZeta'].fillna(m*data['X4'] + c, inplace=True)\nstd = data['Zeta'].std()\n\n", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 44 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "data.head()", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Earning</th>\n <th>Index</th>\n <th>X1</th>\n <th>X2</th>\n <th>X3</th>\n <th>X4</th>\n <th>X5</th>\n <th>Zeta</th>\n <th>nZeta</th>\n <th>zZetaR</th>\n <th>nZetaR</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0 </th>\n <td>-220.974098</td>\n <td> 0</td>\n <td> 11.255087</td>\n <td> 1.388020</td>\n <td> 2.944195</td>\n <td> 16.699407</td>\n <td> -4.294483</td>\n <td> NaN</td>\n <td> 4.423697</td>\n <td> 4.715908</td>\n <td> 4.316327</td>\n </tr>\n <tr>\n <th>1 </th>\n <td> 221.004807</td>\n <td> 1</td>\n <td> 9.330658</td>\n <td> 15.812443</td>\n <td> 2.145769</td>\n <td> 4.270980</td>\n <td> 24.825712</td>\n <td> 2.603850</td>\n <td> 2.603850</td>\n <td> 2.603850</td>\n <td> 2.603850</td>\n </tr>\n <tr>\n <th>10</th>\n <td> 141.888054</td>\n <td> 10</td>\n <td> 9.477666</td>\n <td> -5.995510</td>\n <td> 1.585362</td>\n <td> 5.228900</td>\n <td> 12.033579</td>\n <td> 1.004182</td>\n <td> 1.004182</td>\n <td> 1.004182</td>\n <td> 1.004182</td>\n </tr>\n <tr>\n <th>11</th>\n <td> 33.492040</td>\n <td> 11</td>\n <td> 12.094787</td>\n <td> 14.534415</td>\n <td> 3.005121</td>\n <td> -8.742146</td>\n <td> -8.208530</td>\n <td> NaN</td>\n <td>-1.053591</td>\n <td>-0.761380</td>\n <td>-1.160961</td>\n </tr>\n <tr>\n <th>12</th>\n <td> 360.774589</td>\n <td> 12</td>\n <td> 19.181625</td>\n <td> 13.264693</td>\n <td> 1.295070</td>\n <td>-11.264031</td>\n <td> 0.109002</td>\n <td> NaN</td>\n <td>-1.596526</td>\n <td>-1.304314</td>\n <td>-1.703896</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows \u00d7 11 columns</p>\n</div>", | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 45, | |
| "text": " Earning Index X1 X2 X3 X4 X5 \\\n0 -220.974098 0 11.255087 1.388020 2.944195 16.699407 -4.294483 \n1 221.004807 1 9.330658 15.812443 2.145769 4.270980 24.825712 \n10 141.888054 10 9.477666 -5.995510 1.585362 5.228900 12.033579 \n11 33.492040 11 12.094787 14.534415 3.005121 -8.742146 -8.208530 \n12 360.774589 12 19.181625 13.264693 1.295070 -11.264031 0.109002 \n\n Zeta nZeta zZetaR nZetaR \n0 NaN 4.423697 4.715908 4.316327 \n1 2.603850 2.603850 2.603850 2.603850 \n10 1.004182 1.004182 1.004182 1.004182 \n11 NaN -1.053591 -0.761380 -1.160961 \n12 NaN -1.596526 -1.304314 -1.703896 \n\n[5 rows x 11 columns]" | |
| } | |
| ], | |
| "prompt_number": 45 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "# plot new replaced Data \nfigure()\nscatter(data['X4'], data['nZeta'],label='Predicted', color='r')\nscatter(data['X4'], data['Zeta'], label='Original')\nxlabel('X4');ylabel('Zeta')\nlegend(loc='upper left')\nshow()", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 46 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment