neil90-db · January 23, 2017 05:14
diff --git a/SQL_to_Pandas.ipynb b/SQL_to_Pandas.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SQL to Pandas Example\n",
    "\n",
    "### This is just a quick demo of converting SQL(In this case HQL) code to Pandas\n",
    "#### The example I use is Hortonworks Clickstream example here -> http://hortonworks.com/hadoop-tutorial/how-to-visualize-website-clickstream-data/#section_2 (There is no particular reason why I used this demo)\n",
    "#### The relevant poritions to this demo is just the data files regusers.tsv/urlmap.tsv/0.tsv\n",
    "#### As well as the SQL Code which we are converting -"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from zipfile import ZipFile\n",
    "from io import BytesIO\n",
    "import datetime as dt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sql = \"\"\"\n",
    "select to_date(o.ts) logdate, o.url, o.ip, o.city, upper(o.state) state,\n",
    "o.country, p.category, CAST(datediff( from_unixtime( unix_timestamp() ),\n",
    "from_unixtime( unix_timestamp(u.birth_dt, 'dd-MMM-yy'))) / 365  AS INT) age, u.gender_cd\n",
    "from omniture o\n",
    "inner join products p     \n",
    "on o.url = p.url\n",
    "left outer join users u\n",
    "on o.swid = concat('{', u.swid , '}')\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1) Download and Create DFs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Download and load zip file in memory\n",
    "zipurl = 'https://s3.amazonaws.com/hw-sandbox/tutorial8/RefineDemoData.zip'\n",
    "r = requests.get(zipurl)\n",
    "z = ZipFile(BytesIO(r.content))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Regusers and urlmap are easily read by passing \\t delimiter\n",
    "dfusers = pd.read_csv(z.open('data/users.tsv.gz'), sep = '\\t', compression='gzip')\n",
    "dfproduct = pd.read_csv(z.open('data/products.tsv.gz'), sep = '\\t', compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/neil/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (14,15,18,21,22,51,69,75,76,77,79,108,109,113,114,125,176) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    }
   ],
   "source": [
    "#Omniture data has no header and per the example we only use 53 cols\n",
    "dfominture = pd.read_csv(z.open('data/Omniture.0.tsv.gz'), sep = '\\t', compression='gzip', header=None)\n",
    "dfominture = dfominture.iloc[:,0:53]\n",
    "\n",
    "#Create dummy cols\n",
    "omniturecols = ['col_' + str(i) for i in range(1,54)]\n",
    "\n",
    "#Assign dummy columns to omniture dataframe\n",
    "dfominture.columns = omniturecols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#We only need certain columns so lets create a new omniture df witht he columns we want\n",
    "dfviewomniture = dfominture[[\"col_2\", \"col_8\", \"col_13\",\"col_14\", \"col_50\", \"col_51\", \"col_53\"]]\n",
    "#Next lets assign more meaning full name to those columns\n",
    "dfviewomniture.columns = [\"ts\", \"ip\", \"url\", \"swid\", \"city\", \"country\", \"state\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2) Joining DFs\n",
    "First join we will replicate is the inner join, which joins omniture data and products data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dfprodomni = dfviewomniture.merge(dfproduct, how = 'inner', on = 'url', suffixes=('_omni', '_prod'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next join is replicating the left join between users and omniture data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#First thing we need to do is create a column in users df that has the SWID column concatentated with { }\n",
    "dfusers['swid'] = '{' + dfusers['SWID'] + '}' \n",
    "\n",
    "#Now we can join the 2\n",
    "dffinaljoin = dfprodomni.merge(dfusers, how = 'left', on = 'swid', suffixes=('_omniprod', '_users'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3) Creating Custom Columns\n",
    "The select porition of the sql statement does a few things, such as converting timestamp column to a date column, convert state to all caps, as well as calculating age based on the birthdate and current date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# First lets convert the ts column to date\n",
    "dffinaljoin['logdate'] = dffinaljoin['ts'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d %H:%M:%S').date())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Next lets create the state column with all upper case\n",
    "dffinaljoin['state'] = dffinaljoin['state'].apply(lambda x: x.upper()) #Note this will replace the current state col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Finally lets calculate the age\n",
    "# Here is the function we will use to calculate age\n",
    "def getAge(x):\n",
    "    if type(x) == type(np.NaN):\n",
    "        return np.NaN\n",
    "    else:\n",
    "        return int((dt.datetime.now().date() -\\\n",
    "       dt.datetime.strptime(x,'%d-%b-%y').date()).days / 365)\n",
    "    \n",
    "# Now to pass the function and create the column\n",
    "dffinaljoin['age'] = dffinaljoin['BIRTH_DT'].apply(getAge)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# We have final df setup\n",
    "# Last thing we should do is create a dataframe wiht only the columns we want\n",
    "sql_to_df = dffinaljoin[['logdate', 'url', 'ip', 'city', 'state', 'country', 'category', 'age', 'GENDER_CD']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>logdate</th>\n",
       "      <th>url</th>\n",
       "      <th>ip</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>country</th>\n",
       "      <th>category</th>\n",
       "      <th>age</th>\n",
       "      <th>GENDER_CD</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2012-03-15</td>\n",
       "      <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
       "      <td>99.122.210.248</td>\n",
       "      <td>homestead</td>\n",
       "      <td>FL</td>\n",
       "      <td>usa</td>\n",
       "      <td>home&amp;garden</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2012-03-15</td>\n",
       "      <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
       "      <td>129.119.158.240</td>\n",
       "      <td>dallas</td>\n",
       "      <td>TX</td>\n",
       "      <td>usa</td>\n",
       "      <td>home&amp;garden</td>\n",
       "      <td>26.0</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2012-03-15</td>\n",
       "      <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
       "      <td>71.53.206.175</td>\n",
       "      <td>charlottesville</td>\n",
       "      <td>VA</td>\n",
       "      <td>usa</td>\n",
       "      <td>home&amp;garden</td>\n",
       "      <td>25.0</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2012-03-15</td>\n",
       "      <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
       "      <td>74.240.132.6</td>\n",
       "      <td>slidell</td>\n",
       "      <td>LA</td>\n",
       "      <td>usa</td>\n",
       "      <td>home&amp;garden</td>\n",
       "      <td>25.0</td>\n",
       "      <td>U</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2012-03-15</td>\n",
       "      <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
       "      <td>74.190.188.100</td>\n",
       "      <td>atlanta</td>\n",
       "      <td>GA</td>\n",
       "      <td>usa</td>\n",
       "      <td>home&amp;garden</td>\n",
       "      <td>25.0</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      logdate                                        url               ip  \\\n",
       "0  2012-03-15  http://www.acme.com/SH55126545/VD55170364   99.122.210.248   \n",
       "1  2012-03-15  http://www.acme.com/SH55126545/VD55170364  129.119.158.240   \n",
       "2  2012-03-15  http://www.acme.com/SH55126545/VD55170364    71.53.206.175   \n",
       "3  2012-03-15  http://www.acme.com/SH55126545/VD55170364     74.240.132.6   \n",
       "4  2012-03-15  http://www.acme.com/SH55126545/VD55170364   74.190.188.100   \n",
       "\n",
       "              city state country     category   age GENDER_CD  \n",
       "0        homestead    FL     usa  home&garden   NaN       NaN  \n",
       "1           dallas    TX     usa  home&garden  26.0         F  \n",
       "2  charlottesville    VA     usa  home&garden  25.0         F  \n",
       "3          slidell    LA     usa  home&garden  25.0         U  \n",
       "4          atlanta    GA     usa  home&garden  25.0         M  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Display first 5 lines\n",
    "sql_to_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>logdate</th>\n",
       "      <th>url</th>\n",
       "      <th>ip</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>country</th>\n",
       "      <th>category</th>\n",
       "      <th>age</th>\n",
       "      <th>GENDER_CD</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2012-03-15</td>\n",
       "      <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
       "      <td>129.119.158.240</td>\n",
       "      <td>dallas</td>\n",
       "      <td>TX</td>\n",
       "      <td>usa</td>\n",
       "      <td>home&amp;garden</td>\n",
       "      <td>26.0</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2012-03-15</td>\n",
       "      <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
       "      <td>71.53.206.175</td>\n",
       "      <td>charlottesville</td>\n",
       "      <td>VA</td>\n",
       "      <td>usa</td>\n",
       "      <td>home&amp;garden</td>\n",
       "      <td>25.0</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2012-03-15</td>\n",
       "      <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
       "      <td>108.18.57.30</td>\n",
       "      <td>alexandria</td>\n",
       "      <td>VA</td>\n",
       "      <td>usa</td>\n",
       "      <td>home&amp;garden</td>\n",
       "      <td>29.0</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2012-03-15</td>\n",
       "      <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
       "      <td>76.89.18.233</td>\n",
       "      <td>bridgeport</td>\n",
       "      <td>WV</td>\n",
       "      <td>usa</td>\n",
       "      <td>home&amp;garden</td>\n",
       "      <td>25.0</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2012-03-15</td>\n",
       "      <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
       "      <td>99.45.50.183</td>\n",
       "      <td>nashville</td>\n",
       "      <td>TN</td>\n",
       "      <td>usa</td>\n",
       "      <td>home&amp;garden</td>\n",
       "      <td>24.0</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      logdate                                        url               ip  \\\n",
       "1  2012-03-15  http://www.acme.com/SH55126545/VD55170364  129.119.158.240   \n",
       "2  2012-03-15  http://www.acme.com/SH55126545/VD55170364    71.53.206.175   \n",
       "5  2012-03-15  http://www.acme.com/SH55126545/VD55170364     108.18.57.30   \n",
       "6  2012-03-15  http://www.acme.com/SH55126545/VD55170364     76.89.18.233   \n",
       "8  2012-03-15  http://www.acme.com/SH55126545/VD55170364     99.45.50.183   \n",
       "\n",
       "              city state country     category   age GENDER_CD  \n",
       "1           dallas    TX     usa  home&garden  26.0         F  \n",
       "2  charlottesville    VA     usa  home&garden  25.0         F  \n",
       "5       alexandria    VA     usa  home&garden  29.0         F  \n",
       "6       bridgeport    WV     usa  home&garden  25.0         F  \n",
       "8        nashville    TN     usa  home&garden  24.0         F  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Finally lets filter (SQL Where) on GENDER_CD where equals 'F'\n",
    "sql_to_df[(sql_to_df.GENDER_CD == 'F')].head() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## SQL to Pandas Example\n",
	"\n",
	"### This is just a quick demo of converting SQL(In this case HQL) code to Pandas\n",
	"#### The example I use is Hortonworks Clickstream example here -> http://hortonworks.com/hadoop-tutorial/how-to-visualize-website-clickstream-data/#section_2 (There is no particular reason why I used this demo)\n",
	"#### The relevant poritions to this demo is just the data files regusers.tsv/urlmap.tsv/0.tsv\n",
	"#### As well as the SQL Code which we are converting -"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from zipfile import ZipFile\n",
	"from io import BytesIO\n",
	"import datetime as dt\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"import requests"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"sql = \"\"\"\n",
	"select to_date(o.ts) logdate, o.url, o.ip, o.city, upper(o.state) state,\n",
	"o.country, p.category, CAST(datediff( from_unixtime( unix_timestamp() ),\n",
	"from_unixtime( unix_timestamp(u.birth_dt, 'dd-MMM-yy'))) / 365 AS INT) age, u.gender_cd\n",
	"from omniture o\n",
	"inner join products p \n",
	"on o.url = p.url\n",
	"left outer join users u\n",
	"on o.swid = concat('{', u.swid , '}')\"\"\""
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 1) Download and Create DFs"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#Download and load zip file in memory\n",
	"zipurl = 'https://s3.amazonaws.com/hw-sandbox/tutorial8/RefineDemoData.zip'\n",
	"r = requests.get(zipurl)\n",
	"z = ZipFile(BytesIO(r.content))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"#Regusers and urlmap are easily read by passing \\t delimiter\n",
	"dfusers = pd.read_csv(z.open('data/users.tsv.gz'), sep = '\\t', compression='gzip')\n",
	"dfproduct = pd.read_csv(z.open('data/products.tsv.gz'), sep = '\\t', compression='gzip')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/home/neil/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (14,15,18,21,22,51,69,75,76,77,79,108,109,113,114,125,176) have mixed types. Specify dtype option on import or set low_memory=False.\n",
	" interactivity=interactivity, compiler=compiler, result=result)\n"
	]
	}
	],
	"source": [
	"#Omniture data has no header and per the example we only use 53 cols\n",
	"dfominture = pd.read_csv(z.open('data/Omniture.0.tsv.gz'), sep = '\\t', compression='gzip', header=None)\n",
	"dfominture = dfominture.iloc[:,0:53]\n",
	"\n",
	"#Create dummy cols\n",
	"omniturecols = ['col_' + str(i) for i in range(1,54)]\n",
	"\n",
	"#Assign dummy columns to omniture dataframe\n",
	"dfominture.columns = omniturecols"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"#We only need certain columns so lets create a new omniture df witht he columns we want\n",
	"dfviewomniture = dfominture[[\"col_2\", \"col_8\", \"col_13\",\"col_14\", \"col_50\", \"col_51\", \"col_53\"]]\n",
	"#Next lets assign more meaning full name to those columns\n",
	"dfviewomniture.columns = [\"ts\", \"ip\", \"url\", \"swid\", \"city\", \"country\", \"state\"]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 2) Joining DFs\n",
	"First join we will replicate is the inner join, which joins omniture data and products data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"dfprodomni = dfviewomniture.merge(dfproduct, how = 'inner', on = 'url', suffixes=('_omni', '_prod'))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Next join is replicating the left join between users and omniture data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#First thing we need to do is create a column in users df that has the SWID column concatentated with { }\n",
	"dfusers['swid'] = '{' + dfusers['SWID'] + '}' \n",
	"\n",
	"#Now we can join the 2\n",
	"dffinaljoin = dfprodomni.merge(dfusers, how = 'left', on = 'swid', suffixes=('_omniprod', '_users'))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## 3) Creating Custom Columns\n",
	"The select porition of the sql statement does a few things, such as converting timestamp column to a date column, convert state to all caps, as well as calculating age based on the birthdate and current date"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# First lets convert the ts column to date\n",
	"dffinaljoin['logdate'] = dffinaljoin['ts'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d %H:%M:%S').date())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Next lets create the state column with all upper case\n",
	"dffinaljoin['state'] = dffinaljoin['state'].apply(lambda x: x.upper()) #Note this will replace the current state col"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Finally lets calculate the age\n",
	"# Here is the function we will use to calculate age\n",
	"def getAge(x):\n",
	" if type(x) == type(np.NaN):\n",
	" return np.NaN\n",
	" else:\n",
	" return int((dt.datetime.now().date() -\\\n",
	" dt.datetime.strptime(x,'%d-%b-%y').date()).days / 365)\n",
	" \n",
	"# Now to pass the function and create the column\n",
	"dffinaljoin['age'] = dffinaljoin['BIRTH_DT'].apply(getAge)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# We have final df setup\n",
	"# Last thing we should do is create a dataframe wiht only the columns we want\n",
	"sql_to_df = dffinaljoin[['logdate', 'url', 'ip', 'city', 'state', 'country', 'category', 'age', 'GENDER_CD']]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>logdate</th>\n",
	" <th>url</th>\n",
	" <th>ip</th>\n",
	" <th>city</th>\n",
	" <th>state</th>\n",
	" <th>country</th>\n",
	" <th>category</th>\n",
	" <th>age</th>\n",
	" <th>GENDER_CD</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>2012-03-15</td>\n",
	" <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
	" <td>99.122.210.248</td>\n",
	" <td>homestead</td>\n",
	" <td>FL</td>\n",
	" <td>usa</td>\n",
	" <td>home&garden</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>2012-03-15</td>\n",
	" <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
	" <td>129.119.158.240</td>\n",
	" <td>dallas</td>\n",
	" <td>TX</td>\n",
	" <td>usa</td>\n",
	" <td>home&garden</td>\n",
	" <td>26.0</td>\n",
	" <td>F</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>2012-03-15</td>\n",
	" <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
	" <td>71.53.206.175</td>\n",
	" <td>charlottesville</td>\n",
	" <td>VA</td>\n",
	" <td>usa</td>\n",
	" <td>home&garden</td>\n",
	" <td>25.0</td>\n",
	" <td>F</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>2012-03-15</td>\n",
	" <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
	" <td>74.240.132.6</td>\n",
	" <td>slidell</td>\n",
	" <td>LA</td>\n",
	" <td>usa</td>\n",
	" <td>home&garden</td>\n",
	" <td>25.0</td>\n",
	" <td>U</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>2012-03-15</td>\n",
	" <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
	" <td>74.190.188.100</td>\n",
	" <td>atlanta</td>\n",
	" <td>GA</td>\n",
	" <td>usa</td>\n",
	" <td>home&garden</td>\n",
	" <td>25.0</td>\n",
	" <td>M</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" logdate url ip \\\n",
	"0 2012-03-15 http://www.acme.com/SH55126545/VD55170364 99.122.210.248 \n",
	"1 2012-03-15 http://www.acme.com/SH55126545/VD55170364 129.119.158.240 \n",
	"2 2012-03-15 http://www.acme.com/SH55126545/VD55170364 71.53.206.175 \n",
	"3 2012-03-15 http://www.acme.com/SH55126545/VD55170364 74.240.132.6 \n",
	"4 2012-03-15 http://www.acme.com/SH55126545/VD55170364 74.190.188.100 \n",
	"\n",
	" city state country category age GENDER_CD \n",
	"0 homestead FL usa home&garden NaN NaN \n",
	"1 dallas TX usa home&garden 26.0 F \n",
	"2 charlottesville VA usa home&garden 25.0 F \n",
	"3 slidell LA usa home&garden 25.0 U \n",
	"4 atlanta GA usa home&garden 25.0 M "
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"#Display first 5 lines\n",
	"sql_to_df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>logdate</th>\n",
	" <th>url</th>\n",
	" <th>ip</th>\n",
	" <th>city</th>\n",
	" <th>state</th>\n",
	" <th>country</th>\n",
	" <th>category</th>\n",
	" <th>age</th>\n",
	" <th>GENDER_CD</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>2012-03-15</td>\n",
	" <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
	" <td>129.119.158.240</td>\n",
	" <td>dallas</td>\n",
	" <td>TX</td>\n",
	" <td>usa</td>\n",
	" <td>home&garden</td>\n",
	" <td>26.0</td>\n",
	" <td>F</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>2012-03-15</td>\n",
	" <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
	" <td>71.53.206.175</td>\n",
	" <td>charlottesville</td>\n",
	" <td>VA</td>\n",
	" <td>usa</td>\n",
	" <td>home&garden</td>\n",
	" <td>25.0</td>\n",
	" <td>F</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>2012-03-15</td>\n",
	" <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
	" <td>108.18.57.30</td>\n",
	" <td>alexandria</td>\n",
	" <td>VA</td>\n",
	" <td>usa</td>\n",
	" <td>home&garden</td>\n",
	" <td>29.0</td>\n",
	" <td>F</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>2012-03-15</td>\n",
	" <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
	" <td>76.89.18.233</td>\n",
	" <td>bridgeport</td>\n",
	" <td>WV</td>\n",
	" <td>usa</td>\n",
	" <td>home&garden</td>\n",
	" <td>25.0</td>\n",
	" <td>F</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>8</th>\n",
	" <td>2012-03-15</td>\n",
	" <td>http://www.acme.com/SH55126545/VD55170364</td>\n",
	" <td>99.45.50.183</td>\n",
	" <td>nashville</td>\n",
	" <td>TN</td>\n",
	" <td>usa</td>\n",
	" <td>home&garden</td>\n",
	" <td>24.0</td>\n",
	" <td>F</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" logdate url ip \\\n",
	"1 2012-03-15 http://www.acme.com/SH55126545/VD55170364 129.119.158.240 \n",
	"2 2012-03-15 http://www.acme.com/SH55126545/VD55170364 71.53.206.175 \n",
	"5 2012-03-15 http://www.acme.com/SH55126545/VD55170364 108.18.57.30 \n",
	"6 2012-03-15 http://www.acme.com/SH55126545/VD55170364 76.89.18.233 \n",
	"8 2012-03-15 http://www.acme.com/SH55126545/VD55170364 99.45.50.183 \n",
	"\n",
	" city state country category age GENDER_CD \n",
	"1 dallas TX usa home&garden 26.0 F \n",
	"2 charlottesville VA usa home&garden 25.0 F \n",
	"5 alexandria VA usa home&garden 29.0 F \n",
	"6 bridgeport WV usa home&garden 25.0 F \n",
	"8 nashville TN usa home&garden 24.0 F "
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"#Finally lets filter (SQL Where) on GENDER_CD where equals 'F'\n",
	"sql_to_df[(sql_to_df.GENDER_CD == 'F')].head() "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python [default]",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}
No results found