Created
January 23, 2017 05:14
-
-
Save neil90-db/4e2a3a545ed37a02840307a38f7e2aeb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## SQL to Pandas Example\n", | |
| "\n", | |
| "### This is just a quick demo of converting SQL(In this case HQL) code to Pandas\n", | |
| "#### The example I use is Hortonworks Clickstream example here -> http://hortonworks.com/hadoop-tutorial/how-to-visualize-website-clickstream-data/#section_2 (There is no particular reason why I used this demo)\n", | |
| "#### The relevant poritions to this demo is just the data files regusers.tsv/urlmap.tsv/0.tsv\n", | |
| "#### As well as the SQL Code which we are converting -" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from zipfile import ZipFile\n", | |
| "from io import BytesIO\n", | |
| "import datetime as dt\n", | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "import requests" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "sql = \"\"\"\n", | |
| "select to_date(o.ts) logdate, o.url, o.ip, o.city, upper(o.state) state,\n", | |
| "o.country, p.category, CAST(datediff( from_unixtime( unix_timestamp() ),\n", | |
| "from_unixtime( unix_timestamp(u.birth_dt, 'dd-MMM-yy'))) / 365 AS INT) age, u.gender_cd\n", | |
| "from omniture o\n", | |
| "inner join products p \n", | |
| "on o.url = p.url\n", | |
| "left outer join users u\n", | |
| "on o.swid = concat('{', u.swid , '}')\"\"\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### 1) Download and Create DFs" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#Download and load zip file in memory\n", | |
| "zipurl = 'https://s3.amazonaws.com/hw-sandbox/tutorial8/RefineDemoData.zip'\n", | |
| "r = requests.get(zipurl)\n", | |
| "z = ZipFile(BytesIO(r.content))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#Regusers and urlmap are easily read by passing \\t delimiter\n", | |
| "dfusers = pd.read_csv(z.open('data/users.tsv.gz'), sep = '\\t', compression='gzip')\n", | |
| "dfproduct = pd.read_csv(z.open('data/products.tsv.gz'), sep = '\\t', compression='gzip')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/neil/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (14,15,18,21,22,51,69,75,76,77,79,108,109,113,114,125,176) have mixed types. Specify dtype option on import or set low_memory=False.\n", | |
| " interactivity=interactivity, compiler=compiler, result=result)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "#Omniture data has no header and per the example we only use 53 cols\n", | |
| "dfominture = pd.read_csv(z.open('data/Omniture.0.tsv.gz'), sep = '\\t', compression='gzip', header=None)\n", | |
| "dfominture = dfominture.iloc[:,0:53]\n", | |
| "\n", | |
| "#Create dummy cols\n", | |
| "omniturecols = ['col_' + str(i) for i in range(1,54)]\n", | |
| "\n", | |
| "#Assign dummy columns to omniture dataframe\n", | |
| "dfominture.columns = omniturecols" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#We only need certain columns so lets create a new omniture df witht he columns we want\n", | |
| "dfviewomniture = dfominture[[\"col_2\", \"col_8\", \"col_13\",\"col_14\", \"col_50\", \"col_51\", \"col_53\"]]\n", | |
| "#Next lets assign more meaning full name to those columns\n", | |
| "dfviewomniture.columns = [\"ts\", \"ip\", \"url\", \"swid\", \"city\", \"country\", \"state\"]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### 2) Joining DFs\n", | |
| "First join we will replicate is the inner join, which joins omniture data and products data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "dfprodomni = dfviewomniture.merge(dfproduct, how = 'inner', on = 'url', suffixes=('_omni', '_prod'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Next join is replicating the left join between users and omniture data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#First thing we need to do is create a column in users df that has the SWID column concatentated with { }\n", | |
| "dfusers['swid'] = '{' + dfusers['SWID'] + '}' \n", | |
| "\n", | |
| "#Now we can join the 2\n", | |
| "dffinaljoin = dfprodomni.merge(dfusers, how = 'left', on = 'swid', suffixes=('_omniprod', '_users'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 3) Creating Custom Columns\n", | |
| "The select porition of the sql statement does a few things, such as converting timestamp column to a date column, convert state to all caps, as well as calculating age based on the birthdate and current date" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# First lets convert the ts column to date\n", | |
| "dffinaljoin['logdate'] = dffinaljoin['ts'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d %H:%M:%S').date())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Next lets create the state column with all upper case\n", | |
| "dffinaljoin['state'] = dffinaljoin['state'].apply(lambda x: x.upper()) #Note this will replace the current state col" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Finally lets calculate the age\n", | |
| "# Here is the function we will use to calculate age\n", | |
| "def getAge(x):\n", | |
| " if type(x) == type(np.NaN):\n", | |
| " return np.NaN\n", | |
| " else:\n", | |
| " return int((dt.datetime.now().date() -\\\n", | |
| " dt.datetime.strptime(x,'%d-%b-%y').date()).days / 365)\n", | |
| " \n", | |
| "# Now to pass the function and create the column\n", | |
| "dffinaljoin['age'] = dffinaljoin['BIRTH_DT'].apply(getAge)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# We have final df setup\n", | |
| "# Last thing we should do is create a dataframe wiht only the columns we want\n", | |
| "sql_to_df = dffinaljoin[['logdate', 'url', 'ip', 'city', 'state', 'country', 'category', 'age', 'GENDER_CD']]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>logdate</th>\n", | |
| " <th>url</th>\n", | |
| " <th>ip</th>\n", | |
| " <th>city</th>\n", | |
| " <th>state</th>\n", | |
| " <th>country</th>\n", | |
| " <th>category</th>\n", | |
| " <th>age</th>\n", | |
| " <th>GENDER_CD</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2012-03-15</td>\n", | |
| " <td>http://www.acme.com/SH55126545/VD55170364</td>\n", | |
| " <td>99.122.210.248</td>\n", | |
| " <td>homestead</td>\n", | |
| " <td>FL</td>\n", | |
| " <td>usa</td>\n", | |
| " <td>home&garden</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2012-03-15</td>\n", | |
| " <td>http://www.acme.com/SH55126545/VD55170364</td>\n", | |
| " <td>129.119.158.240</td>\n", | |
| " <td>dallas</td>\n", | |
| " <td>TX</td>\n", | |
| " <td>usa</td>\n", | |
| " <td>home&garden</td>\n", | |
| " <td>26.0</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2012-03-15</td>\n", | |
| " <td>http://www.acme.com/SH55126545/VD55170364</td>\n", | |
| " <td>71.53.206.175</td>\n", | |
| " <td>charlottesville</td>\n", | |
| " <td>VA</td>\n", | |
| " <td>usa</td>\n", | |
| " <td>home&garden</td>\n", | |
| " <td>25.0</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2012-03-15</td>\n", | |
| " <td>http://www.acme.com/SH55126545/VD55170364</td>\n", | |
| " <td>74.240.132.6</td>\n", | |
| " <td>slidell</td>\n", | |
| " <td>LA</td>\n", | |
| " <td>usa</td>\n", | |
| " <td>home&garden</td>\n", | |
| " <td>25.0</td>\n", | |
| " <td>U</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2012-03-15</td>\n", | |
| " <td>http://www.acme.com/SH55126545/VD55170364</td>\n", | |
| " <td>74.190.188.100</td>\n", | |
| " <td>atlanta</td>\n", | |
| " <td>GA</td>\n", | |
| " <td>usa</td>\n", | |
| " <td>home&garden</td>\n", | |
| " <td>25.0</td>\n", | |
| " <td>M</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " logdate url ip \\\n", | |
| "0 2012-03-15 http://www.acme.com/SH55126545/VD55170364 99.122.210.248 \n", | |
| "1 2012-03-15 http://www.acme.com/SH55126545/VD55170364 129.119.158.240 \n", | |
| "2 2012-03-15 http://www.acme.com/SH55126545/VD55170364 71.53.206.175 \n", | |
| "3 2012-03-15 http://www.acme.com/SH55126545/VD55170364 74.240.132.6 \n", | |
| "4 2012-03-15 http://www.acme.com/SH55126545/VD55170364 74.190.188.100 \n", | |
| "\n", | |
| " city state country category age GENDER_CD \n", | |
| "0 homestead FL usa home&garden NaN NaN \n", | |
| "1 dallas TX usa home&garden 26.0 F \n", | |
| "2 charlottesville VA usa home&garden 25.0 F \n", | |
| "3 slidell LA usa home&garden 25.0 U \n", | |
| "4 atlanta GA usa home&garden 25.0 M " | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "#Display first 5 lines\n", | |
| "sql_to_df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>logdate</th>\n", | |
| " <th>url</th>\n", | |
| " <th>ip</th>\n", | |
| " <th>city</th>\n", | |
| " <th>state</th>\n", | |
| " <th>country</th>\n", | |
| " <th>category</th>\n", | |
| " <th>age</th>\n", | |
| " <th>GENDER_CD</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2012-03-15</td>\n", | |
| " <td>http://www.acme.com/SH55126545/VD55170364</td>\n", | |
| " <td>129.119.158.240</td>\n", | |
| " <td>dallas</td>\n", | |
| " <td>TX</td>\n", | |
| " <td>usa</td>\n", | |
| " <td>home&garden</td>\n", | |
| " <td>26.0</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2012-03-15</td>\n", | |
| " <td>http://www.acme.com/SH55126545/VD55170364</td>\n", | |
| " <td>71.53.206.175</td>\n", | |
| " <td>charlottesville</td>\n", | |
| " <td>VA</td>\n", | |
| " <td>usa</td>\n", | |
| " <td>home&garden</td>\n", | |
| " <td>25.0</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>2012-03-15</td>\n", | |
| " <td>http://www.acme.com/SH55126545/VD55170364</td>\n", | |
| " <td>108.18.57.30</td>\n", | |
| " <td>alexandria</td>\n", | |
| " <td>VA</td>\n", | |
| " <td>usa</td>\n", | |
| " <td>home&garden</td>\n", | |
| " <td>29.0</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>2012-03-15</td>\n", | |
| " <td>http://www.acme.com/SH55126545/VD55170364</td>\n", | |
| " <td>76.89.18.233</td>\n", | |
| " <td>bridgeport</td>\n", | |
| " <td>WV</td>\n", | |
| " <td>usa</td>\n", | |
| " <td>home&garden</td>\n", | |
| " <td>25.0</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>2012-03-15</td>\n", | |
| " <td>http://www.acme.com/SH55126545/VD55170364</td>\n", | |
| " <td>99.45.50.183</td>\n", | |
| " <td>nashville</td>\n", | |
| " <td>TN</td>\n", | |
| " <td>usa</td>\n", | |
| " <td>home&garden</td>\n", | |
| " <td>24.0</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " logdate url ip \\\n", | |
| "1 2012-03-15 http://www.acme.com/SH55126545/VD55170364 129.119.158.240 \n", | |
| "2 2012-03-15 http://www.acme.com/SH55126545/VD55170364 71.53.206.175 \n", | |
| "5 2012-03-15 http://www.acme.com/SH55126545/VD55170364 108.18.57.30 \n", | |
| "6 2012-03-15 http://www.acme.com/SH55126545/VD55170364 76.89.18.233 \n", | |
| "8 2012-03-15 http://www.acme.com/SH55126545/VD55170364 99.45.50.183 \n", | |
| "\n", | |
| " city state country category age GENDER_CD \n", | |
| "1 dallas TX usa home&garden 26.0 F \n", | |
| "2 charlottesville VA usa home&garden 25.0 F \n", | |
| "5 alexandria VA usa home&garden 29.0 F \n", | |
| "6 bridgeport WV usa home&garden 25.0 F \n", | |
| "8 nashville TN usa home&garden 24.0 F " | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "#Finally lets filter (SQL Where) on GENDER_CD where equals 'F'\n", | |
| "sql_to_df[(sql_to_df.GENDER_CD == 'F')].head() " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "anaconda-cloud": {}, | |
| "kernelspec": { | |
| "display_name": "Python [default]", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment