Last active
January 25, 2018 19:10
-
-
Save grahamanderson/986d806e3b53e12b81f4b086034a369d to your computer and use it in GitHub Desktop.
Census API––Percentage Change by Year––Proof of Concept
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "## Percentage Change for Census Variables (by Year)\n- proof of concept\n- Use to get the pct change (by year) b/w any census variable (acs1, sfl)\n- Loads external 'nationalP_places.txt' file to get PlaceFips for a city\n- Myself, I need to get Place-specific changes between variables...Like the Population Growth Mean Income. Also, I don't always know which years are going to be valid. \n- This example searches a variable for its most current year and the most vintage year\n- The Census has a septuagintillion variables...hopefully future iterations will make this easier for me :)" | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "import requests\nfrom bs4 import BeautifulSoup\nimport pandas as pd\nimport numpy as np\nimport us\nimport json\n\nimport sqlite3\nDB = './data/sqlite3/acs1.db'\nTABLE = 'Variables'\n\nfrom IPython.core.interactiveshell import InteractiveShell\nInteractiveShell.ast_node_interactivity = \"all\"\n\njson_keys = json.load(open('./data/api_key.json'))\nCENSUS_API_KEY = json_keys['census-api']", | |
| "execution_count": 207, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "### Functions" | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "#### Helper Functions" | |
| }, | |
| { | |
| "metadata": { | |
| "code_folding": [ | |
| 0, | |
| 7, | |
| 18, | |
| 33, | |
| 56 | |
| ], | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "def get_asc_variable(df, search, limit=10):\n '''Search in Variable and Label Columns \n get_asc_variable(df2, 'Population', 10)\n '''\n return df[(df.label.str.contains(search,case=False)) |\n df.index.str.contains(search, case=False)][['label', 'concept']][0:limit]\n\n\ndef geoid(row):\n if row['PLACEFP'] != '00000':\n return '16000US' + row['STATEFP'] + row['PLACEFP']\n\n\n# elif row['city_fips'] != '00000':\n# return '16000US' + row['state_fips'] + row['city_fips']\n else:\n return 'unknown'\n\n\ndef load_national_places():\n df = pd.read_csv(\n 'https://www2.census.gov/geo/docs/reference/codes/files/national_places.txt',\n sep=\"|\",\n encoding='ISO-8859-1',\n dtype={'STATEFP': 'str',\n 'PLACEFP': 'str',\n 'FUNCSTAT': 'str'})\n\n df['geoid'] = df.apply(lambda row: geoid(row), axis=1)\n df['name'] = df.PLACENAME.apply(lambda x: ' '.join(x.split(' ')[0:-1]))\n df['type'] = df.PLACENAME.apply(lambda x: x.split(' ')[-1])\n return df\n\n\ndef get_place_id(df, place, state):\n '''\n Helper Function to pct_change_var\n get_place_id(places,'Osage', 'MN')\n '''\n if not state.isnumeric():\n state_fips = us.states.lookup(state).fips\n else:\n state_fips = state\n\n try:\n place = df[(df.PLACENAME.str.contains(place, case=True))\n & (df.STATEFP == state_fips)\n & (df.TYPE.str.contains('County Subdivision') == False)][\n 'PLACEFP'].iloc[0]\n return str(place)\n\n except Exception as e:\n print(e)\n return 'unknown'\n pass\n\n\ndef vintage_or_current_var(r, places, vintage=False):\n ''' Helper Function to pct_change_var\n Find the First Valid Year with a Variable Value\n https://api.census.gov/data/2014/acs1?get=NAME,B02015_009E,B02015_009M&for=state:*\n https://api.census.gov/data/2014/acs1?get=NAME,B01001_001E&for=place:05000&in=state:48\n '''\n found = False\n\n place_id = get_place_id(places, r['city'], r['state'])\n state_fips = us.states.lookup(r['state']).fips\n\n # look for the end or the start year...and should we go backwards or forwards\n if vintage == True:\n start = r['range'][0]\n end = r['range'][-1]\n step = 1\n else:\n start = r['range'][-1]\n end = r['range'][0]\n step = -1 # reverse order\n #print(start, end, step)\n\n # Find a Year with a Valid Value\n for year in range(start, end, step):\n\n res = requests.get(\n '{0}/{1}/{2}?get={3}&for=place:{4}&in=state:{5}&key={6}'.format(\n r['base'], year, r['ds'], r['key'], place_id, state_fips, r[\n 'census_api_key']))\n\n if (res.status_code != 404):\n\n # return Variable (as key) with year and variable value (as vals)\n try:\n return [\n r['city'], r['state'], r['key'], r['meta'], year,\n int(res.json()[1][0])\n ]\n found = True\n except Exception as e:\n #print(e)\n pass\n else:\n found = False\n\n # Give Up: Never Found a Valid Year..Variable might not exist or URL is malformed\n if not found: return np.nan", | |
| "execution_count": 233, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "#### Main Function" | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "# Main Functon\ndef pct_change_var(r):\n '''Main Function'''\n vintage = vintage_or_current_var(r, places, vintage=False)\n current = vintage_or_current_var(r, places, vintage=True)\n vintage_yr = vintage[4]\n vintage_val = vintage[-1]\n current_yr = current[4]\n current_val = current[-1]\n pct_change = abs((current[-1] - vintage[-1])) / vintage[-1]\n\n return [r['city'], r['state'], pct_change, r['key'], r['meta'],current_yr, current_val, vintage_yr, vintage_val]", | |
| "execution_count": 57, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "---\n## Loading/Cleaning" | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "#### Load Place Fips Dataset (from census url)" | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "# Load External PLACE Fips (national_places.txt)into a Dataframe...from Census Site\nplaces = load_national_places()\nplaces.sample(1)", | |
| "execution_count": 55, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 55, | |
| "data": { | |
| "text/plain": " STATE STATEFP PLACEFP PLACENAME TYPE FUNCSTAT \\\n23042 NY 36 02297 Antwerp town County Subdivision A \n\n COUNTY geoid name type \n23042 Jefferson County 16000US3602297 Antwerp town ", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>STATE</th>\n <th>STATEFP</th>\n <th>PLACEFP</th>\n <th>PLACENAME</th>\n <th>TYPE</th>\n <th>FUNCSTAT</th>\n <th>COUNTY</th>\n <th>geoid</th>\n <th>name</th>\n <th>type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>23042</th>\n <td>NY</td>\n <td>36</td>\n <td>02297</td>\n <td>Antwerp town</td>\n <td>County Subdivision</td>\n <td>A</td>\n <td>Jefferson County</td>\n <td>16000US3602297</td>\n <td>Antwerp</td>\n <td>town</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "#### Load ACS1 Variable Table (from census)" | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "# Fairly long load...might want to load the file first rather than directly in pandas :\\\njson = pd.read_json('https://api.census.gov/data/2014/acs1/variables.json')\nacs1_vars = json.variables.apply(pd.Series) # Explode JSON into multiple columns\n\n# After the first json load...query sqlite instead...milliseconds faster ;)\n# con = sqlite3.connect(DB)\n# acs1_vars = pd.read_sql('Select * FROM {0} LIMIT {1}'.format(TABLE,2), con)\n# con.close()", | |
| "execution_count": 235, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "#### Alternate Asc1 Load––using Requests (probably faster)" | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "# Alternate Loading Works--might be a lot faster\n# resp = requests.get('https://api.census.gov/data/2014/acs1/variables.json')\n# data = resp.json()\n# df = pd.DataFrame(data)\n# acs1_vars_v2 = df.variables.apply(pd.Series) # Explode JSON into multiple columns", | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "#### Write Acs1 to SQLite" | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "df = acs1_vars.reset_index()[['label', 'concept', 'group']]\n\n# Write the Acs1 Table\ncon = sqlite3.connect(DB)\ndf.to_sql(TABLE, con=con, index=False, if_exists='replace')\ncon.close()\n\n# Database Check\ncon = sqlite3.connect(DB)\npd.read_sql('Select * FROM {0} LIMIT {1}'.format(TABLE,2), con)\ncon.close()", | |
| "execution_count": 234, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 234, | |
| "data": { | |
| "text/plain": " label concept \\\n0 American Indian Area/Alaska Native Area/Hawaii... Selectable Geographies \n1 American Indian Area/Alaska Native Area/Hawaii... Selectable Geographies \n\n group \n0 N/A \n1 N/A ", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>concept</th>\n <th>group</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>American Indian Area/Alaska Native Area/Hawaii...</td>\n <td>Selectable Geographies</td>\n <td>N/A</td>\n </tr>\n <tr>\n <th>1</th>\n <td>American Indian Area/Alaska Native Area/Hawaii...</td>\n <td>Selectable Geographies</td>\n <td>N/A</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "## Find Census Variables that Apply\nthere are...like what...64k of them?! Search around." | |
| }, | |
| { | |
| "metadata": { | |
| "code_folding": [ | |
| 1 | |
| ], | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "# Search within label (and index) for Population\nprint('Searching for Population...')\nget_asc_variable(acs1_vars, 'Population', limit=3)\n\nprint(\"Searching for part of Variable...\")\n# Search within label (and index) for Population\nget_asc_variable(acs1_vars, 'B02003', limit =3)", | |
| "execution_count": 221, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": "Searching for Population...\n", | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 221, | |
| "data": { | |
| "text/plain": " label \\\nB02003_002E Population of one race: \nB02003_002M Margin of Error for!!Population of one race: \nB02003_003E Population of one race:!!White alone \n\n concept \nB02003_002E B02003. Detailed Race \nB02003_002M B02003. Detailed Race \nB02003_003E B02003. Detailed Race ", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>concept</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>B02003_002E</th>\n <td>Population of one race:</td>\n <td>B02003. Detailed Race</td>\n </tr>\n <tr>\n <th>B02003_002M</th>\n <td>Margin of Error for!!Population of one race:</td>\n <td>B02003. Detailed Race</td>\n </tr>\n <tr>\n <th>B02003_003E</th>\n <td>Population of one race:!!White alone</td>\n <td>B02003. Detailed Race</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": "Searching for part of Variable...\n", | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 221, | |
| "data": { | |
| "text/plain": " label concept\nB02003_001E Total: B02003. Detailed Race\nB02003_001M Margin of Error for!!Total: B02003. Detailed Race\nB02003_002E Population of one race: B02003. Detailed Race", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>concept</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>B02003_001E</th>\n <td>Total:</td>\n <td>B02003. Detailed Race</td>\n </tr>\n <tr>\n <th>B02003_001M</th>\n <td>Margin of Error for!!Total:</td>\n <td>B02003. Detailed Race</td>\n </tr>\n <tr>\n <th>B02003_002E</th>\n <td>Population of one race:</td>\n <td>B02003. Detailed Race</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "---\n## Analysis\nproof of concept.. <br>My intention is to use this function in an pandas apply function---over a dataframe." | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "#### Get Percentage Change in Total Population for Austin, TX \nIn future, should work for any variable in the acs1, sf1, or acs5<br/>\nacs1 variables on https://api.census.gov/data/2014/acs1/variables.html" | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "# First and Last Valid Year Value for the Census Variable (Variable, Year, Value)\nparams = pd.Series({\n 'city': 'Austin',\n 'state': 'TX',\n 'key': 'B01003_001E',\n 'meta': 'Population Percentage Change', #Next rev:Get from asc1_vars\n 'range': [2010, 2018],\n 'ds': 'acs1',\n 'base': 'https://api.census.gov/data',\n 'census_api_key': CENSUS_API_KEY\n})\nresult = pct_change_var(params)\nresult", | |
| "execution_count": 222, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 222, | |
| "data": { | |
| "text/plain": "['Austin',\n 'TX',\n 0.09577287946428571,\n 'B01003_001E',\n 'Population Percentage Change',\n 2012,\n 842595,\n 2015,\n 931840]" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "print('For {0}, {1}, variable {2} ({3}) from years {4}-{5} is {6:.1%}'.format(\n result[0], # Austin\n result[1], # Texas\n result[3], # B01001_001E \n result[4], # Population Percentage Change (Description)\n result[5], # 2012 (Most Vintage Year)\n result[7], # 2015 (Most Curent Year)\n result[2])) # Pop Percentage Change Between Years", | |
| "execution_count": 223, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": "For Austin, TX, variable B01003_001E (Population Percentage Change) from years 2012-2015 is 9.6%\n", | |
| "name": "stdout" | |
| } | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "_draft": { | |
| "nbviewer_url": "https://gist.github.com/986d806e3b53e12b81f4b086034a369d" | |
| }, | |
| "gist": { | |
| "id": "986d806e3b53e12b81f4b086034a369d", | |
| "data": { | |
| "description": "Census API––Percentage Change by Year––Proof of Concept", | |
| "public": true | |
| } | |
| }, | |
| "kernelspec": { | |
| "name": "conda-env-jupyter-py", | |
| "display_name": "Python [conda env:jupyter]", | |
| "language": "python" | |
| }, | |
| "language_info": { | |
| "name": "python", | |
| "version": "3.6.3", | |
| "mimetype": "text/x-python", | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "pygments_lexer": "ipython3", | |
| "nbconvert_exporter": "python", | |
| "file_extension": ".py" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment