Skip to content

Instantly share code, notes, and snippets.

@craigmbooth
Last active December 25, 2015 14:59
Show Gist options
  • Select an option

  • Save craigmbooth/6994733 to your computer and use it in GitHub Desktop.

Select an option

Save craigmbooth/6994733 to your computer and use it in GitHub Desktop.
ipython notebook that takes the file teams.txt, goes to nfl.com and scrapes the team roster for each time and puts the output into a SQLite database. Also requires file teams.txt, which is also in my github gists.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"from bs4 import BeautifulSoup\n",
"import urllib\n",
"import json\n",
"from collections import namedtuple\n",
"import numpy as np\n",
"import sqlite3"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If necessary, create the database file:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def create_db():\n",
" conn = sqlite3.connect('nfl_teams.db')\n",
" c = conn.cursor()\n",
" c.execute('''CREATE TABLE IF NOT EXISTS players (number int, name text, pos text, height int, weight int, dob text, team text)''')\n",
" conn.commit()\n",
" conn.close()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Initialize stuff:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Use teams.txt to get the team abbreviations and cities:\n",
"all_city = []\n",
"all_abbr = []\n",
"all_name = []\n",
"with open(\"teams.txt\",\"r\") as f:\n",
"\n",
" for line in f:\n",
" line = line.rstrip()\n",
" line = line.split(\" - \")\n",
" all_city.append(line[1])\n",
" all_abbr.append(line[0].rstrip())\n",
" all_name.append(line[2].title())\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def feet2inches(instr):\n",
" \"\"\" Convert a string in f the form X'Y\" into an integer number of inches\"\"\"\n",
" instr = \"\".join([c for c in instr if c not in ('\"',' ')])\n",
" instr = instr.split(\"'\")\n",
" instr = [int(c) for c in instr]\n",
" return int(12*instr[0] + instr[1])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now loop over teams, extract player stats from the HTML and dump them to nfl_teams.db"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"players = []\n",
"create_db()\n",
"\n",
"url_base = \"http://www.nfl.com/teams/roster?team=\"\n",
"for abbr, city, name in zip(all_abbr,all_city,all_name):\n",
" print \"Parsing team : \",city,name\n",
" url = url_base+abbr\n",
" soup = BeautifulSoup(urllib.urlopen(url) )\n",
" for row in soup.findAll('table')[1].tbody.findAll('tr'):\n",
" try:\n",
" number = int(row.findAll('td')[0].contents[0])\n",
" name = str(row.findAll('td')[1].a.contents[0])\n",
" pos = str(row.findAll('td')[2].contents[0])\n",
" height = feet2inches(str(row.findAll('td')[4].contents[0]))\n",
" weight = int(row.findAll('td')[5].contents[0])\n",
" dob = str(row.findAll('td')[6].contents[0])\n",
" player_record = (number,name,pos,height,weight,dob,abbr)\n",
" conn = sqlite3.connect('nfl_teams.db')\n",
" c = conn.cursor()\n",
" c.execute(\"INSERT INTO players VALUES (?,?,?,?,?,?,?)\",player_record)\n",
" conn.commit()\n",
" conn.close()\n",
" players.append(player_record)\n",
" except:\n",
" pass\n",
" print \" --done.\"\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Seattle Seahawks\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Arizona Cardinals\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Tampa Bay Buccaneers\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Washington Redskins\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Carolina Panthers\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Jacksonville Jaguars\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Houston Texans\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Parsing team : Atlanta Falcons\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Buffalo Bills\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Chicago Bears\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Cincinnati Bengals\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Cleveland Browns\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Baltimore Ravens\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Dallas Cowboys\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Denver Broncos\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Detroit Lions\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Green Bay Packers\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Tennessee Titans\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Indianapolis Colts\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Kansas City Chiefs\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Oakland Raiders\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : St. Louis Rams\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Miami Dolphins\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Minnesota Vikings\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : New England Patriots\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : New Orleans Saints\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : NY Giants Giants\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : NY Jets Jets\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Philadelphia Eagles\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Pittsburgh Steelers\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : San Diego Chargers\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : San Francisco 49Ers\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Seattle Seahawks\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Arizona Cardinals\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Tampa Bay Buccaneers\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Washington Redskins\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Carolina Panthers\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Jacksonville Jaguars\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Parsing team : Houston Texans\n",
" --done."
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment