Last active
December 25, 2015 14:59
-
-
Save craigmbooth/6994733 to your computer and use it in GitHub Desktop.
ipython notebook that takes the file teams.txt, goes to nfl.com and scrapes the team roster for each time and puts the output into a SQLite database. Also requires file teams.txt, which is also in my github gists.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "%matplotlib inline\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "from bs4 import BeautifulSoup\n", | |
| "import urllib\n", | |
| "import json\n", | |
| "from collections import namedtuple\n", | |
| "import numpy as np\n", | |
| "import sqlite3" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 1 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "If necessary, create the database file:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def create_db():\n", | |
| " conn = sqlite3.connect('nfl_teams.db')\n", | |
| " c = conn.cursor()\n", | |
| " c.execute('''CREATE TABLE IF NOT EXISTS players (number int, name text, pos text, height int, weight int, dob text, team text)''')\n", | |
| " conn.commit()\n", | |
| " conn.close()" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 2 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Initialize stuff:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "#Use teams.txt to get the team abbreviations and cities:\n", | |
| "all_city = []\n", | |
| "all_abbr = []\n", | |
| "all_name = []\n", | |
| "with open(\"teams.txt\",\"r\") as f:\n", | |
| "\n", | |
| " for line in f:\n", | |
| " line = line.rstrip()\n", | |
| " line = line.split(\" - \")\n", | |
| " all_city.append(line[1])\n", | |
| " all_abbr.append(line[0].rstrip())\n", | |
| " all_name.append(line[2].title())\n" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 9 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def feet2inches(instr):\n", | |
| " \"\"\" Convert a string in f the form X'Y\" into an integer number of inches\"\"\"\n", | |
| " instr = \"\".join([c for c in instr if c not in ('\"',' ')])\n", | |
| " instr = instr.split(\"'\")\n", | |
| " instr = [int(c) for c in instr]\n", | |
| " return int(12*instr[0] + instr[1])" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 4 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Now loop over teams, extract player stats from the HTML and dump them to nfl_teams.db" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "players = []\n", | |
| "create_db()\n", | |
| "\n", | |
| "url_base = \"http://www.nfl.com/teams/roster?team=\"\n", | |
| "for abbr, city, name in zip(all_abbr,all_city,all_name):\n", | |
| " print \"Parsing team : \",city,name\n", | |
| " url = url_base+abbr\n", | |
| " soup = BeautifulSoup(urllib.urlopen(url) )\n", | |
| " for row in soup.findAll('table')[1].tbody.findAll('tr'):\n", | |
| " try:\n", | |
| " number = int(row.findAll('td')[0].contents[0])\n", | |
| " name = str(row.findAll('td')[1].a.contents[0])\n", | |
| " pos = str(row.findAll('td')[2].contents[0])\n", | |
| " height = feet2inches(str(row.findAll('td')[4].contents[0]))\n", | |
| " weight = int(row.findAll('td')[5].contents[0])\n", | |
| " dob = str(row.findAll('td')[6].contents[0])\n", | |
| " player_record = (number,name,pos,height,weight,dob,abbr)\n", | |
| " conn = sqlite3.connect('nfl_teams.db')\n", | |
| " c = conn.cursor()\n", | |
| " c.execute(\"INSERT INTO players VALUES (?,?,?,?,?,?,?)\",player_record)\n", | |
| " conn.commit()\n", | |
| " conn.close()\n", | |
| " players.append(player_record)\n", | |
| " except:\n", | |
| " pass\n", | |
| " print \" --done.\"\n" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Seattle Seahawks\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Arizona Cardinals\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Tampa Bay Buccaneers\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Washington Redskins\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Carolina Panthers\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Jacksonville Jaguars\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Houston Texans\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "Parsing team : Atlanta Falcons\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Buffalo Bills\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Chicago Bears\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Cincinnati Bengals\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Cleveland Browns\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Baltimore Ravens\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Dallas Cowboys\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Denver Broncos\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Detroit Lions\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Green Bay Packers\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Tennessee Titans\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Indianapolis Colts\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Kansas City Chiefs\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Oakland Raiders\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : St. Louis Rams\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Miami Dolphins\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Minnesota Vikings\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : New England Patriots\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : New Orleans Saints\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : NY Giants Giants\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : NY Jets Jets\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Philadelphia Eagles\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Pittsburgh Steelers\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : San Diego Chargers\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : San Francisco 49Ers\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Seattle Seahawks\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Arizona Cardinals\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Tampa Bay Buccaneers\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Washington Redskins\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Carolina Panthers\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Jacksonville Jaguars\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Parsing team : Houston Texans\n", | |
| " --done." | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 6 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment