Skip to content

Instantly share code, notes, and snippets.

@BBloggsbott
Last active January 13, 2019 14:03
Show Gist options
  • Select an option

  • Save BBloggsbott/513b30f284d1af910cb63ee5789dff73 to your computer and use it in GitHub Desktop.

Select an option

Save BBloggsbott/513b30f284d1af910cb63ee5789dff73 to your computer and use it in GitHub Desktop.
Web Scraping tutorial
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import urllib\n",
"import urllib.request  # plain `import urllib` does not load the request submodule\n",
"import warnings\n",
"from urllib.parse import urljoin\n",
"\n",
"warnings.simplefilter('ignore')\n",
"\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Fetch the landing page. The timeout stops the cell from hanging forever\n",
"# when the site does not answer.\n",
"response = urllib.request.urlopen('https://scrapping-site.herokuapp.com', timeout=30)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Name the parser explicitly: otherwise Beautiful Soup picks whichever one\n",
"# happens to be installed, and the warning about that is hidden by the\n",
"# warnings filter set in the imports cell.\n",
"soup = BeautifulSoup(response, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"<!DOCTYPE html>\n",
"<html>\n",
"<head>\n",
"<meta charset=\"utf-8\"/>\n",
"<meta content=\"IE=edge\" http-equiv=\"X-UA-Compatible\"/>\n",
"<title>Dummy Site</title>\n",
"<meta content=\"width=device-width, initial-scale=1\" name=\"viewport\"/>\n",
"<link href=\"assets/css/main.css\" media=\"screen\" rel=\"stylesheet\" type=\"text/css\"/>\n",
"<script src=\"assets/js/main.js\"></script>\n",
"</head>\n",
"<body>\n",
"<div class=\"login-page\">\n",
"<div class=\"form\">\n",
"<form action=\"cbcs.html\" class=\"login-form\" method=\"POST\" name=\"login-form\" onsubmit=\"return validateLogin()\">\n",
"<input name=\"username\" placeholder=\"username\" type=\"text\"/>\n",
"<input name=\"password\" placeholder=\"password\" type=\"password\"/>\n",
"<button onclick=\"validateLogin()\">login</button>\n",
"</form>\n",
"<a href=\"registered.html\">Registered</a>\n",
"</div>\n",
"</div>\n",
"</body>\n",
"</html>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"soup"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Resolve the first link's href against the site root. Unlike plain string\n",
"# concatenation, urljoin also handles absolute and root-relative hrefs.\n",
"first_href = soup.find_all('a')[0]['href']\n",
"data_link = urljoin('https://scrapping-site.herokuapp.com/', first_href)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Download the page behind data_link and parse it. The context manager\n",
"# closes the connection once parsing is done, the timeout stops the cell\n",
"# from hanging, and the parser is named so the result does not depend on\n",
"# which optional parsers are installed.\n",
"with urllib.request.urlopen(data_link, timeout=30) as response:\n",
"    soup = BeautifulSoup(response, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"<!DOCTYPE html>\n",
"<html>\n",
"<head>\n",
"<meta charset=\"utf-8\"/>\n",
"<meta content=\"IE=edge\" http-equiv=\"X-UA-Compatible\"/>\n",
"<title>CBCS</title>\n",
"<meta content=\"width=device-width, initial-scale=1\" name=\"viewport\"/>\n",
"<link href=\"main.css\" media=\"screen\" rel=\"stylesheet\" type=\"text/css\"/>\n",
"<script src=\"assets/js/cbcs.js\"></script>\n",
"</head>\n",
"<body>\n",
"<div class=\"Student Data\">\n",
" Name: <span class=\"name\"> Dummy Name</span><br/>\n",
" REGNO: <span class=\"regno\">1234</span><br/>\n",
" DOB: <span class=\"dob\">1234</span><br/>\n",
"</div>\n",
"<style type=\"text/css\">\n",
" table.example2 {\n",
" background-color: transparent;\n",
" border-collapse: collapse;\n",
" width: 100%;\n",
" }\n",
"\n",
" table.example2 th,\n",
" table.example2 td {\n",
" text-align: center;\n",
" border: 1px solid black;\n",
" padding: 5px;\n",
" }\n",
"\n",
" table.example2 th {\n",
" background-color: AntiqueWhite;\n",
" }\n",
"\n",
" table.example2 td:first-child {\n",
" width: 20%;\n",
" }\n",
" </style>\n",
"<form action=\"registered.html\" method=\"POST\" name=\"cbcs-form\" onsubmit=\"return cbcsValidate()\">\n",
"<table class=\"example2\">\n",
"<tr>\n",
"<th>Subject</th>\n",
"<th>Faculty</th>\n",
"</tr>\n",
"<tr>\n",
"<td>Data mining</td>\n",
"<td>FacultyData</td>\n",
"</tr>\n",
"<tr>\n",
"<td>DBMS</td>\n",
"<td>FacultyDBMS</td>\n",
"</tr>\n",
"<tr>\n",
"<td>Java</td>\n",
"<td>FacultyJava</td>\n",
"</tr>\n",
"<tr>\n",
"<td>Data Structures</td>\n",
"<td>FacultyDS</td>\n",
"</tr>\n",
"</table>\n",
"</form>\n",
"</body>\n",
"</html>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"soup"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Every <tr> element on the page, in document order.\n",
"rows = soup.find_all(name='tr')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Take the column names from the header row's <th> cells. Reading the cells\n",
"# directly does not depend on how the row's markup is split across lines\n",
"# and works for any number of columns.\n",
"header_cells = rows[0].find_all('th')\n",
"df = pd.DataFrame(columns=[cell.get_text(strip=True) for cell in header_cells])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Collect the <td> texts of every data row (rows[0] is the header) and build\n",
"# the frame in one step, instead of appending row by row with a manual\n",
"# counter. The column names are reused from the empty frame created above.\n",
"records = [[cell.get_text(strip=True) for cell in row.find_all('td')] for row in rows[1:]]\n",
"df = pd.DataFrame(records, columns=df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Subject</th>\n",
" <th>Faculty</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Data mining</td>\n",
" <td>FacultyData</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>DBMS</td>\n",
" <td>FacultyDBMS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Java</td>\n",
" <td>FacultyJava</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Data Structures</td>\n",
" <td>FacultyDS</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Subject Faculty\n",
"0 Data mining FacultyData\n",
"1 DBMS FacultyDBMS\n",
"2 Java FacultyJava\n",
"3 Data Structures FacultyDS"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Find the <div> elements carrying the class value 'Student Data'.\n",
"student_data = soup.find_all('div', class_='Student Data')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Name: Dummy Name\n",
"REGNO: 1234\n",
"DOB: 1234\n"
]
}
],
"source": [
"# The div's text starts and ends with a newline, so the first and last\n",
"# pieces of the split are dropped before each remaining line is trimmed\n",
"# and printed.\n",
"student_lines = student_data[0].text.split('\\n')[1:-1]\n",
"for line in student_lines:\n",
"    print(line.strip())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment