Last active
January 13, 2019 14:03
-
-
Save BBloggsbott/513b30f284d1af910cb63ee5789dff73 to your computer and use it in GitHub Desktop.
Web Scraping tutorial
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from bs4 import BeautifulSoup\n", | |
| "import urllib.request\n", | |
| "import warnings\n", | |
| "warnings.simplefilter('ignore')\n", | |
| "import pandas as pd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "response = urllib.request.urlopen('https://scrapping-site.herokuapp.com')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "soup = BeautifulSoup(response)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<!DOCTYPE html>\n", | |
| "<html>\n", | |
| "<head>\n", | |
| "<meta charset=\"utf-8\"/>\n", | |
| "<meta content=\"IE=edge\" http-equiv=\"X-UA-Compatible\"/>\n", | |
| "<title>Dummy Site</title>\n", | |
| "<meta content=\"width=device-width, initial-scale=1\" name=\"viewport\"/>\n", | |
| "<link href=\"assets/css/main.css\" media=\"screen\" rel=\"stylesheet\" type=\"text/css\"/>\n", | |
| "<script src=\"assets/js/main.js\"></script>\n", | |
| "</head>\n", | |
| "<body>\n", | |
| "<div class=\"login-page\">\n", | |
| "<div class=\"form\">\n", | |
| "<form action=\"cbcs.html\" class=\"login-form\" method=\"POST\" name=\"login-form\" onsubmit=\"return validateLogin()\">\n", | |
| "<input name=\"username\" placeholder=\"username\" type=\"text\"/>\n", | |
| "<input name=\"password\" placeholder=\"password\" type=\"password\"/>\n", | |
| "<button onclick=\"validateLogin()\">login</button>\n", | |
| "</form>\n", | |
| "<a href=\"registered.html\">Registered</a>\n", | |
| "</div>\n", | |
| "</div>\n", | |
| "</body>\n", | |
| "</html>" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "soup" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data_link = 'https://scrapping-site.herokuapp.com/'+soup.find_all('a')[0]['href']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "response = urllib.request.urlopen(data_link)\n", | |
| "soup = BeautifulSoup(response)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<!DOCTYPE html>\n", | |
| "<html>\n", | |
| "<head>\n", | |
| "<meta charset=\"utf-8\"/>\n", | |
| "<meta content=\"IE=edge\" http-equiv=\"X-UA-Compatible\"/>\n", | |
| "<title>CBCS</title>\n", | |
| "<meta content=\"width=device-width, initial-scale=1\" name=\"viewport\"/>\n", | |
| "<link href=\"main.css\" media=\"screen\" rel=\"stylesheet\" type=\"text/css\"/>\n", | |
| "<script src=\"assets/js/cbcs.js\"></script>\n", | |
| "</head>\n", | |
| "<body>\n", | |
| "<div class=\"Student Data\">\n", | |
| " Name: <span class=\"name\"> Dummy Name</span><br/>\n", | |
| " REGNO: <span class=\"regno\">1234</span><br/>\n", | |
| " DOB: <span class=\"dob\">1234</span><br/>\n", | |
| "</div>\n", | |
| "<style type=\"text/css\">\n", | |
| " table.example2 {\n", | |
| " background-color: transparent;\n", | |
| " border-collapse: collapse;\n", | |
| " width: 100%;\n", | |
| " }\n", | |
| "\n", | |
| " table.example2 th,\n", | |
| " table.example2 td {\n", | |
| " text-align: center;\n", | |
| " border: 1px solid black;\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| "\n", | |
| " table.example2 th {\n", | |
| " background-color: AntiqueWhite;\n", | |
| " }\n", | |
| "\n", | |
| " table.example2 td:first-child {\n", | |
| " width: 20%;\n", | |
| " }\n", | |
| " </style>\n", | |
| "<form action=\"registered.html\" method=\"POST\" name=\"cbcs-form\" onsubmit=\"return cbcsValidate()\">\n", | |
| "<table class=\"example2\">\n", | |
| "<tr>\n", | |
| "<th>Subject</th>\n", | |
| "<th>Faculty</th>\n", | |
| "</tr>\n", | |
| "<tr>\n", | |
| "<td>Data mining</td>\n", | |
| "<td>FacultyData</td>\n", | |
| "</tr>\n", | |
| "<tr>\n", | |
| "<td>DBMS</td>\n", | |
| "<td>FacultyDBMS</td>\n", | |
| "</tr>\n", | |
| "<tr>\n", | |
| "<td>Java</td>\n", | |
| "<td>FacultyJava</td>\n", | |
| "</tr>\n", | |
| "<tr>\n", | |
| "<td>Data Structures</td>\n", | |
| "<td>FacultyDS</td>\n", | |
| "</tr>\n", | |
| "</table>\n", | |
| "</form>\n", | |
| "</body>\n", | |
| "</html>" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "soup" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "rows = soup.find_all('tr')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.DataFrame(columns = rows[0].text.split('\\n')[1:3])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "loc = 0\n", | |
| "for data in rows[1:]:\n", | |
| " df.loc[loc] = data.text.split('\\n')[1:3]\n", | |
| " loc+=1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>Subject</th>\n", | |
| " <th>Faculty</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>Data mining</td>\n", | |
| " <td>FacultyData</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>DBMS</td>\n", | |
| " <td>FacultyDBMS</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>Java</td>\n", | |
| " <td>FacultyJava</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>Data Structures</td>\n", | |
| " <td>FacultyDS</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " Subject Faculty\n", | |
| "0 Data mining FacultyData\n", | |
| "1 DBMS FacultyDBMS\n", | |
| "2 Java FacultyJava\n", | |
| "3 Data Structures FacultyDS" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "student_data = soup.find_all('div', attrs={'class':'Student Data'})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Name: Dummy Name\n", | |
| "REGNO: 1234\n", | |
| "DOB: 1234\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for i in student_data[0].text.split('\\n')[1:-1]:\n", | |
| " print(i.strip())" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.1" | |
| }, | |
| "toc": { | |
| "base_numbering": 1, | |
| "nav_menu": {}, | |
| "number_sections": true, | |
| "sideBar": true, | |
| "skip_h1_title": false, | |
| "title_cell": "Table of Contents", | |
| "title_sidebar": "Contents", | |
| "toc_cell": false, | |
| "toc_position": {}, | |
| "toc_section_display": true, | |
| "toc_window_display": false | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment