Created
November 14, 2016 07:00
-
-
Save dspp779/550da2bdbd1dceafb036022597db5520 to your computer and use it in GitHub Desktop.
crawl data on http://arts-ccr-002.bham.ac.uk/ccr/patgram/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Crawl GRAMMAR PATTERN examples" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Import the required libraries: BeautifulSoup for HTML parsing and requests for HTTP access." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from bs4 import BeautifulSoup\n", | |
| "import requests" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Fetch and parse the index page at http://arts-ccr-002.bham.ac.uk/ccr/patgram/" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "patgramUrl = 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "r = requests.get(patgramUrl)\n", | |
| "soup = BeautifulSoup(r.text, \"html.parser\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
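| "Get the chapter URLs: take each link in the main table of contents (`ul.toc-main`) and resolve it against the index URL to obtain an absolute URL." | |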
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from urllib.parse import urljoin\n", | |
| "chapterUrls = [urljoin(patgramUrl, node.attrs['href']) for node in soup.select('ul.toc-main > li > a')]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch01.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch02.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch03.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch04.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch05.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch06.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch07.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch08.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch09.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch10.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch11.html',\n", | |
| " 'http://arts-ccr-002.bham.ac.uk/ccr/patgram/ch12.html']" | |
| ] | |
| }, | |
| "execution_count": 30, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "chapterUrls" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Example: fetch the first chapter." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "r = requests.get(chapterUrls[0])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
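| "The web page's HTML is malformed and ambiguous, so a workaround is needed: collect the heading elements (classes `hd1` and `hd5`) and the example lists (class `example`) into a single flat list, in document order." | |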
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "soup = BeautifulSoup(r.text, \"html.parser\")\n", | |
| "examples = soup.find_all(['div', 'div', 'ul'], class_=['hd1', 'hd5', 'example'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[<div class=\"hd1\">1 V</div>,\n", | |
| " <div class=\"hd5\">1 The `move' group</div>,\n", | |
| " <ul class=\"example\">\n", | |
| " <li><span class=\"example\">As they <span class=\"bold\"><span class=\"underline\">advanced</span></span>, the boys beamed their flashlights in every direction.</span></li>\n", | |
| " <li><span class=\"example\">She returned to the cabin. George <span class=\"bold\"><span class=\"underline\">had</span></span> just <span class=\"bold\"><span class=\"bold\">arrived</span></span>.</span></li>\n", | |
| " <li><span class=\"example\">He found a part of the arrow that <span class=\"bold\"><span class=\"underline\">had broken off</span></span>.</span></li>\n", | |
| " <li><span class=\"example\">The Ancients went to bed when the sun <span class=\"bold\"><span class=\"underline\">went down</span></span> and arose as the sun <span class=\"bold\"><span class=\"underline\">rose</span></span>.</span></li>\n", | |
| " <li><span class=\"example\">I don't smoke, so there are no dirty ashtrays <span class=\"bold\"><span class=\"underline\">lying around</span></span> to upset her.</span></li>\n", | |
| " <li><span class=\"example\">The elevator began to <span class=\"bold\"><span class=\"underline\">move</span></span> again, but now it moved up.</span></li>\n", | |
| " <li><span class=\"example\">Now that you've reminded me I guess I <span class=\"bold\"><span class=\"underline\">can stick around</span></span> for four or five days longer.</span></li>\n", | |
| " <li><span class=\"example\">The car slowed and <span class=\"bold\"><span class=\"underline\">stopped</span></span>.</span></li>\n", | |
| " </ul>,\n", | |
| " <ul class=\"example\">\n", | |
| " <li><span class=\"example\">The forces <span class=\"bold\"><span class=\"underline\">will</span></span> then <span class=\"bold\"><span class=\"bold\">spread out</span></span>, securing roads and protecting food convoys to the interior.</span></li>\n", | |
| " </ul>,\n", | |
| " <div class=\"hd5\">2 The `turn' group</div>]" | |
| ] | |
| }, | |
| "execution_count": 38, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "examples[:5]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment