Last active
September 1, 2016 11:42
-
-
Save anri-c/098cb7d31cd40ddc09ea96d71bdebabe to your computer and use it in GitHub Desktop.
Selenium, PhantomJS & BeautifulSoup4 ref: http://qiita.com/anri-c/items/0177e21aa3dcff6cca06
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| $ cat /etc/lsb-release | |
| DISTRIB_ID=Ubuntu | |
| DISTRIB_RELEASE=16.04 | |
| DISTRIB_CODENAME=xenial | |
| DISTRIB_DESCRIPTION="Ubuntu 16.04 LTS" | |
| $ sudo aptitude install phantomjs xvfb | |
| $ pip install selenium pyvirtualdisplay |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Fetch a page with Selenium + PhantomJS and list its links with BeautifulSoup.

This is an interactive-session transcript: a bare expression is followed by
a comment showing the value the interpreter echoed for it.
"""
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver

# Start a virtual display before launching the browser.
display = Display(visible=0, size=(800, 600))
display.start()
# <Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', ' - snip -

# NOTE(review): PhantomJS support was deprecated and later removed in newer
# Selenium releases -- confirm the installed selenium version still ships it.
driver = webdriver.PhantomJS()
driver.get("http://www.example.com")

type(driver.page_source)
# <class 'str'>
driver.page_source
# '<!DOCTYPE html><html itemscope="" itemtype="http://schema.org/Web - snip -

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(driver.page_source, "html.parser")

# Use .get() for attributes: anchors without an href would raise KeyError
# with x["href"], and attribute-style access such as x._class is treated by
# BeautifulSoup as a child-tag lookup, so it never returns the class attribute.
i = [
    {"href": x.get("href"), "text": x.string, "class": x.get("class")}
    for x in soup.find_all("a")
]
print(i)
# Sample output from the original session (abridged). With the class lookup
# fixed, "class" holds a list of class names for anchors that have one.
# [{'class': None, 'text': 'MENU', 'href': 'javascript:;'}, {'class': None, 'text': 'トップページ', 'href': '/'}, {'class': None, 'text': 'プラットフォーム', 'href': '/pf/'}, - snip -

# Shut down the browser process and the virtual display once finished.
driver.quit()
display.stop()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Fetch a page with Selenium + PhantomJS and list its links with BeautifulSoup.

This is an interactive-session transcript: a bare expression is followed by
a comment showing the value the interpreter echoed for it.
"""
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver

# Start a virtual display before launching the browser.
display = Display(visible=0, size=(800, 600))
display.start()
# <Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', ' - snip -

# NOTE(review): PhantomJS support was deprecated and later removed in newer
# Selenium releases -- confirm the installed selenium version still ships it.
driver = webdriver.PhantomJS()
driver.get("http://www.example.com")

type(driver.page_source)
# <class 'str'>
driver.page_source
# '<!DOCTYPE html><html itemscope="" itemtype="http://schema.org/Web - snip -

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(driver.page_source, "html.parser")

# Use .get() for attributes: anchors without an href would raise KeyError
# with x["href"], and attribute-style access such as x._class is treated by
# BeautifulSoup as a child-tag lookup, so it never returns the class attribute.
i = [
    {"href": x.get("href"), "text": x.string, "class": x.get("class")}
    for x in soup.find_all("a")
]
print(i)
# Sample output from the original session (abridged). With the class lookup
# fixed, "class" holds a list of class names for anchors that have one.
# [{'class': None, 'text': 'MENU', 'href': 'javascript:;'}, {'class': None, 'text': 'トップページ', 'href': '/'}, {'class': None, 'text': 'プラットフォーム', 'href': '/pf/'}, - snip -

# Shut down the browser process and the virtual display once finished.
driver.quit()
display.stop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment