Skip to content

Instantly share code, notes, and snippets.

@anri-c
Last active September 1, 2016 11:42
Show Gist options
  • Select an option

  • Save anri-c/098cb7d31cd40ddc09ea96d71bdebabe to your computer and use it in GitHub Desktop.

Select an option

Save anri-c/098cb7d31cd40ddc09ea96d71bdebabe to your computer and use it in GitHub Desktop.
Selenium, PhantomJS & BeautifulSoup4 ref: http://qiita.com/anri-c/items/0177e21aa3dcff6cca06
$ cat /etc/lsb-release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=16.04
DISTRIB_CODENAME=xenial
DISTRIB_DESCRIPTION="Ubuntu 16.04 LTS"
$ sudo aptitude install phantomjs xvfb
$ pip install selenium pyvirtualdisplay beautifulsoup4
# Fetch a page with headless PhantomJS (inside an Xvfb virtual display) and
# list every <a> element's href, text and class using BeautifulSoup.
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver

# Start a virtual X display so the browser can run without a real screen.
display = Display(visible=0, size=(800, 600))
display.start()
# <Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', ' - snip -
try:
    # NOTE(review): webdriver.PhantomJS is deprecated in newer Selenium
    # releases -- confirm the installed version still provides it.
    driver = webdriver.PhantomJS()
    try:
        driver.get("http://www.example.com")
        # Read the rendered markup once; it is a plain str.
        html = driver.page_source
        print(type(html))
        # <class 'str'>
        # '<!DOCTYPE html><html itemscope="" itemtype="http://schema.org/Web - snip -
    finally:
        # Always terminate the PhantomJS process, even if the fetch failed.
        driver.quit()
finally:
    # Always shut the virtual display down so Xvfb is not left running.
    display.stop()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")
# Use .get() so anchors without an href/class yield None instead of raising
# KeyError.  The original `x._class` is not attribute access in BeautifulSoup:
# it is treated as a search for a child tag named "_class", hence always None.
i = [
    {"href": x.get("href"), "text": x.string, "class": x.get("class")}
    for x in soup.find_all("a")
]
print(i)
# Sample output from the original run:
# [{'class': None, 'text': 'MENU', 'href': 'javascript:;'}, {'class': None, 'text': 'トップページ', 'href': '/'}, {'class': None, 'text': 'プラットフォーム', 'href': '/pf/'}, - snip -
# Fetch a page with headless PhantomJS (inside an Xvfb virtual display) and
# list every <a> element's href, text and class using BeautifulSoup.
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver

# Start a virtual X display so the browser can run without a real screen.
display = Display(visible=0, size=(800, 600))
display.start()
# <Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', ' - snip -
try:
    # NOTE(review): webdriver.PhantomJS is deprecated in newer Selenium
    # releases -- confirm the installed version still provides it.
    driver = webdriver.PhantomJS()
    try:
        driver.get("http://www.example.com")
        # Read the rendered markup once; it is a plain str.
        html = driver.page_source
        print(type(html))
        # <class 'str'>
        # '<!DOCTYPE html><html itemscope="" itemtype="http://schema.org/Web - snip -
    finally:
        # Always terminate the PhantomJS process, even if the fetch failed.
        driver.quit()
finally:
    # Always shut the virtual display down so Xvfb is not left running.
    display.stop()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")
# Use .get() so anchors without an href/class yield None instead of raising
# KeyError.  The original `x._class` is not attribute access in BeautifulSoup:
# it is treated as a search for a child tag named "_class", hence always None.
i = [
    {"href": x.get("href"), "text": x.string, "class": x.get("class")}
    for x in soup.find_all("a")
]
print(i)
# Sample output from the original run:
# [{'class': None, 'text': 'MENU', 'href': 'javascript:;'}, {'class': None, 'text': 'トップページ', 'href': '/'}, {'class': None, 'text': 'プラットフォーム', 'href': '/pf/'}, - snip -
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment