Skip to content

Instantly share code, notes, and snippets.

@MBAustin
Last active November 18, 2016 05:34
Show Gist options
  • Select an option

  • Save MBAustin/7bbb8c330afe739940b35d344fe2d015 to your computer and use it in GitHub Desktop.

Select an option

Save MBAustin/7bbb8c330afe739940b35d344fe2d015 to your computer and use it in GitHub Desktop.
My Scraper
import re, sys, time
from PyQt4 import QtCore, QtGui, QtWebKit
from bs4 import BeautifulSoup
#comment out this line to see QT warnings:
# QtCore.qInstallMsgHandler(lambda *args: None)
# QtWebKit.QWebSettings.PluginsEnabled = True
# Module-level flags. Both are named in `global` statements inside Scraper's
# class body but are never read or assigned in the visible code — presumably
# leftovers from an earlier revision; TODO confirm before removing.
debug = True
finished = False
class Scraper(QtWebKit.QWebPage):
    """Depth-first crawler over lolesports.com schedule/match pages.

    Loads pages through QWebPage (so JS-rendered content is available),
    follows 'schedule/' and 'matches/' links, and collects every
    'match-details' href into ``self.data_urls``.
    """

    def __init__(self):
        super(Scraper, self).__init__()
        # Serializes page loads: locked before each load() call,
        # unlocked in handle_load_finished().
        self.mutex = QtCore.QMutex()
        # Held for the whole crawl; start() blocks on it until
        # handle_load_finished() releases it when the queue is empty.
        self.finished_mutex = QtCore.QMutex()
        self.finished_mutex.lock()
        self.prefix = 'http://www.lolesports.com'
        self.sUrls = []      # URLs already visited
        self.s_queue = []    # URLs still to visit (LIFO pop -> depth-first)
        self.data_urls = []  # collected match-details links
        self.mainFrame().loadFinished.connect(self.handle_load_finished)

    def start(self, url):
        """Seed the crawl with *url*, then block until the queue drains."""
        print('starting')
        self.s_queue.append(url)
        self.fetch_next()
        print('waiting to finish')
        # Blocks until handle_load_finished() unlocks on an empty queue.
        # NOTE(review): blocking here before app.exec_() starts the event
        # loop looks deadlock-prone — loadFinished can only fire once the
        # loop runs; verify against the caller.
        self.finished_mutex.lock()

    def fetch_next(self):
        """Pop the next queued URL and begin loading it (asynchronous)."""
        print('length of queue is {0}'.format(len(self.s_queue)))
        print('number of pages visited: {0}'.format(len(self.sUrls)))
        self.mutex.lock()
        print('locking the mutex')
        self.mainFrame().load(QtCore.QUrl(self.s_queue.pop()))

    def process_current_page(self):
        """Parse the loaded page: record data links, queue follow-up pages."""
        url = self.frame.url().toString()
        self.sUrls.append(url)
        print('visiting {0}'.format(url))
        html = self.mainFrame().toHtml()
        soup = BeautifulSoup(html, 'html.parser')
        # Queue up all data links in a page first, then all matches,
        # then all schedules.
        links = [tag.get('href') for tag in soup.findAll('a')]
        for href in links:
            if self.check_details(href, url):
                self.data_urls.append(href)
        for href in links:
            if self.check_matches(href, url):
                self.s_queue.append(self.prefix + href)
        for href in links:
            if self.check_schedules(href, url):
                self.s_queue.append(self.prefix + href)

    def check_href(self, h, current_url):
        """Combined link predicate.

        Currently unused in this file — superseded by the separate
        check_details/check_matches/check_schedules predicates below;
        kept for backward compatibility with any external caller.
        """
        return h and (('schedule/' in h) or ('matches/' in h) or ('match-details' in h)) and h not in self.sUrls

    def handle_load_finished(self):
        """Slot for loadFinished: process the page, then continue or finish."""
        print('in handle_load_finished')
        self.frame = self.mainFrame()
        self.process_current_page()
        self.mutex.unlock()
        print('unlocking the mutex')
        if len(self.s_queue) > 0:
            self.fetch_next()
        else:
            # BUGFIX: the summary print and the finished_mutex release must
            # only happen once the queue is empty; previously they ran after
            # every page load (fetch_next returns immediately because load()
            # is asynchronous), releasing the blocked start() too early.
            print('found {0} match data'.format(len(self.data_urls)))
            self.finished_mutex.unlock()

    def check_details(self, h, current_url):
        """True if *h* is a match-details data link (revisits allowed)."""
        return h and 'match-details' in h

    def check_matches(self, h, current_url):
        """True if *h* is a matches page link not yet visited."""
        return h and 'matches/' in h and h not in self.sUrls

    def check_schedules(self, h, current_url):
        """True if *h* is a schedule page link not yet visited."""
        return h and 'schedule/' in h and h not in self.sUrls
if __name__ == '__main__':
    # A QApplication must exist before any Qt web page is created.
    app = QtGui.QApplication(sys.argv)
    crawler = Scraper()
    # Kick off the crawl from the 2016 Worlds schedule page.
    crawler.start('http://www.lolesports.com/en_US/worlds/world_championship_2016/schedule/default')
    # Enter the Qt event loop; propagate its exit code to the shell.
    sys.exit(app.exec_())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment