Last active
November 18, 2016 05:34
-
-
Save MBAustin/d299f0467a3fb05dabc04308767e9afd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re, sys, time | |
| from PyQt4 import QtCore, QtGui, QtWebKit | |
| from bs4 import BeautifulSoup | |
| #comment out this line to see QT warnings: | |
| # QtCore.qInstallMsgHandler(lambda *args: None) | |
| # QtWebKit.QWebSettings.PluginsEnabled = True | |
# Module-level flags. No code visible in this file reads or assigns either
# flag after this point.
finished = False  # presumably meant to signal crawl completion -- TODO confirm
debug = True      # presumably a verbosity switch -- TODO confirm
class Scraper(QtWebKit.QWebPage):
    """Web page object that crawls schedule/match pages for match-detail links.

    Pages are loaded one at a time through ``mainFrame().load``; every time a
    load finishes, the rendered HTML is parsed, ``match-details`` hrefs are
    collected in ``data_urls`` and ``matches/`` / ``schedule/`` hrefs are
    pushed on ``s_queue`` for later loading.

    The crawl is driven entirely by the ``loadFinished`` signal, so a Qt event
    loop must be running for it to make progress.  ``start`` therefore
    returns immediately and never blocks: waiting on a mutex in the thread
    that also has to run the event loop can never be satisfied.

    Attributes:
        prefix: Site root prepended to site-relative hrefs.
        sUrls: URLs of the pages processed so far, in visiting order.
        s_queue: Absolute URLs still to load (popped from the end).
        data_urls: Distinct ``match-details`` hrefs found so far.
        finished: True once the queue has drained after a ``start`` call.
        quit_when_done: When true, the running Qt application is asked to
            quit as soon as the crawl finishes.
    """

    def __init__(self, quit_when_done=True):
        """Create the page and hook the main frame's ``loadFinished`` signal.

        Args:
            quit_when_done: Ask the Qt application to quit once the queue is
                empty (default True).
        """
        super(Scraper, self).__init__()
        self.quit_when_done = quit_when_done
        self.prefix = 'http://www.lolesports.com'
        self.sUrls = []
        self.s_queue = []
        self.data_urls = []
        self.finished = False
        # Absolute URLs that have ever been queued; used for de-duplication.
        self._seen = set()
        # True while a page load is in flight (only one frame is used).
        self._loading = False
        self.mainFrame().loadFinished.connect(self.handle_load_finished)

    def start(self, url):
        """Queue ``url`` and begin crawling; returns without waiting.

        Args:
            url: Absolute URL of the first page to load.
        """
        print('starting')
        self.finished = False
        self._seen.add(url)
        self.s_queue.append(url)
        self.fetch_next()

    def fetch_next(self):
        """Start loading the most recently queued URL, if any.

        Does nothing when the queue is empty or a load is already in flight.
        """
        print('length of queue is {0}'.format(len(self.s_queue)))
        print('number of pages visited: {0}'.format(len(self.sUrls)))
        if self._loading or not self.s_queue:
            return
        self._loading = True
        self.mainFrame().load(QtCore.QUrl(self.s_queue.pop()))

    def process_current_page(self):
        """Record the current page and harvest its links.

        Data links are handled first, then match links, then schedule links,
        so schedule pages end up on top of the (last-in, first-out) queue.
        """
        frame = self.mainFrame()
        url = frame.url().toString()
        self.sUrls.append(url)
        print('visiting {0}'.format(url))
        soup = BeautifulSoup(frame.toHtml(), 'html.parser')
        links = [tag.get('href') for tag in soup.find_all('a')]
        for d_link in links:
            if self.check_details(d_link, url) and d_link not in self.data_urls:
                self.data_urls.append(d_link)
        for m_link in links:
            if self.check_matches(m_link, url):
                self._enqueue(self._absolute(m_link))
        for s_link in links:
            if self.check_schedules(s_link, url):
                self._enqueue(self._absolute(s_link))

    def check_href(self, h, current_url):
        """Return True if ``h`` is any link of interest not yet queued."""
        return (bool(h)
                and ('schedule/' in h or 'matches/' in h or 'match-details' in h)
                and self._is_new(h))

    def handle_load_finished(self, ok=True):
        """Slot for ``loadFinished``: process the page, then continue or finish.

        Args:
            ok: Load result reported by the signal; failed loads are logged
                and skipped instead of being parsed.
        """
        print('in handle_load_finished')
        self._loading = False
        self.frame = self.mainFrame()
        if ok:
            self.process_current_page()
        else:
            print('failed to load {0}'.format(self.frame.url().toString()))
        if self.s_queue:
            self.fetch_next()
            return
        print('found {0} match data'.format(len(self.data_urls)))
        self.finished = True
        if self.quit_when_done:
            app = QtCore.QCoreApplication.instance()
            if app is not None:
                app.quit()

    def check_details(self, h, current_url):
        """Return True if ``h`` is a match-details link."""
        return bool(h) and 'match-details' in h

    def check_matches(self, h, current_url):
        """Return True if ``h`` is a match-page link not yet queued."""
        return bool(h) and 'matches/' in h and self._is_new(h)

    def check_schedules(self, h, current_url):
        """Return True if ``h`` is a schedule-page link not yet queued."""
        return bool(h) and 'schedule/' in h and self._is_new(h)

    def _absolute(self, href):
        """Return ``href`` as an absolute URL (site-relative hrefs get ``prefix``)."""
        if href.startswith(('http://', 'https://')):
            return href
        return self.prefix + href

    def _is_new(self, href):
        """Return True if ``href`` is on this site and was never queued."""
        url = self._absolute(href)
        return url.startswith(self.prefix) and url not in self._seen

    def _enqueue(self, url):
        """Queue ``url`` unless it was queued before; return whether it was added."""
        if url in self._seen:
            return False
        self._seen.add(url)
        self.s_queue.append(url)
        return True
if __name__ == '__main__':
    # Script entry point: create the Qt application, start the crawl from the
    # 2016 World Championship schedule page, then run the event loop and exit
    # with its return code.
    start_url = 'http://www.lolesports.com/en_US/worlds/world_championship_2016/schedule/default'
    qt_app = QtGui.QApplication(sys.argv)
    crawler = Scraper()
    crawler.start(start_url)
    exit_code = qt_app.exec_()
    sys.exit(exit_code)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment