Created
November 17, 2016 21:23
-
-
Save MBAustin/792610979fb45c7b288fdcc471c85b62 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re, sys, time | |
| from PyQt4.QtGui import * | |
| from PyQt4.QtCore import * | |
| from PyQt4.QtWebKit import * | |
| from bs4 import BeautifulSoup | |
| import threading | |
# Qt mutex; in this listing it is only mentioned by a commented-out line in
# Render.__init__, so no active code locks it.
mutex = QMutex()
# Re-entrant lock held by scrape_tournament while a page is rendered and parsed.
lock = threading.RLock()
class Render(QWebPage):
    """Web page object that starts loading ``url`` as soon as it is created.

    The load is asynchronous: the caller has to keep the Qt event loop
    running (for example by calling ``app.processEvents()`` repeatedly)
    until ``finished`` becomes True.

    Attributes:
        url: The address passed to the constructor.
        finished: False until the ``loadFinished`` signal has been handled.
        succeeded: None while loading, afterwards the success flag that Qt
            passed to ``loadFinished`` (False means the load failed).
        frame: None while loading, afterwards the page's main frame.
    """

    def __init__(self, url):
        QWebPage.__init__(self)
        self.url = url
        self.finished = False
        # Defined up front so that reading them before the load ends gives
        # None instead of raising AttributeError.
        self.succeeded = None
        self.frame = None
        # Hook up the slot first, then kick off the load.
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: store the frame and the load outcome."""
        self.frame = self.mainFrame()
        self.succeeded = bool(result)
        # Set last so that a caller polling ``finished`` always finds
        # ``frame`` and ``succeeded`` already populated.
        self.finished = True
# When true, scrape_tournament prints progress messages while it crawls.
debug = True
# Install a Qt message handler that ignores every message.
# Comment out this line to see Qt warnings:
qInstallMsgHandler(lambda *args: None)
# Running number of scrape_tournament invocations, shown in its debug output.
run_num = 1
# hrefs that scrape_tournament has already handled; shared by all of its
# (recursive) calls so no link is processed twice.
seen_urls = []
def _is_unhandled_link(href, marker, url):
    """Return True if ``href`` contains ``marker`` and was not handled yet.

    A link qualifies when it is non-empty, contains ``marker``, is not a
    substring of the page's own ``url``, and is not in ``seen_urls``.
    """
    return (bool(href) and marker in href
            and href not in url and href not in seen_urls)


def scrape_tournament(url, app, r_depth):
    """Recursively collect 'match-details' links starting from ``url``.

    The page is loaded through ``Render`` (a QWebPage), its resulting HTML is
    parsed with BeautifulSoup, and the ``href`` of every ``<a>`` tag is
    examined in three separate passes:

    1. hrefs containing 'match-details' are added to the result;
    2. hrefs containing 'matches/' are crawled recursively;
    3. hrefs containing 'schedule/' are crawled recursively.

    Every href that is acted upon is first appended to the module-level
    ``seen_urls`` list, so it is never handled a second time.

    Args:
        url: Absolute URL of the page to visit.
        app: The QApplication whose events are processed while the page
            loads; if None, a new one is created.
        r_depth: Current recursion depth; only used in debug output.

    Returns:
        A set with the 'match-details' hrefs found on this page and on all
        pages crawled from it. If ``url`` has fewer than six '/'-separated
        segments (no tournament segment), the set ``{'null url'}`` is
        returned without loading anything.
    """
    global run_num

    if debug:
        print('run ' + str(run_num) + ' at depth ' + str(r_depth))
        run_num += 1
        print('visiting ' + url)

    matches = set()
    segments = url.split('/')
    # Segment 5 is the tournament slug (e.g. 'msi_2016' in the start URL).
    # The original indexed it unconditionally, which raised IndexError on
    # short URLs and left the 'null url' branch unreachable.
    if len(segments) < 6:
        matches.add('null url')
        return matches

    if app is None:
        app = QApplication(sys.argv)

    # The context manager releases the lock even if rendering or parsing
    # raises; a bare acquire()/release() pair would leave it held.
    with lock:
        page = Render(url)
        # NOTE(review): there is no timeout; this spins for as long as the
        # page does not report loadFinished.
        while not page.finished:
            app.processEvents()
            time.sleep(0.01)
        soup = BeautifulSoup(page.frame.toHtml(), 'html.parser')

    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]

    # Pass 1: record match-detail links. This runs as its own pass, before
    # any recursion, so these hrefs are already in seen_urls when deeper
    # pages are crawled.
    for href in hrefs:
        if _is_unhandled_link(href, 'match-details', url):
            seen_urls.append(href)
            if debug:
                # Slicing (rather than indexing 7 and 8) cannot raise when
                # the URL is shorter; output is unchanged when both exist.
                print('found match details for ' + ': '.join(segments[7:9]))
            matches.add(href)

    # Passes 2 and 3: follow series/match pages first, then schedule pages.
    # NOTE(review): hrefs are assumed to be site-relative paths; an absolute
    # href would produce a malformed URL here - confirm against the site.
    site_root = 'http://www.lolesports.com'
    for marker, announcement in (('matches/', 'found a series or match: '),
                                 ('schedule/', None)):
        for href in hrefs:
            # Checked at iteration time on purpose: recursive calls keep
            # adding to seen_urls while this loop is running.
            if _is_unhandled_link(href, marker, url):
                seen_urls.append(href)
                if debug and announcement is not None:
                    print(announcement + href)
                matches.update(
                    scrape_tournament(site_root + href, app, r_depth + 1))

    return matches
if __name__ == '__main__':
    # Script entry point: crawl the MSI 2016 group-stage schedule, print
    # every match link that was found, then print how many there were.
    start_url = "http://www.lolesports.com/en_US/msi/msi_2016/schedule/groups/1"
    app = QApplication(sys.argv)
    match_list = scrape_tournament(start_url, app, 0)
    for match in match_list:
        print(match)
    total = len(match_list)
    print('there were ' + str(total) + ' in total')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment