Skip to content

Instantly share code, notes, and snippets.

@MBAustin
Created November 17, 2016 21:23
Show Gist options
  • Select an option

  • Save MBAustin/792610979fb45c7b288fdcc471c85b62 to your computer and use it in GitHub Desktop.

Select an option

Save MBAustin/792610979fb45c7b288fdcc471c85b62 to your computer and use it in GitHub Desktop.
import re, sys, time
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from bs4 import BeautifulSoup
import threading
# Qt mutex created at import time.
# NOTE(review): no live code in the visible part of this file uses `mutex`;
# confirm nothing else depends on it before removing.
mutex = QMutex()
# Re-entrant lock that scrape_tournament holds while a page is being
# rendered and its HTML parsed.
lock = threading.RLock()
class Render(QWebPage):
    """Web page that starts loading ``url`` on construction and records the outcome.

    The load is asynchronous: the caller has to keep processing Qt events
    (for example with ``app.processEvents()``) until ``finished`` is True.
    After that, ``frame`` is the page's main frame and ``frame.toHtml()``
    returns the loaded markup.

    Attributes:
        url: The address passed to the constructor.
        finished: False until the ``loadFinished`` signal has fired.
        succeeded: The flag delivered with ``loadFinished`` (True on a
            successful load); None while the load is still in progress.
        frame: The main frame once loading has finished; None before that,
            so the attribute always exists.
    """

    def __init__(self, url):
        """Connect the load-finished handler and kick off loading ``url``."""
        QWebPage.__init__(self)
        self.url = url
        self.finished = False
        self.succeeded = None
        self.frame = None
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: capture the frame and the load result."""
        self.frame = self.mainFrame()
        self.succeeded = bool(result)
        # Set last so a caller polling `finished` sees the other fields ready.
        self.finished = True
# When True, scrape_tournament prints progress information as it crawls.
debug = True

# Running count of scrape_tournament calls, shown in the debug output.
run_num = 1

# hrefs that have already been collected or followed; shared by every
# (recursive) call to scrape_tournament.
seen_urls = list()

# Install a message handler that ignores everything Qt reports.
# Comment out the call below to see Qt warnings again.
qInstallMsgHandler(lambda *args: None)
def _is_new_link(href, marker, url):
    """Return True if ``href`` contains ``marker`` and has not been handled yet.

    A link counts as handled when it is empty/None, is a substring of the
    page ``url`` currently being scraped, or is already in ``seen_urls``.
    """
    return bool(
        href
        and marker in href
        and href not in url
        and href not in seen_urls
    )


def scrape_tournament(url, app, r_depth):
    """Recursively collect match-details links reachable from ``url``.

    The page at ``url`` is rendered with ``Render`` and every ``<a href>``
    on it is inspected in three passes:

    1. hrefs containing ``'match-details'`` are added to the result;
    2. hrefs containing ``'matches/'`` are followed recursively;
    3. hrefs containing ``'schedule/'`` are followed recursively.

    Every href that is collected or followed is appended to the module-level
    ``seen_urls`` so it is handled only once across the whole crawl.

    Args:
        url: Absolute URL of the page to scrape. It must have at least six
            ``/``-separated parts (the sixth was read as the tournament name).
        app: The ``QApplication`` whose events are processed while the page
            loads; a new one is created when None is passed.
        r_depth: Current recursion depth; only used in debug output.

    Returns:
        A set of href strings. If ``url`` has fewer than six ``/``-separated
        parts the set contains only the marker string ``'null url'``.
    """
    global run_num
    if debug:
        print('run ' + str(run_num) + ' at depth ' + str(r_depth))
        run_num += 1
        print('visiting ' + url)

    matches = set()
    segments = url.split('/')
    # The original test (`tournament_name not in url`) could never be true
    # and indexing [5] raised IndexError on short URLs; reject those URLs
    # explicitly with the same 'null url' marker instead.
    if len(segments) < 6:
        matches.add('null url')
        return matches

    if app is None:
        app = QApplication(sys.argv)

    # `with` guarantees the lock is released even if rendering or parsing
    # raises.
    with lock:
        r = Render(url)
        while not r.finished:
            app.processEvents()
            time.sleep(0.01)
        soup = BeautifulSoup(r.frame.toHtml(), 'html.parser')

    hrefs = [link.get('href') for link in soup.find_all('a')]

    # Label for debug output, built from URL parts 7 and 8 when present;
    # slicing avoids an IndexError on shorter URLs.
    page_label = ': '.join(segments[7:9])

    # Three distinct passes avoid unnecessary recursion: every match-details
    # link on this page is marked as seen before any other page is visited.
    for href in hrefs:
        if _is_new_link(href, 'match-details', url):
            seen_urls.append(href)
            if debug:
                print('found match details for ' + page_label)
            matches.add(href)

    for href in hrefs:
        if _is_new_link(href, 'matches/', url):
            seen_urls.append(href)
            if debug:
                print('found a series or match: ' + href)
            matches.update(
                scrape_tournament('http://www.lolesports.com' + href, app, r_depth + 1)
            )

    for href in hrefs:
        if _is_new_link(href, 'schedule/', url):
            seen_urls.append(href)
            matches.update(
                scrape_tournament('http://www.lolesports.com' + href, app, r_depth + 1)
            )

    return matches
if __name__ == '__main__':
    # Script entry point: crawl from the MSI 2016 group-stage schedule page
    # and print every entry scrape_tournament returns, followed by a count.
    start_url = "http://www.lolesports.com/en_US/msi/msi_2016/schedule/groups/1"
    qt_app = QApplication(sys.argv)
    found = scrape_tournament(start_url, qt_app, 0)
    for entry in found:
        print(entry)
    total = len(found)
    print('there were ' + str(total) + ' in total')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment