MBAustin · November 17, 2016 21:23
diff --git a/scraper.py b/scraper.py
 import re, sys, time
 from PyQt4.QtGui import *
 from PyQt4.QtCore import *
 from PyQt4.QtWebKit import *
 from bs4 import BeautifulSoup
 import threading


 # Qt mutex; in the visible code it is referenced only from a commented-out
 # line in Render.__init__.
 mutex = QMutex()
 # Re-entrant lock that scrape_tournament holds while a page is being loaded
 # and parsed.
 lock = threading.RLock()




 class Render(QWebPage):
    """QWebPage that starts loading ``url`` as soon as it is constructed.

    The constructor only starts the load; the caller is expected to keep
    processing Qt events until ``finished`` becomes True.

    Attributes:
        url: the address passed to the constructor.
        finished: False until the ``loadFinished`` signal has been handled.
        frame: None while loading, afterwards the page's main frame.
        ok: None while loading, afterwards the success flag delivered by
            ``loadFinished``.
    """

    def __init__(self, url):
        """Connect the load callback and start loading ``url``."""
        QWebPage.__init__(self)

        self.url = url
        self.finished = False
        # Defined up front so that reading them before the load completes
        # yields None instead of raising AttributeError.
        self.frame = None
        self.ok = None

        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: publish the frame and the outcome."""
        self.frame = self.mainFrame()
        # Keep the outcome instead of discarding it, so callers can tell a
        # failed load from a successful one.
        self.ok = bool(result)
        # Set last: pollers treat `finished` as "frame and ok are ready".
        self.finished = True
        
        


 # Number printed for the next scrape_tournament call in the debug trace;
 # it is only advanced while `debug` is true.
 run_num = 1
 # Hrefs that scrape_tournament has already collected or followed.
 seen_urls = []
 # When true, scrape_tournament prints progress information.
 debug = True


 def _discard_qt_message(*args):
    """Message handler that ignores whatever it is given."""
    return None


 # Comment out this line to see Qt warnings:
 qInstallMsgHandler(_discard_qt_message)
 def scrape_tournament(url, app, r_depth):
    """Recursively collect the 'match-details' links reachable from ``url``.

    The page at ``url`` is rendered with ``Render`` and every anchor href is
    inspected:

    * hrefs containing 'match-details' are added to the result;
    * hrefs containing 'matches/' or 'schedule/' are followed recursively,
      prefixed with 'http://www.lolesports.com'.

    Hrefs that occur inside ``url`` or are already listed in the module-level
    ``seen_urls`` are skipped; every href handled here is appended to it.

    Args:
        url: address of the page to scrape. Split on '/', the item at
            index 5 is treated as the tournament name.
        app: QApplication whose events are processed while the page loads;
            when None a new QApplication is created.
        r_depth: recursion depth of this call, used only in debug output.

    Returns:
        A set of href strings. It contains only 'null url' when ``url`` has
        fewer than six '/'-separated parts.
    """
    global run_num
    if debug:
        print('run ' + str(run_num) + ' at depth ' + str(r_depth))
        run_num += 1
        print('visiting ' + url)

    matches = set()
    parts = url.split('/')
    # A URL with fewer than six parts has no tournament segment. Flag it
    # with the 'null url' marker instead of failing with IndexError.
    if len(parts) < 6:
        matches.add('null url')
        return matches

    if app is None:
        app = QApplication(sys.argv)

    # `with` guarantees the lock is released even if rendering or parsing
    # raises.
    with lock:
        page = Render(url)
        # The load completes through Qt events, so pump them until the
        # page reports that it is done.
        while not page.finished:
            app.processEvents()
            time.sleep(0.01)
        soup = BeautifulSoup(page.frame.toHtml(), 'html.parser')

    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]

    # Parts 7 and 8 only feed the debug trace; pad the list so a shorter
    # URL cannot make a trace message raise IndexError.
    trace_head, trace_tail = (parts + ['', '', ''])[7:9]

    # Three separate passes: all match links of this page are recorded in
    # seen_urls before any recursion starts, which avoids unnecessary
    # recursion.
    for href in hrefs:
        if href and 'match-details' in href and href not in url and href not in seen_urls:
            seen_urls.append(href)
            if debug:
                print('found match details for ' + trace_head + ': ' + trace_tail)
            matches.add(href)

    for href in hrefs:
        if href and 'matches/' in href and href not in url and href not in seen_urls:
            seen_urls.append(href)
            if debug:
                print('found a series or match: ' + href)
            matches.update(scrape_tournament('http://www.lolesports.com' + href, app, r_depth + 1))

    for href in hrefs:
        if href and 'schedule/' in href and href not in url and href not in seen_urls:
            seen_urls.append(href)
            matches.update(scrape_tournament('http://www.lolesports.com' + href, app, r_depth + 1))

    return matches
    


 # Script entry point: scrape the hard-coded schedule page and print every
 # collected link followed by the total count.
 if __name__ == '__main__':
    start_url = "http://www.lolesports.com/en_US/msi/msi_2016/schedule/groups/1"
    app = QApplication(sys.argv)
    match_list = scrape_tournament(start_url, app, 0)
    total = len(match_list)
    for match in match_list:
        print(match)
    print('there were ' + str(total) + ' in total')
	import re, sys, time
	from PyQt4.QtGui import *
	from PyQt4.QtCore import *
	from PyQt4.QtWebKit import *
	from bs4 import BeautifulSoup
	import threading


	# Qt mutex; in the visible code it is referenced only from a commented-out
	# line in Render.__init__.
	mutex = QMutex()
	# Re-entrant lock that scrape_tournament holds while a page is being loaded
	# and parsed.
	lock = threading.RLock()




	class Render(QWebPage):
	    """QWebPage that starts loading ``url`` as soon as it is constructed.

	    The constructor only starts the load; the caller is expected to keep
	    processing Qt events until ``finished`` becomes True.

	    Attributes:
	        url: the address passed to the constructor.
	        finished: False until the ``loadFinished`` signal has been handled.
	        frame: None while loading, afterwards the page's main frame.
	        ok: None while loading, afterwards the success flag delivered by
	            ``loadFinished``.
	    """

	    def __init__(self, url):
	        """Connect the load callback and start loading ``url``."""
	        QWebPage.__init__(self)

	        self.url = url
	        self.finished = False
	        # Defined up front so that reading them before the load completes
	        # yields None instead of raising AttributeError.
	        self.frame = None
	        self.ok = None

	        self.loadFinished.connect(self._loadFinished)
	        self.mainFrame().load(QUrl(url))

	    def _loadFinished(self, result):
	        """Slot for ``loadFinished``: publish the frame and the outcome."""
	        self.frame = self.mainFrame()
	        # Keep the outcome instead of discarding it, so callers can tell a
	        # failed load from a successful one.
	        self.ok = bool(result)
	        # Set last: pollers treat `finished` as "frame and ok are ready".
	        self.finished = True




	# Number printed for the next scrape_tournament call in the debug trace;
	# it is only advanced while `debug` is true.
	run_num = 1
	# Hrefs that scrape_tournament has already collected or followed.
	seen_urls = []
	# When true, scrape_tournament prints progress information.
	debug = True


	def _discard_qt_message(*args):
	    """Message handler that ignores whatever it is given."""
	    return None


	# Comment out this line to see Qt warnings:
	qInstallMsgHandler(_discard_qt_message)
	def scrape_tournament(url, app, r_depth):
	    """Recursively collect the 'match-details' links reachable from ``url``.

	    The page at ``url`` is rendered with ``Render`` and every anchor href is
	    inspected:

	    * hrefs containing 'match-details' are added to the result;
	    * hrefs containing 'matches/' or 'schedule/' are followed recursively,
	      prefixed with 'http://www.lolesports.com'.

	    Hrefs that occur inside ``url`` or are already listed in the module-level
	    ``seen_urls`` are skipped; every href handled here is appended to it.

	    Args:
	        url: address of the page to scrape. Split on '/', the item at
	            index 5 is treated as the tournament name.
	        app: QApplication whose events are processed while the page loads;
	            when None a new QApplication is created.
	        r_depth: recursion depth of this call, used only in debug output.

	    Returns:
	        A set of href strings. It contains only 'null url' when ``url`` has
	        fewer than six '/'-separated parts.
	    """
	    global run_num
	    if debug:
	        print('run ' + str(run_num) + ' at depth ' + str(r_depth))
	        run_num += 1
	        print('visiting ' + url)

	    matches = set()
	    parts = url.split('/')
	    # A URL with fewer than six parts has no tournament segment. Flag it
	    # with the 'null url' marker instead of failing with IndexError.
	    if len(parts) < 6:
	        matches.add('null url')
	        return matches

	    if app is None:
	        app = QApplication(sys.argv)

	    # `with` guarantees the lock is released even if rendering or parsing
	    # raises.
	    with lock:
	        page = Render(url)
	        # The load completes through Qt events, so pump them until the
	        # page reports that it is done.
	        while not page.finished:
	            app.processEvents()
	            time.sleep(0.01)
	        soup = BeautifulSoup(page.frame.toHtml(), 'html.parser')

	    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]

	    # Parts 7 and 8 only feed the debug trace; pad the list so a shorter
	    # URL cannot make a trace message raise IndexError.
	    trace_head, trace_tail = (parts + ['', '', ''])[7:9]

	    # Three separate passes: all match links of this page are recorded in
	    # seen_urls before any recursion starts, which avoids unnecessary
	    # recursion.
	    for href in hrefs:
	        if href and 'match-details' in href and href not in url and href not in seen_urls:
	            seen_urls.append(href)
	            if debug:
	                print('found match details for ' + trace_head + ': ' + trace_tail)
	            matches.add(href)

	    for href in hrefs:
	        if href and 'matches/' in href and href not in url and href not in seen_urls:
	            seen_urls.append(href)
	            if debug:
	                print('found a series or match: ' + href)
	            matches.update(scrape_tournament('http://www.lolesports.com' + href, app, r_depth + 1))

	    for href in hrefs:
	        if href and 'schedule/' in href and href not in url and href not in seen_urls:
	            seen_urls.append(href)
	            matches.update(scrape_tournament('http://www.lolesports.com' + href, app, r_depth + 1))

	    return matches



	# Script entry point: scrape the hard-coded schedule page and print every
	# collected link followed by the total count.
	if __name__ == '__main__':
	    start_url = "http://www.lolesports.com/en_US/msi/msi_2016/schedule/groups/1"
	    app = QApplication(sys.argv)
	    match_list = scrape_tournament(start_url, app, 0)
	    total = len(match_list)
	    for match in match_list:
	        print(match)
	    print('there were ' + str(total) + ' in total')
No results found