MBAustin · November 18, 2016 05:34
diff --git a/gistfile1.txt b/gistfile1.txt
 import re, sys, time
 from PyQt4 import QtCore, QtGui, QtWebKit
 from bs4 import BeautifulSoup

 # Uncomment the next line to silence Qt warnings:
 # QtCore.qInstallMsgHandler(lambda *args: None)
 # QtWebKit.QWebSettings.PluginsEnabled = True
 # Module-level flags. No code visible in this file reads either of them.
 debug, finished = True, False

 class Scraper(QtWebKit.QWebPage):
    """Web page that crawls schedule/match pages and collects match-details links.

    Pages are loaded one at a time through this page's main frame.  Every
    ``<a href>`` on a loaded page is inspected: hrefs containing
    ``match-details`` are recorded in ``data_urls``; hrefs containing
    ``matches/`` or ``schedule/`` are prefixed with ``prefix`` and queued
    for a later visit.

    Sequencing is driven by the frame's ``loadFinished`` signal.  No mutex is
    taken: locking a non-recursive ``QMutex`` twice from the same thread
    blocks that thread forever, which would stop the event loop that page
    loading depends on.

    Attributes:
        prefix: Scheme and host prepended to every queued href.
        sUrls: URLs of the pages processed so far, in visit order.
        s_queue: Absolute URLs waiting to be loaded (popped from the end).
        data_urls: Distinct ``match-details`` hrefs found so far.
        mutex, finished_mutex: Never locked by this class; kept only so
            code that references these attributes keeps working.
    """

    def __init__(self):
        super(Scraper, self).__init__()
        # Retained for attribute compatibility only; see the class docstring.
        self.mutex = QtCore.QMutex()
        self.finished_mutex = QtCore.QMutex()
        self.prefix = 'http://www.lolesports.com'
        self.sUrls = []
        self.s_queue = []
        self.data_urls = []
        # Absolute URLs that have ever been queued, so no page is queued twice.
        self._queued = set()
        # True once the queue has drained after a call to start().
        self._done = False
        # Local event loop start() is blocked in; None when nobody is waiting.
        self._wait_loop = None

        self.mainFrame().loadFinished.connect(self.handle_load_finished)

    def start(self, url):
        """Crawl starting at *url*; return once the queue has been drained.

        Blocks in a local ``QEventLoop`` so that network and signal events
        keep being delivered while the caller waits.
        """
        print('starting')
        self._done = False
        self._queued.add(url)
        self.s_queue.append(url)
        self.fetch_next()
        print('waiting to finish')
        if self._done:
            # The crawl already completed; entering the loop now would
            # wait for a quit() that has already been issued.
            return
        loop = QtCore.QEventLoop()
        self._wait_loop = loop
        try:
            loop.exec_()
        finally:
            self._wait_loop = None

    def fetch_next(self):
        """Start loading the most recently queued URL, or finish if none is left."""
        print('length of queue is {0}'.format(len(self.s_queue)))
        print('number of pages visited: {0}'.format(len(self.sUrls)))

        if not self.s_queue:
            # Nothing to pop; treat an empty queue as the end of the crawl.
            self._finish()
            return
        self.mainFrame().load(QtCore.QUrl(self.s_queue.pop()))

    def process_current_page(self):
        """Record the current page as visited and harvest its links."""
        frame = self.mainFrame()
        url = frame.url().toString()
        self.sUrls.append(url)
        print('visiting {0}'.format(url))

        soup = BeautifulSoup(frame.toHtml(), 'html.parser')
        links = [tag.get('href') for tag in soup.find_all('a')]

        # Collect data links first, then queue matches, then schedules.
        # The queue is popped from the end, so schedules are visited first.
        for d_link in links:
            if self.check_details(d_link, url) and d_link not in self.data_urls:
                self.data_urls.append(d_link)
        for m_link in links:
            if self.check_matches(m_link, url):
                self._enqueue(self.prefix + m_link)
        for s_link in links:
            if self.check_schedules(s_link, url):
                self._enqueue(self.prefix + s_link)

    def handle_load_finished(self):
        """Slot for ``loadFinished``: process the page, then load the next one."""
        print('in handle_load_finished')
        self.frame = self.mainFrame()
        self.process_current_page()

        print('found {0} match data'.format(len(self.data_urls)))
        if self.s_queue:
            self.fetch_next()
        else:
            # Only signal completion once there is nothing left to visit.
            self._finish()

    def check_href(self, h, current_url):
        """Return a truthy value if *h* is any link of interest not yet queued."""
        return (
            h
            and ('schedule/' in h or 'matches/' in h or 'match-details' in h)
            and self._is_new(h)
        )

    def check_details(self, h, current_url):
        """Return a truthy value if *h* is a match-details link."""
        return h and 'match-details' in h

    def check_matches(self, h, current_url):
        """Return a truthy value if *h* is a match page not yet queued."""
        return h and 'matches/' in h and self._is_new(h)

    def check_schedules(self, h, current_url):
        """Return a truthy value if *h* is a schedule page not yet queued."""
        return h and 'schedule/' in h and self._is_new(h)

    def _is_new(self, h):
        """Return True if href *h*, made absolute with the prefix, was never queued."""
        return (self.prefix + h) not in self._queued

    def _enqueue(self, absolute_url):
        """Queue *absolute_url* for a visit and remember that it was queued."""
        self._queued.add(absolute_url)
        self.s_queue.append(absolute_url)

    def _finish(self):
        """Mark the crawl complete and wake start() if it is waiting."""
        self._done = True
        if self._wait_loop is not None:
            self._wait_loop.quit()

 if __name__ == '__main__':
    # The application object is created before the page that depends on it.
    app = QtGui.QApplication(sys.argv)
    scraper = Scraper()
    # start() waits for the crawl to finish before returning, so there is
    # nothing left for a further app.exec_() to do: it would only enter an
    # event loop that nothing ever quits and keep the process alive forever.
    scraper.start('http://www.lolesports.com/en_US/worlds/world_championship_2016/schedule/default')
    sys.exit(0)
	import re, sys, time
	from PyQt4 import QtCore, QtGui, QtWebKit
	from bs4 import BeautifulSoup

	# Uncomment the next line to silence Qt warnings:
	# QtCore.qInstallMsgHandler(lambda *args: None)
	# QtWebKit.QWebSettings.PluginsEnabled = True
	# Module-level flags. No code visible in this file reads either of them.
	debug, finished = True, False

	class Scraper(QtWebKit.QWebPage):
	    """Web page that crawls schedule/match pages and collects match-details links.

	    Pages are loaded one at a time through this page's main frame.  Every
	    ``<a href>`` on a loaded page is inspected: hrefs containing
	    ``match-details`` are recorded in ``data_urls``; hrefs containing
	    ``matches/`` or ``schedule/`` are prefixed with ``prefix`` and queued
	    for a later visit.

	    Sequencing is driven by the frame's ``loadFinished`` signal.  No mutex
	    is taken: locking a non-recursive ``QMutex`` twice from the same thread
	    blocks that thread forever, which would stop the event loop that page
	    loading depends on.

	    Attributes:
	        prefix: Scheme and host prepended to every queued href.
	        sUrls: URLs of the pages processed so far, in visit order.
	        s_queue: Absolute URLs waiting to be loaded (popped from the end).
	        data_urls: Distinct ``match-details`` hrefs found so far.
	        mutex, finished_mutex: Never locked by this class; kept only so
	            code that references these attributes keeps working.
	    """

	    def __init__(self):
	        super(Scraper, self).__init__()
	        # Retained for attribute compatibility only; see the class docstring.
	        self.mutex = QtCore.QMutex()
	        self.finished_mutex = QtCore.QMutex()
	        self.prefix = 'http://www.lolesports.com'
	        self.sUrls = []
	        self.s_queue = []
	        self.data_urls = []
	        # Absolute URLs that have ever been queued, so no page is queued twice.
	        self._queued = set()
	        # True once the queue has drained after a call to start().
	        self._done = False
	        # Local event loop start() is blocked in; None when nobody is waiting.
	        self._wait_loop = None

	        self.mainFrame().loadFinished.connect(self.handle_load_finished)

	    def start(self, url):
	        """Crawl starting at *url*; return once the queue has been drained.

	        Blocks in a local ``QEventLoop`` so that network and signal events
	        keep being delivered while the caller waits.
	        """
	        print('starting')
	        self._done = False
	        self._queued.add(url)
	        self.s_queue.append(url)
	        self.fetch_next()
	        print('waiting to finish')
	        if self._done:
	            # The crawl already completed; entering the loop now would
	            # wait for a quit() that has already been issued.
	            return
	        loop = QtCore.QEventLoop()
	        self._wait_loop = loop
	        try:
	            loop.exec_()
	        finally:
	            self._wait_loop = None

	    def fetch_next(self):
	        """Start loading the most recently queued URL, or finish if none is left."""
	        print('length of queue is {0}'.format(len(self.s_queue)))
	        print('number of pages visited: {0}'.format(len(self.sUrls)))

	        if not self.s_queue:
	            # Nothing to pop; treat an empty queue as the end of the crawl.
	            self._finish()
	            return
	        self.mainFrame().load(QtCore.QUrl(self.s_queue.pop()))

	    def process_current_page(self):
	        """Record the current page as visited and harvest its links."""
	        frame = self.mainFrame()
	        url = frame.url().toString()
	        self.sUrls.append(url)
	        print('visiting {0}'.format(url))

	        soup = BeautifulSoup(frame.toHtml(), 'html.parser')
	        links = [tag.get('href') for tag in soup.find_all('a')]

	        # Collect data links first, then queue matches, then schedules.
	        # The queue is popped from the end, so schedules are visited first.
	        for d_link in links:
	            if self.check_details(d_link, url) and d_link not in self.data_urls:
	                self.data_urls.append(d_link)
	        for m_link in links:
	            if self.check_matches(m_link, url):
	                self._enqueue(self.prefix + m_link)
	        for s_link in links:
	            if self.check_schedules(s_link, url):
	                self._enqueue(self.prefix + s_link)

	    def handle_load_finished(self):
	        """Slot for ``loadFinished``: process the page, then load the next one."""
	        print('in handle_load_finished')
	        self.frame = self.mainFrame()
	        self.process_current_page()

	        print('found {0} match data'.format(len(self.data_urls)))
	        if self.s_queue:
	            self.fetch_next()
	        else:
	            # Only signal completion once there is nothing left to visit.
	            self._finish()

	    def check_href(self, h, current_url):
	        """Return a truthy value if *h* is any link of interest not yet queued."""
	        return (
	            h
	            and ('schedule/' in h or 'matches/' in h or 'match-details' in h)
	            and self._is_new(h)
	        )

	    def check_details(self, h, current_url):
	        """Return a truthy value if *h* is a match-details link."""
	        return h and 'match-details' in h

	    def check_matches(self, h, current_url):
	        """Return a truthy value if *h* is a match page not yet queued."""
	        return h and 'matches/' in h and self._is_new(h)

	    def check_schedules(self, h, current_url):
	        """Return a truthy value if *h* is a schedule page not yet queued."""
	        return h and 'schedule/' in h and self._is_new(h)

	    def _is_new(self, h):
	        """Return True if href *h*, made absolute with the prefix, was never queued."""
	        return (self.prefix + h) not in self._queued

	    def _enqueue(self, absolute_url):
	        """Queue *absolute_url* for a visit and remember that it was queued."""
	        self._queued.add(absolute_url)
	        self.s_queue.append(absolute_url)

	    def _finish(self):
	        """Mark the crawl complete and wake start() if it is waiting."""
	        self._done = True
	        if self._wait_loop is not None:
	            self._wait_loop.quit()

	if __name__ == '__main__':
	    # The application object is created before the page that depends on it.
	    app = QtGui.QApplication(sys.argv)
	    scraper = Scraper()
	    # start() waits for the crawl to finish before returning, so there is
	    # nothing left for a further app.exec_() to do: it would only enter an
	    # event loop that nothing ever quits and keep the process alive forever.
	    scraper.start('http://www.lolesports.com/en_US/worlds/world_championship_2016/schedule/default')
	    sys.exit(0)
No results found