Created August 20, 2012
Web Crawler
#__version__: 0.3
#__author__: Utsav Sabharwal
'''
Features:
    * Flush the success, update and insert log entries in real time.
    * Upload to SQS and S3 in real time.
TODO:
    Speed test
    Change the way the db updates.
'''
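# Input format (inferred from the tab-split parsing below): the local "urls" file
# holds one record per line with four tab-separated fields:
#   domain <TAB> filename (numeric uid) <TAB> url <TAB> product_id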
import pycurl
import thread
import upload as u
import traceback
import time
import config as cc
from threading import Thread

success = open("success.log", "w+")   # uids for which we set last_fetch=now() and next_fetch="2012-12-12"
failure = open("failure.log", "w+")   # uids for which we set last_fetch=now() and is_disabled=1
update = open("update.log", "w+")     # uid:::final_url pairs written when the request was redirected
insert = open("insert.log", "w+")     # original_url:::product_id pairs written when the request was redirected

invalid_domains = []    # domains that raised curl error 6 (couldn't resolve host); skipped afterwards
thread_started = []     # crude counter of crawls currently in flight

urls = open("urls").readlines()
class web_page:
    '''Accumulates the response body handed in by pycurl's WRITEFUNCTION callback.'''
    def __init__(self):
        self.contents = ''

    def body_callback(self, buf):
        self.contents = self.contents + buf
class Crawler(Thread):
    def __init__(self, id):
        Thread.__init__(self)
        self.id = id

    def run(self):
        # Thread entry point: the crawl runs when start() is called.
        self.crawl()

    def crawl(self):
        try:
            thread_started.append("ok")
            try:
                domain, filename, url, product_id = urls.pop().split("\t")
                domain = domain.strip()
                t = web_page()
                if domain not in invalid_domains:
                    filename = int(filename.strip())
                    url = str(url.strip())
                    product_id = int(product_id.strip())
                    c = pycurl.Curl()
                    c.setopt(pycurl.FOLLOWLOCATION, 1)
                    c.setopt(pycurl.MAXREDIRS, 5)
                    c.setopt(pycurl.CONNECTTIMEOUT, 30)
                    c.setopt(pycurl.TIMEOUT, 300)
                    c.setopt(pycurl.NOSIGNAL, 1)
                    c.setopt(pycurl.URL, url)
                    c.setopt(c.WRITEFUNCTION, t.body_callback)
                    c.perform()
                    original_url = str(url)
                    final_url = str(c.getinfo(pycurl.EFFECTIVE_URL))
                    if not c.errstr():
                        if original_url == final_url:
                            success.write(str(filename) + chr(10))
                            success.flush()
                        else:
                            # Redirected: record the new final url and log the original one for re-insertion.
                            update.write(str(filename) + ":::" + final_url + chr(10))
                            update.flush()
                            insert.write(str(original_url) + ":::" + str(product_id) + chr(10))
                            insert.flush()
                    else:
                        response_code = str(c.getinfo(pycurl.HTTP_CODE))
                        pattern = str(filename) + ":::" + response_code + chr(10)
                        print "failure", pattern
                        failure.write(pattern)
                        failure.flush()
                    c.close()
                else:
                    failure.write(str(filename) + chr(10))
                    failure.flush()
                u.uploader(str(filename), t.contents)
            except Exception, ex:
                failure.write(str(filename) + chr(10))
                print "Crawler Failure:", traceback.format_exc()
                if ex.args and ex.args[0] == 6:
                    # curl error 6: couldn't resolve host, so skip this domain from now on.
                    invalid_domains.append(domain)
            try:
                thread_started.pop()
            except Exception, ex:
                print "Error:", ex
        except Exception, ex:
            print ex
def run(pid, *args):
    # Core worker loop: keep spawning Crawler threads, one url per crawl.
    print "Core thread", pid
    while True:
        t = Crawler(cc.id)
        t.start()
        t.join()
        cc.id += 1

x = 0
while x < 100:
    th = "thread no:" + str(x)
    thread.start_new_thread(run, (th, 2))
    x += 1

while len(urls) > 1000:
    # Keep the main thread alive until the queue drains below 1000 entries.
    time.sleep(10)

print "Ok, got out of the loop", len(urls)