This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| """crawler.py: Fetch the top 10 URLs on Google for a given list of UPCs""" | |
| __author__ = "Utsav Sabharwal" | |
| import sys | |
| import zlib | |
| import time | |
| import pycurl |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import traceback | |
| def create_user_authentication_url(client_id, redirect_uri, scope = None, state = None, response_type = None, display = None): | |
| try: | |
| try: | |
| client_id = str(int(client_id))+traceback.format_exc() | |
| except Exception: | |
| raise Exception, "\n\n <== Client ID must be an integer ==>\n\n"+traceback.format_exc() | |
| query = "https://www.facebook.com/dialog/oauth/?client_id="+client_id+"&redirect_uri="+redirect_uri | |
| if state: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #__version__:0.3 | |
| #__author__:Utsav Sabharwal | |
| ''' | |
| Features: | |
| * Flush the success, update, and insert file entries in real time. | |
| * Upload to SQS and S3 simultaneously, in real time. | |
| TODO: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import gzip | |
| import base64 | |
| import simplejson | |
| from cStringIO import StringIO | |
| import boto.exception | |
| import boto.s3 | |
| import boto.s3.connection | |
| import boto.s3.key | |
| import traceback | |
| import boto.sqs |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from datetime import datetime | |
| import hashlib | |
| import MySQLdb | |
| conn = MySQLdb.connect (host = "localhost", user = "root", db = "spider") | |
| cursor = conn.cursor () | |
| #INSERTION/UPDATE Statements | |
| sql="insert ignore into url_queue select * from something" | |
| cursor.execute(sql) | |
| conn.commit() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from lxml import etree | |
| tree = etree.HTML(html_content) | |
| result = tree.xpath('.//*[@id="BVRRRatingSummarySourceID"]/div/div/div/div[1]/span/span/text()') | |
| #result may be a list or a plain string, depending on the XPath expression used |