Skip to content

Instantly share code, notes, and snippets.

@utsavsabharwal
utsavsabharwal / crawler.py
Created September 4, 2012 08:57
Fetch the top 10 URLs on Google for a given list of UPCs
#!/usr/bin/env python
"""crawler.py: Fetch the top 10 URLs on Google for a given list of UPCs"""
__author__ = "Utsav Sabharwal"
import sys
import zlib
import time
import pycurl
@utsavsabharwal
utsavsabharwal / fb.py
Created August 24, 2012 10:16
Facebook Graph API Authentication
import traceback
def create_user_authentication_url(client_id, redirect_uri, scope = None, state = None, response_type = None, display = None):
try:
try:
client_id = str(int(client_id))+traceback.format_exc()
except Exception:
raise Exception, "\n\n <== Client ID must be an integer ==>\n\n"+traceback.format_exc()
query = "https://www.facebook.com/dialog/oauth/?client_id="+client_id+"&redirect_uri="+redirect_uri
if state:
@utsavsabharwal
utsavsabharwal / crawler.py
Created August 20, 2012 10:10
Web Crawler
#__version__:0.3
#__author__:Utsav Sabharwal
'''
Features:
* Flush the success, update, and insert file entries in real time.
* Upload in real time at the same time to SQS and S3
TODO:
@utsavsabharwal
utsavsabharwal / sqs_count.py
Created August 20, 2012 05:35
Get SQS Count
import gzip
import base64
import simplejson
from cStringIO import StringIO
import boto.exception
import boto.s3
import boto.s3.connection
import boto.s3.key
import traceback
import boto.sqs
@utsavsabharwal
utsavsabharwal / mysql-python-api-example.py
Created August 17, 2012 05:28
MySQL Python API Example
from datetime import datetime
import hashlib
import MySQLdb
# Connect to the "spider" database on localhost as user "root" (no password is passed).
conn = MySQLdb.connect (host = "localhost", user = "root", db = "spider")
# All statements below are issued through this cursor.
cursor = conn.cursor ()
# INSERTION/UPDATE statements
# Copy every row of `something` into `url_queue`. With "insert ignore", MySQL
# downgrades ignorable errors (e.g. duplicate-key conflicts) to warnings and
# skips the offending rows instead of aborting the whole statement.
sql="insert ignore into url_queue select * from something"
cursor.execute(sql)
# Commit the transaction on this connection.
# NOTE(review): presumably required because autocommit is off by default — confirm.
conn.commit()
@utsavsabharwal
utsavsabharwal / python xpath example
Created July 30, 2012 04:02
How to fetch information from an HTML page using XPath in Python
from lxml import etree
# Parse the HTML held in `html_content` (defined outside this snippet) into an element tree.
tree = etree.HTML(html_content)
# Select the text nodes of the nested span located under the element whose id is
# "BVRRRatingSummarySourceID".
result = tree.xpath('.//*[@id="BVRRRatingSummarySourceID"]/div/div/div/div[1]/span/span/text()')
# NOTE: the type of `result` depends on the XPath expression. Node and text()
# selections such as the one above come back as a list (possibly empty), while
# expressions that evaluate to a string, number or boolean return that single value.