Skip to content

Instantly share code, notes, and snippets.

@jschombe
Last active December 23, 2015 05:09
Show Gist options
  • Select an option

  • Save jschombe/6585250 to your computer and use it in GitHub Desktop.

Select an option

Save jschombe/6585250 to your computer and use it in GitHub Desktop.
Find the number of days between Yelp keyword events
#yelp cluster finder
import re
import datetime
from datetime import datetime
import collections
from collections import defaultdict
def yelp_find(filename, kw):
    """Print and collect every line of *filename* that contains *kw*.

    Args:
        filename: Path of the text file to scan, one record per line.
        kw: Substring to look for; the match is a plain, case-sensitive
            ``in`` test against each raw line.

    Returns:
        The matching lines in file order, each still carrying its trailing
        newline. (Earlier versions built this list but discarded it.)
    """
    kw_list = []
    # Plain text mode instead of "rU": the "U" flag was removed in
    # Python 3.11. The context manager closes the handle, which the
    # previous version never did.
    with open(filename) as review_file:
        for line in review_file:
            if kw in line:
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print(line)
                kw_list.append(line)
    return kw_list
def yelp_find_busid_and_date(filename, kw):
    """Print the business-id field followed by the date for each review.

    The whole file is searched for a ``YYYY-MM-DD`` (or ``/``-separated)
    date that is followed, on the same line, by a trailing
    ``"business_id": "<22 characters>"}`` field.

    Args:
        filename: Path of the review file to scan.
        kw: Currently unused; kept so existing callers keep working.

    Returns:
        None. One line is printed per match: the raw business-id field
        immediately followed by the date string.
    """
    # Raw string so the regex escapes are not treated as (invalid) string
    # escapes; the compiled pattern is the same as before.
    pattern = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \"......................\"\})'
    # Plain text mode instead of "rU" (removed in Python 3.11); the context
    # manager closes the handle once the contents are read.
    with open(filename) as review_file:
        yelp_business_id_and_dates = re.findall(pattern, review_file.read())
    for yelpfield in yelp_business_id_and_dates:
        # yelpfield is (date, text in between, business-id field).
        print(yelpfield[2] + yelpfield[0])
def yelp_cluster_size(filename, kw):
    """Report the span of days covered by each business's reviews.

    Every ``(date, ..., business-id field)`` match in the file is grouped by
    its business-id field. For each business with more than one date, the
    difference between its latest and earliest date is printed and recorded.

    The previous version converted the grouped dict to a string and ran a
    second greedy regex over it, which could yield at most one business and
    an arbitrary pair of dates. The grouped data is now used directly.

    Args:
        filename: Path of the review file to scan.
        kw: Currently unused. NOTE(review): the original comments suggest
            keyword filtering was planned here -- confirm before adding it.

    Returns:
        dict mapping the raw business-id field (for example
        ``'"business_id": "<22 characters>"}'``) to a ``timedelta`` equal
        to latest date minus earliest date. Businesses with a single date
        are omitted.
    """
    pattern = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \"......................\"\})'
    # Plain text mode instead of "rU" (removed in Python 3.11); the context
    # manager closes the handle once the contents are read.
    with open(filename) as review_file:
        yelp_business_id_and_dates = re.findall(pattern, review_file.read())

    business_dict = collections.defaultdict(list)
    for yelpfield in yelp_business_id_and_dates:
        business_dict[yelpfield[2]].append(yelpfield[0])

    cluster_spans = {}
    for bizkey, dates in business_dict.items():
        if len(dates) < 2:
            # A single event has no "days between" to report.
            continue
        # The regex accepts "/" as a separator but the strptime format only
        # knows "-", so normalise before parsing.
        parsed = sorted(
            datetime.strptime(date.replace("/", "-"), "%Y-%m-%d")
            for date in dates
        )
        date_delta = parsed[-1] - parsed[0]
        print(bizkey)
        print(date_delta)
        cluster_spans[bizkey] = date_delta
    return cluster_spans
def days_between_events(filename, numberdays):
    """Print every (date, text in between, business-id field) match in the file.

    Args:
        filename: Path of the review file to scan.
        numberdays: Currently unused; kept so existing callers keep working.

    Returns:
        None. Each regex match is printed as a 3-tuple, one per line.
    """
    pattern = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \"......................\"\})'
    # Plain text mode instead of "rU" (removed in Python 3.11); the context
    # manager closes the handle once the contents are read.
    with open(filename) as review_file:
        yelp_business_id_and_dates = re.findall(pattern, review_file.read())
    for yelpfield in yelp_business_id_and_dates:
        # Single-argument print(...) shows the tuple repr on Python 2 and 3.
        print(yelpfield)
    # TODO(review): the removed commented-out experiments suggest the goal
    # was to group dates per business and report those whose events fall
    # within `numberdays` of each other -- confirm before implementing.
#if __name__ == '__main__':
@selik
Copy link

selik commented Sep 25, 2013

I suggest not using a built-in name like tuple as a variable name. Shadowing a built-in can be confusing.

@selik
Copy link

selik commented Sep 25, 2013

The no attribute __getitem__ error comes from trying to get an element from a datetime object as if it were a list object: date_object[0].

@jschombe
Copy link
Author

How do I refer to the different dates if not by date_object[0] and date_object[-1]?

@jschombe
Copy link
Author

Okay now I get a new error:
Traceback (most recent call last):
File "<pyshell#68>", line 1, in
yelpclusterfinder.days_between_events('C:\Python27\TRUNCATED.txt', 1)
File "yelpclusterfinder.py", line 56, in days_between_events
date_delta = datetime.strptime(dates[0],('%Y-%m-%d'))-datetime.strptime(dates[-1],('%Y-%m-%d'))
File "C:\Python27\Lib_strptime.py", line 325, in _strptime
(data_string, format))
ValueError: time data '2' does not match format '%Y-%m-%d'
How do I format the 2nd date?

@selik
Copy link

selik commented Sep 26, 2013

It looks like dates[-1] is returning the string '2' instead of what you expect.

First, get the import statements out of the functions and use a different variable name than tuple.
Second, debug step by step, making absolutely sure each line is doing what you expect. You can use unit tests (http://docs.python.org/2/library/unittest.html) if you feel like "doing it right", or just throw print statements everywhere, which is what many people do.

@selik
Copy link

selik commented Sep 30, 2013

This probably doesn't work as-is, but it's moving in the direction of simpler functions that are easier to debug.

from collections import defaultdict

def parse_date(datestring):
    """Parse a ``YYYY-MM-DD`` or ``YYYY/MM/DD`` string into a ``datetime``.

    The event regex in this file matches either ``-`` or ``/`` between the
    date parts, so both separators are accepted here; previously a
    ``/``-separated date raised ``ValueError``.

    Args:
        datestring: Date text such as ``"2013-09-25"`` or ``"2013/09/25"``.

    Returns:
        A ``datetime`` at midnight on the given day.

    Raises:
        ValueError: If the text is not a valid year-month-day date.
    """
    return datetime.strptime(datestring.replace('/', '-'), '%Y-%m-%d')

def parse_events(filename):
    """Extract (date, text in between, business-id field) tuples from a file.

    Args:
        filename: Path of the review file to scan.

    Returns:
        The list produced by ``re.findall``: one 3-tuple of strings per
        match, holding the date, the text between the date and the
        business-id field, and the ``"business_id": "<22 characters>"}``
        field itself.
    """
    regex = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \".{22}\"\})'
    # Plain text mode instead of "rU" (removed in Python 3.11); the context
    # manager closes the handle, which the bare open() call never did.
    with open(filename) as openfile:
        event_tuples = re.findall(regex, openfile.read())
    return event_tuples

def key_events_on_biz(event_tuples):
    """Group event dates by business id.

    Args:
        event_tuples: Iterable of sequences whose item 0 is the date and
            whose item 2 is the business id.

    Returns:
        A ``defaultdict(list)`` mapping each business id to its dates, in
        the order the events were supplied.
    """
    event_dictionary = defaultdict(list)
    # Iterate the parameter itself; the earlier draft looped over an
    # undefined name (`events`) and indexed another (`field`).
    for fields in event_tuples:
        businessid = fields[2]
        date = fields[0]
        event_dictionary[businessid].append(date)
    return event_dictionary

def diff_dates(event_dictionary):
    """Compute, per business, the time between its first and last listed date.

    Args:
        event_dictionary: Mapping of business id to a non-empty list of date
            strings accepted by ``parse_date``.

    Returns:
        dict mapping each business id to ``last - first``, where *first* and
        *last* are the first and last entries of its list as given (the list
        is not sorted here, so the result can be negative).
    """
    differences = {}
    # items() works on both Python 2 and 3; iteritems() exists only on 2.
    for biz, datelist in event_dictionary.items():
        first = parse_date(datelist[0])
        last = parse_date(datelist[-1])
        differences[biz] = last - first
    return differences

if __name__ == "__main__":
    import sys

    # The review file path comes from the command line; the earlier draft
    # referenced a `filename` variable that was never defined.
    if len(sys.argv) != 2:
        sys.exit("usage: %s REVIEW_FILE" % sys.argv[0])
    filename = sys.argv[1]

    event_tuples = parse_events(filename)
    event_dictionary = key_events_on_biz(event_tuples)
    differences = diff_dates(event_dictionary)
    # Show the result instead of computing it and discarding it.
    for biz, delta in sorted(differences.items()):
        print("%s %s" % (biz, delta))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment