-
-
Save jschombe/6585250 to your computer and use it in GitHub Desktop.
| #yelp cluster finder | |
| import re | |
| import datetime | |
| from datetime import datetime | |
| import collections | |
| from collections import defaultdict | |
def yelp_find(filename, kw):
    """Print and collect every line of a file that contains a keyword.

    Args:
        filename: Path of the text file to scan.
        kw: Substring to look for; a line matches when ``kw in line``.

    Returns:
        list: The matching lines in file order, each still carrying its
        line ending. (Earlier versions built this list but discarded it.)
    """
    kw_list = []
    # Default text mode instead of "rU": the "U" flag is rejected by
    # Python 3.11+, and the context manager guarantees the handle is closed.
    with open(filename) as open_filename:
        for line in open_filename:
            if kw in line:
                print(line)
                kw_list.append(line)
    return kw_list
def yelp_find_busid_and_date(filename, kw):
    """Print the business-id fragment and date of every review match.

    The whole file is searched for a date (``YYYY-MM-DD`` or ``YYYY/MM/DD``)
    that is followed, on the same line, by a ``"business_id": "<22 chars>"}``
    fragment. For each match the fragment is printed immediately followed by
    the date, with no separator between them.

    Args:
        filename: Path of the review file to scan.
        kw: Unused; kept so existing callers keep working.

    Returns:
        None. Results are written to standard output.
    """
    # Raw string so the backslash escapes reach the regex engine untouched.
    pattern = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \".{22}\"\})'
    # Context manager closes the handle; "U" mode is rejected by Python 3.11+.
    with open(filename) as open_filename:
        yelp_business_id_and_dates = re.findall(pattern, open_filename.read())
    for yelpfield in yelp_business_id_and_dates:
        # Group 3 is the business-id fragment, group 1 the date.
        print(yelpfield[2] + yelpfield[0])
def _dates_by_business(filename):
    """Map each business-id fragment found in *filename* to its dates, in file order."""
    pattern = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \".{22}\"\})'
    # Context manager closes the handle; "U" mode is rejected by Python 3.11+.
    with open(filename) as open_filename:
        matches = re.findall(pattern, open_filename.read())
    business_dict = collections.defaultdict(list)
    for date, _between, businessid in matches:
        business_dict[businessid].append(date)
    return business_dict


def yelp_cluster_size(filename, kw):
    """Print the time span between reviews for every business seen more than once.

    Reviews are grouped by their ``"business_id": "<22 chars>"}`` fragment.
    For each business with at least two dated matches, the fragment is
    printed, followed on the next line by the ``timedelta`` between the
    first and the last date recorded for it (file order, last minus first).

    The previous implementation converted the grouping dict to a string and
    re-parsed that string with a greedy regex, which could report at most
    one business; the dict is now walked directly so every cluster appears.

    Args:
        filename: Path of the review file to scan.
        kw: Unused; no keyword filtering is applied. Kept so existing
            callers keep working.

    Returns:
        None. Results are written to standard output.
    """
    for bizkey, dates in _dates_by_business(filename).items():
        if len(dates) < 2:
            # A single review is not a cluster.
            continue
        # The regex accepts "/" as well as "-" separators; normalise before
        # parsing so strptime does not raise ValueError on "YYYY/MM/DD".
        date_object1 = datetime.strptime(dates[0].replace('/', '-'), '%Y-%m-%d')
        date_object2 = datetime.strptime(dates[-1].replace('/', '-'), '%Y-%m-%d')
        print(bizkey)
        print(date_object2 - date_object1)
def days_between_events(filename, numberdays):
    """Print every raw (date, text-between, business-id) match found in a file.

    Each match of the review pattern is printed as a 3-tuple: the date
    string, the text between the date and the business id, and the
    ``"business_id": "<22 chars>"}`` fragment.

    Args:
        filename: Path of the review file to scan.
        numberdays: Currently unused; kept so existing callers keep working.

    Returns:
        None. Results are written to standard output.
    """
    pattern = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \".{22}\"\})'
    # Context manager closes the handle; "U" mode is rejected by Python 3.11+.
    with open(filename) as open_filename:
        yelp_business_id_and_dates = re.findall(pattern, open_filename.read())
    for yelpfield in yelp_business_id_and_dates:
        print(yelpfield)
    # TODO: group the dates per business and compare their gaps against
    # ``numberdays`` -- presumably the intended purpose of this function;
    # earlier attempts at it were left commented out and never worked.
| # | |
| #if __name__ == '__main__': | |
Okay now I get a new error:
Traceback (most recent call last):
File "<pyshell#68>", line 1, in <module>
yelpclusterfinder.days_between_events('C:\Python27\TRUNCATED.txt', 1)
File "yelpclusterfinder.py", line 56, in days_between_events
date_delta = datetime.strptime(dates[0],('%Y-%m-%d'))-datetime.strptime(dates[-1],('%Y-%m-%d'))
File "C:\Python27\Lib\_strptime.py", line 325, in _strptime
(data_string, format))
ValueError: time data '2' does not match format '%Y-%m-%d'
How do I format the 2nd date?
It looks like dates[-1] is returning the string '2' instead of what you expect.
First, get the import statements out of the functions and use a different variable name than tuple.
Second, debug step by step, making absolutely sure each line is doing what you expect. You can use unit tests (http://docs.python.org/2/library/unittest.html) if you feel like "doing it right", or just throw print statements everywhere, which is what many people do.
This probably doesn't work as-is, but it's moving in the direction of simpler functions that are easier to debug.
from collections import defaultdict
def parse_date(datestring):
    """Convert a ``YYYY-MM-DD`` or ``YYYY/MM/DD`` string into a ``datetime``.

    Args:
        datestring: Date text as captured by the review regex, which allows
            either ``-`` or ``/`` between the fields.

    Returns:
        datetime: The parsed date at midnight.

    Raises:
        ValueError: If the text is not a valid date in either layout.
    """
    # Normalise the separator first: strptime's format is fixed to "-", but
    # the regex feeding this function also matches "/".
    return datetime.strptime(datestring.replace('/', '-'), '%Y-%m-%d')
def parse_events(filename):
    """Extract (date, text-between, business-id) tuples from a review file.

    Args:
        filename: Path of the review file to scan.

    Returns:
        list: One 3-tuple of strings per regex match -- the date, the text
        between the date and the business id, and the
        ``"business_id": "<22 chars>"}`` fragment.
    """
    regex = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \".{22}\"\})'
    # Context manager closes the handle; "U" mode is rejected by Python 3.11+.
    with open(filename) as openfile:
        event_tuples = re.findall(regex, openfile.read())
    return event_tuples
def key_events_on_biz(event_tuples):
    """Group event dates by business id.

    Args:
        event_tuples: Iterable of sequences whose item 0 is the date and
            whose item 2 is the business id.

    Returns:
        defaultdict: Maps each business id to the list of its dates, in the
        order the events were supplied.
    """
    event_dictionary = defaultdict(list)
    # Iterate the parameter itself; the earlier draft looped over an
    # undefined name (``events``) and read the date from another undefined
    # name (``field``), so it raised NameError on the first call.
    for fields in event_tuples:
        businessid = fields[2]
        date = fields[0]
        event_dictionary[businessid].append(date)
    return event_dictionary
def diff_dates(event_dictionary):
    """Compute, per business, the time between its first and last listed date.

    Args:
        event_dictionary: Mapping of business id to a non-empty list of
            date strings accepted by ``parse_date``.

    Returns:
        dict: Maps each business id to ``parse_date(last) - parse_date(first)``,
        where first and last are the list's first and final entries (the
        list is not sorted here).
    """
    differences = {}
    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` is
    # Python 2 only and raises AttributeError on Python 3.
    for biz, datelist in event_dictionary.items():
        first = parse_date(datelist[0])
        last = parse_date(datelist[-1])
        differences[biz] = last - first
    return differences
if __name__ == "__main__":
    # Script entry point: parse the review file named on the command line
    # and print the first-to-last date span for every business found.
    import sys

    # The earlier draft used an undefined ``filename`` (NameError); take it
    # from the command line instead.
    if len(sys.argv) != 2:
        sys.exit("usage: %s FILENAME" % sys.argv[0])
    filename = sys.argv[1]
    event_tuples = parse_events(filename)
    event_dictionary = key_events_on_biz(event_tuples)
    differences = diff_dates(event_dictionary)
    # Show the result; previously it was computed and then discarded.
    for biz, delta in differences.items():
        print("%s %s" % (biz, delta))
How do I refer to the different dates if not by date_object[0] and date_object[-1]?