Last active
December 23, 2015 05:09
-
-
Save jschombe/6585250 to your computer and use it in GitHub Desktop.
Find number of days between yelp keyword event
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #yelp cluster finder | |
| import re | |
| import datetime | |
| from datetime import datetime | |
| import collections | |
| from collections import defaultdict | |
def yelp_find(filename, kw):
    """Print and collect every line of *filename* that contains *kw*.

    Args:
        filename: Path of the text file to scan.
        kw: Substring to look for; the test is a plain, case-sensitive
            ``kw in line`` check.

    Returns:
        A list of the matching lines in file order, each still carrying
        its line ending.
    """
    kw_list = []
    # The "U" mode flag was removed in Python 3.11; text mode already
    # applies universal-newline handling on Python 3.
    with open(filename, "r") as review_file:
        for line in review_file:
            if kw in line:
                print(line)
                kw_list.append(line)
    return kw_list
def yelp_find_busid_and_date(filename, kw):
    """Print the business-id field and the date of every matching record.

    The whole file is scanned with a regular expression that captures a
    date (``YYYY-MM-DD`` or ``YYYY/MM/DD``), the text that follows it, and a
    trailing ``"business_id": "<22 characters>"}`` field.  For each match
    the business-id field is printed immediately followed by the date.

    Args:
        filename: Path of the text file to scan.
        kw: Accepted for signature compatibility; currently unused.
    """
    # Raw string so the regex escapes are not treated as (invalid) string
    # escapes by the Python parser.
    pattern = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \"......................\"\})'
    # The "U" mode flag was removed in Python 3.11; text mode already
    # applies universal-newline handling on Python 3.
    with open(filename, "r") as review_file:
        matches = re.findall(pattern, review_file.read())
    for yelpfield in matches:
        print(yelpfield[2] + yelpfield[0])
def yelp_cluster_size(filename, kw):
    """Print the time between the first and last record of each repeated business.

    The file is scanned for records holding a date (``YYYY-MM-DD`` or
    ``YYYY/MM/DD``) followed later on the same line by a
    ``"business_id": "<22 characters>"}`` field.  Dates are grouped by that
    business-id field; for every business seen more than once, the
    business-id field and the ``timedelta`` from its first-seen date to its
    last-seen date (in file order) are printed on consecutive lines.

    Args:
        filename: Path of the text file to scan.
        kw: Accepted for signature compatibility; currently unused, so no
            keyword filtering is applied.
    """
    pattern = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \"......................\"\})'
    business_dict = collections.defaultdict(list)
    # The "U" mode flag was removed in Python 3.11; text mode already
    # applies universal-newline handling on Python 3.
    with open(filename, "r") as review_file:
        for yelpfield in re.findall(pattern, review_file.read()):
            # The pattern accepts '/' separators, but strptime below only
            # understands '-', so normalise before storing.
            business_dict[yelpfield[2]].append(yelpfield[0].replace('/', '-'))

    # Walk the grouped dates directly.  Stringifying the dict and re-parsing
    # it with a greedy regex could never report more than one business.
    for bizkey, dates in business_dict.items():
        if len(dates) < 2:
            continue
        date_object1 = datetime.strptime(dates[0], '%Y-%m-%d')
        date_object2 = datetime.strptime(dates[-1], '%Y-%m-%d')
        print(bizkey)
        print(date_object2 - date_object1)
def days_between_events(filename, numberdays):
    """Print every (date, text, business-id field) match found in *filename*.

    The file is scanned for records holding a date (``YYYY-MM-DD`` or
    ``YYYY/MM/DD``) followed later on the same line by a
    ``"business_id": "<22 characters>"}`` field.  Each match is printed as
    the 3-tuple returned by ``re.findall``.

    Args:
        filename: Path of the text file to scan.
        numberdays: Accepted for signature compatibility; currently unused.
    """
    # TODO: group the dates per business and use ``numberdays`` to select
    # businesses whose events fall within that many days of each other.
    pattern = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \"......................\"\})'
    # The "U" mode flag was removed in Python 3.11; text mode already
    # applies universal-newline handling on Python 3.
    with open(filename, "r") as review_file:
        matches = re.findall(pattern, review_file.read())
    for yelpfield in matches:
        print(yelpfield)
| # | |
| #if __name__ == '__main__': | |
Author
It looks like dates[-1] is returning the string '2' instead of what you expect.
First, get the import statements out of the functions and use a different variable name than tuple.
Second, debug step by step, making absolutely sure each line is doing what you expect. You can use unit tests (http://docs.python.org/2/library/unittest.html) if you feel like "doing it right", or just throw print statements everywhere, which is what many people do.
This probably doesn't work as-is, but it's moving in the direction of simpler functions that are easier to debug.
from collections import defaultdict
def parse_date(datestring):
    """Return the ``datetime`` for a ``YYYY-MM-DD`` or ``YYYY/MM/DD`` string.

    Args:
        datestring: Date text using either ``-`` or ``/`` as the separator.

    Returns:
        A ``datetime`` at midnight on the given day.

    Raises:
        ValueError: If the text is not a valid date in either form.
    """
    # Normalise the separator so a single strptime format covers both forms.
    return datetime.strptime(datestring.replace('/', '-'), '%Y-%m-%d')
def parse_events(filename):
    """Return every (date, text, business-id field) match found in *filename*.

    A match is a date (``YYYY-MM-DD`` or ``YYYY/MM/DD``) followed later on
    the same line by a ``"business_id": "<22 characters>"}`` field.

    Args:
        filename: Path of the text file to scan.

    Returns:
        A list of 3-tuples of strings, in file order: the date, the text
        between the date and the business-id field, and the business-id
        field itself.
    """
    regex = r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \".{22}\"\})'
    # The "U" mode flag was removed in Python 3.11; text mode already
    # applies universal-newline handling on Python 3.  The context manager
    # closes the file even if reading fails.
    with open(filename, "r") as openfile:
        return re.findall(regex, openfile.read())
def key_events_on_biz(event_tuples):
    """Group event dates by business.

    Args:
        event_tuples: Iterable of sequences whose item 0 is the date and
            whose item 2 is the business identifier.

    Returns:
        A ``defaultdict(list)`` mapping each business identifier to the
        list of its dates, in the order they were encountered.
    """
    event_dictionary = defaultdict(list)
    for fields in event_tuples:
        businessid = fields[2]
        date = fields[0]
        event_dictionary[businessid].append(date)
    return event_dictionary
def diff_dates(event_dictionary):
    """Return the time between the first and last date of each business.

    Args:
        event_dictionary: Mapping of business identifier to a list of date
            strings that ``parse_date`` can parse.

    Returns:
        A dict mapping each business identifier to the ``timedelta``
        ``last - first``, where first and last are the first and final
        entries of its list.  Businesses with an empty list are omitted.
    """
    differences = {}
    # items() rather than iteritems() so this runs on Python 2 and 3 alike.
    for biz, datelist in event_dictionary.items():
        if not datelist:
            # Nothing to compare; merely looking up a missing key in a
            # defaultdict(list) leaves an empty list behind.
            continue
        first = parse_date(datelist[0])
        last = parse_date(datelist[-1])
        differences[biz] = last - first
    return differences
if __name__ == "__main__":
    # Script entry point: read the review file named on the command line
    # and print, for each business, the time between its first and last
    # event.
    import sys

    if len(sys.argv) != 2:
        sys.exit("usage: %s REVIEW_FILE" % sys.argv[0])
    filename = sys.argv[1]
    event_tuples = parse_events(filename)
    event_dictionary = key_events_on_biz(event_tuples)
    differences = diff_dates(event_dictionary)
    for biz, delta in differences.items():
        print("%s %s" % (biz, delta))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Okay now I get a new error:
Traceback (most recent call last):
File "<pyshell#68>", line 1, in
yelpclusterfinder.days_between_events('C:\Python27\TRUNCATED.txt', 1)
File "yelpclusterfinder.py", line 56, in days_between_events
date_delta = datetime.strptime(dates[0],('%Y-%m-%d'))-datetime.strptime(dates[-1],('%Y-%m-%d'))
File "C:\Python27\Lib_strptime.py", line 325, in _strptime
(data_string, format))
ValueError: time data '2' does not match format '%Y-%m-%d'
How do I format the 2nd date?