Last active
December 23, 2015 05:09
-
-
Save jschombe/6585250 to your computer and use it in GitHub Desktop.
Find the number of days between Yelp keyword events
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #yelp cluster finder | |
| import re | |
| import datetime | |
| from datetime import datetime | |
| import collections | |
| from collections import defaultdict | |
def yelp_find(filename, kw):
    """Print and collect every line of a file that contains a keyword.

    Args:
        filename: Path of the text file to scan line by line.
        kw: Substring to search for; the comparison is case-sensitive.

    Returns:
        A list of the matching lines in file order, each still carrying its
        line ending.  The list used to be built and then thrown away, so the
        function returned None; callers that ignore the result are unaffected.
    """
    kw_list = []
    # The "rU" mode was removed in Python 3.11; plain text mode is accepted
    # everywhere.  The with-block closes the handle, which was leaked before.
    with open(filename, "r") as review_file:
        for line in review_file:
            if kw in line:
                # Single-argument print() behaves the same on Python 2 and 3.
                print(line)
                kw_list.append(line)
    return kw_list
def yelp_find_busid_and_date(filename, kw):
    """Print the business id and date captured from each matching line.

    A line matches when it contains a YYYY-MM-DD (or YYYY/MM/DD) date that is
    followed, later on the same line, by a ``"business_id": "<22 chars>"}``
    fragment.  For every match the business-id fragment is printed
    immediately followed by the date, one match per output line.

    Args:
        filename: Path of the review file to read.
        kw: Accepted for interface compatibility; this function does not
            filter by keyword.
    """
    # Raw string: the previous plain literal relied on invalid escape
    # sequences such as "\d", which newer Python versions warn about.
    review_pattern = re.compile(
        r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \"......................\"\})'
    )
    # "rU" was removed in Python 3.11; the with-block also closes the handle
    # that was previously left open.
    with open(filename, "r") as review_file:
        matches = review_pattern.findall(review_file.read())
    for date, _between, business_id in matches:
        print(business_id + date)
def yelp_cluster_size(filename, kw):
    """Print, per business, the time between its keyword-review dates.

    Only lines containing ``kw`` are considered.  From each such line the
    date (YYYY-MM-DD or YYYY/MM/DD) and the trailing
    ``"business_id": "<22 chars>"}`` fragment are captured and the dates are
    grouped by that fragment.  For every business with at least two captured
    dates, two lines are printed: the business-id fragment, then the
    ``timedelta`` from the first captured date to the last one, in the order
    the reviews appear in the file (so the value is negative when the file
    is not chronological).

    Args:
        filename: Path of the review file to read.
        kw: Substring a line must contain to be counted; an empty string
            keeps every line.

    Raises:
        ValueError: If a captured date is not a real calendar date.
    """
    review_pattern = re.compile(
        r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \"......................\"\})'
    )
    business_dict = collections.defaultdict(list)

    # "rU" was removed in Python 3.11; the with-block closes the handle that
    # used to be leaked.  No part of the pattern can match a newline, so
    # matching line by line finds the same fields as matching the whole text.
    with open(filename, "r") as review_file:
        for line in review_file:
            if kw not in line:
                continue
            for date, _between, businessid in review_pattern.findall(line):
                business_dict[businessid].append(date)

    # Walk the grouping directly.  Converting it to a string and re-parsing
    # that string with a greedy regex reported at most one business.
    for bizkey, dates in business_dict.items():
        if len(dates) < 2:
            continue
        # The capture pattern allows "/" separators; normalise them so that
        # strptime does not reject dates the regex accepted.
        first_seen = datetime.strptime(dates[0].replace("/", "-"), "%Y-%m-%d")
        last_seen = datetime.strptime(dates[-1].replace("/", "-"), "%Y-%m-%d")
        print(bizkey)
        print(last_seen - first_seen)
def days_between_events(filename, numberdays):
    """Print the raw (date, text-between, business-id) tuple for each review.

    Each match of the review pattern yields a 3-tuple: the date
    (YYYY-MM-DD or YYYY/MM/DD), the text between the date and the business
    id, and the ``"business_id": "<22 chars>"}`` fragment.  Every tuple is
    printed on its own line.

    Args:
        filename: Path of the review file to read.
        numberdays: Accepted for interface compatibility; not used yet.
    """
    review_pattern = re.compile(
        r'(\d{4}[-/]\d{2}[-/]\d{2})(.+)(\"business_id\"\: \"......................\"\})'
    )
    # "rU" was removed in Python 3.11; the with-block closes the handle that
    # was previously left open.
    with open(filename, "r") as review_file:
        matches = review_pattern.findall(review_file.read())
    for yelpfield in matches:
        print(yelpfield)
    # TODO(review): the disabled experiments that used to follow suggest the
    # intended next step is to group dates per business, sort them, and
    # report businesses whose first and last dates are fewer than
    # `numberdays` apart -- confirm with the author before implementing.
| # | |
| #if __name__ == '__main__': | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This probably doesn't work as-is, but it's moving in the direction of simpler functions that are easier to debug.