import re
import csv

# select file to parse
list_directory = '/Users/ali/Documents/Roadtrippers/Hackday/May2021/'
csv_name = 'hackday_websites_to_parse.csv'
csv_path = list_directory + csv_name

# create a list for the imported data
l = []

# import the file into a nested list
with open(csv_path, 'rb') as csvfile:
    file_reader = csv.reader(csvfile)
    for row in file_reader:
        l.append(row)
# flatten the nested list; the website text is in the third csv column
websites = [item[2] for item in l]

# create a list for found urls
found_urls = []

# parse urls with some regex found on the internet
url_pattern = r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+'
for i in websites:
    # replace the %0A and %20 escapes with spaces to reduce false positives
    found_urls.append(re.findall(url_pattern, re.sub('%20', ' ', re.sub('%0A', ' ', i.lower()))))
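# example (hypothetical, already %20/%0A-cleaned text, not from the real csv):
# re.findall(url_pattern, 'visit www.example.com or http://other-site.org/camping page')
# -> ['www.example.com', 'http://other-site.org/camping']
# number-only strings like '123.123' also match; the filters below deal with those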
# flatten the nested list of matches
all_urls = [item for sublist in found_urls for item in sublist]

# get rid of some low-hanging fruit

# too short
def is_too_short(url):
    # 7 characters, so we don't miss short real sites like rec.gov;
    # anything truly shorter is probably a large organization (i.e. no hidden gems) or non-US
    return len(url) < 7
# test cases
# is_too_short(all_urls[100])
# is_too_short(all_urls[0])
# no letters
# they might be real websites, but we don't want to waste human time trying to figure out
# whether 123.123 has any camping relevance
def has_no_letters(url):
    return len(re.findall('[a-zA-Z]', url)) == 0
# test cases
# has_no_letters(all_urls[100])
# has_no_letters(all_urls[0])
# has_useless_domain
# facebook is a terrible source, and we don't need to check out stuff that's already on RT, etc.
useless_domains = ['facebook\.com', 'paypal\.com', 'gmail\.com', 'youtube\.com', 'google\.com', 'roadtrippers\.com',
                   'paypalobjects\.com', 'constantcontact\.com', 'shutterstock\.com', 'twitter\.com', 'vrbo\.com',
                   'myspace\.com', 'linkedin\.com', 'goodreads\.com', 'etsy\.com', 'amazon\.com', 'ebay\.com', 'ihg\.com',
                   'instagram\.com', 'express\.com', 'pinterest\.com', 'manta\.com', 'goarm\.com', 'army\.mil',
                   'flickr\.com', 'fs\.usda\.gov', 'localedge\.com', 'local\.mysanantonio\.com']
def has_useless_domain(url):
    useless_domain_detected = False
    for i in useless_domains:
        if len(re.findall(i, url)) > 0:
            useless_domain_detected = True
    return useless_domain_detected
# test cases
# has_useless_domain(all_urls[175])
# has_useless_domain(all_urls[0])
# other filters to add later (hedged sketches follow below)
# def has_bad_starting_character(url):
#     starts with a negative
#     starts with a number
#
# def has_erroneous_punctuation(url):
#     2 dots in a row
#
# def is_probably_an_image(url):
#     .gif, .jpeg
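# hedged sketches of those planned filters: guesses at the intended rules,
# not wired into bad_url() below
def has_bad_starting_character(url):
    # starts with a hyphen ("negative") or a digit
    return bool(re.match('[-0-9]', url))

def has_erroneous_punctuation(url):
    # two dots in a row
    return '..' in url

def is_probably_an_image(url):
    # ends in a common image extension (urls were already lowercased above)
    return bool(re.search('\.(gif|jpe?g)$', url))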
def bad_url(url):
    return is_too_short(url) or has_no_letters(url) or has_useless_domain(url)
# test cases
# bad_url(all_urls[0])
# bad_url(all_urls[1])
# bad_url(all_urls[175])
# bad_url(all_urls[199])

# create a list for filtered urls
filtered_urls = []
for i in all_urls:
    if not bad_url(i):
        filtered_urls.append(i)

# dedupe
urls_to_test = list(dict.fromkeys(filtered_urls))
# this narrowed it down by ~43% --> actually more after domain pruning

# see if these places are in RT
# theoretically, many of them should be in RT if the website attribute was at all accurate...
#   but the whole point of this is that we do not think they're all accurate
# additionally, many of these urls were just one of as many as 20 urls listed in a single place's website attribute
# if the url/slug has a match in Places, check whether the id returned matches the id (or ids) from the list of place ids:
#   if yes, offer it up as a potential new url
#   if no, this is a potential duplicated listing
# if the url/slug does not have a match in Places, it could be a hidden gem
# (a hypothetical sketch of that triage follows)
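# hypothetical sketch of that triage; the per-row list of known place ids isn't
# loaded in this script, so this is illustration only
def classify_match(matched_place_id, known_place_ids):
    if matched_place_id is None:
        return 'potential hidden gem'          # no match in Places at all
    if matched_place_id in known_place_ids:
        return 'potential new url for place'   # autocomplete agrees with an id we already have
    return 'potential duplicated listing'      # autocomplete matched a different place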
# generate slugs

# clean up urls
def remove_leading_http(url):
    return re.sub('^https*', '', url)
def remove_leading_colon(url):
    return re.sub('^:', '', url)
def remove_leading_slashes(url):
    return re.sub('^/+', '', url)
def remove_leading_www(url):
    return re.sub('^www\.', '', url)
def remove_prefixes(url):
    return remove_leading_www(remove_leading_slashes(remove_leading_colon(remove_leading_http(url))))
def find_domain_and_tld(url):
    return re.sub('/.*', '', url)
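# example (hypothetical url):
# remove_prefixes('https://www.example.com/path/page') -> 'example.com/path/page'
# find_domain_and_tld('example.com/path/page')         -> 'example.com'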
# find the most common domains
import collections
domains = []
for i in urls_to_test:
    domains.append(find_domain_and_tld(remove_prefixes(i)))
counter = collections.Counter(domains)
print(counter.most_common())

# these are common enough that they probably have a specific url formula that's worth processing
slug_domains = ['foursquare.com', 'yelp.com', 'dps-siteplatform.com', 'urbanspoon.com', 'tripadvisor.com', 'local.yahoo.com']
# collect the urls of these domains to figure out which ones may need special treatment
domain_urls = []
for i in urls_to_test:
    if find_domain_and_tld(remove_prefixes(i)) in slug_domains:
        domain_urls.append(remove_prefixes(i))
# come up with the special treatment
# each site-specific helper returns None when the url doesn't match its expected pattern
def foursquare_slug(url_noprefix):
    if re.search('foursquare\.com/v/', url_noprefix):
        return re.sub('/.*', '', re.sub('foursquare\.com/v/', '', url_noprefix))

def yelp_slug(url_noprefix):
    if re.search('yelp\.com/biz/', url_noprefix):
        return re.sub('yelp\.com/biz/', '', url_noprefix)

def dps_slug(url_noprefix):
    if re.search('dps-siteplatform\.com/', url_noprefix):
        return re.sub('-[0-9]+$', '', re.sub('dps-siteplatform\.com/', '', url_noprefix))

def urbanspoon_slug(url_noprefix):
    if re.search('urbanspoon\.com/r/.+/restaurant/', url_noprefix):
        return re.sub('/', '-', re.sub('urbanspoon\.com/r/.+/restaurant/', '', url_noprefix))

def tripadvisor_slug(url_noprefix):
    if re.search('tripadvisor\.com/.+-reviews-', url_noprefix):
        return re.sub('\.html.*$', '', re.sub('tripadvisor\.com/.+-reviews-', '', url_noprefix))

def yahoo_slug(url_noprefix):
    if re.search('local\.yahoo\.com/info-[0-9]+-', url_noprefix):
        return re.sub('local\.yahoo\.com/info-[0-9]+-', '', url_noprefix)

def generic_slug(url_noprefix):
    return re.sub('\.[a-zA-Z]+$', '', url_noprefix)
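# examples (made-up urls, already lowercased and prefix-stripped as they would be here):
# foursquare_slug('foursquare.com/v/joes-diner/4b8c1d2e')           -> 'joes-diner'
# yelp_slug('yelp.com/biz/joes-diner-portland')                     -> 'joes-diner-portland'
# dps_slug('dps-siteplatform.com/joes-diner-12345')                 -> 'joes-diner'
# urbanspoon_slug('urbanspoon.com/r/23/1496/restaurant/joes-diner') -> 'joes-diner'
# tripadvisor_slug('tripadvisor.com/restaurant_review-g1-d2-reviews-joes_diner-portland.html') -> 'joes_diner-portland'
# yahoo_slug('local.yahoo.com/info-12345678-joes-diner-portland')   -> 'joes-diner-portland'
# generic_slug('joesdiner.com')                                     -> 'joesdiner'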
def url_to_slug(url):
    domain = find_domain_and_tld(remove_prefixes(url))
    slug = None
    if domain in slug_domains:
        if domain == 'foursquare.com':
            slug = foursquare_slug(remove_prefixes(url))
        elif domain == 'yelp.com':
            slug = yelp_slug(remove_prefixes(url))
        elif domain == 'dps-siteplatform.com':
            slug = dps_slug(remove_prefixes(url))
        elif domain == 'urbanspoon.com':
            slug = urbanspoon_slug(remove_prefixes(url))
        elif domain == 'tripadvisor.com':
            slug = tripadvisor_slug(remove_prefixes(url))
        elif domain == 'local.yahoo.com':
            slug = yahoo_slug(remove_prefixes(url))
    else:
        # everything else just gets its domain minus the tld
        slug = generic_slug(domain)
    if slug:
        # normalize: dots and slashes become dashes
        slug = re.sub('/', '-', re.sub('\.', '-', slug))
    return slug
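# examples (hypothetical urls):
# url_to_slug('http://www.yelp.com/biz/joes-diner-portland') -> 'joes-diner-portland'
# url_to_slug('http://www.joesdiner.com/about-us')           -> 'joesdiner'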
# select directory for autocomplete results
autocomplete_directory = list_directory + 'autocomplete/'

import requests
import json
import hmac
import hashlib
import httplib  # Python 2; this is http.client in Python 3

# grab the api keys (one per line in the secrets file)
s = []
with open('/Users/ali/.pythonsecrets', 'rb') as secrets:
    for row in secrets:
        s.append(row.replace('\n', ''))

# create parameters to be used to connect
# keys
clientId = s[0]
secretKey = s[1]
def autocomplete(slug):
    # connection
    conn = httplib.HTTPSConnection('api.boone.ai')
    # endpoint
    endpoint_start = '/api/v2/autocomplete?input='
    endpoint_end = '&locations=country:US|country:CA'
    endpoint = endpoint_start + slug + endpoint_end
    # sign the endpoint with the secret key and send the signature in the hmac header
    signature = hmac.new(secretKey, endpoint, hashlib.sha256).hexdigest()
    headers = {"Content-Type": "application/json",
               "RT-ORG-APP-CLIENT-ID": clientId,
               "RT-ORG-APP-HMAC": signature}
    conn.request("GET", endpoint, None, headers)
    response = conn.getresponse()
    return response
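# possible hardening (an assumption, not something this api is known to require):
# the slug can still contain characters like '?', '=' or '%' left over from the
# extraction regex, so url-encoding it before building the endpoint may be safer, e.g.
#   import urllib
#   endpoint = endpoint_start + urllib.quote(slug, safe='') + endpoint_end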
summary_file = list_directory + 'summary.csv'
outfile = open(summary_file, 'w')
csvwriter = csv.writer(outfile)
header = ['url', 'slug', 'canonical_place_id']
csvwriter.writerow(header)

autocomplete_successes = []
autocomplete_failures = []
for i in urls_to_test:
    print(i)
    slug = url_to_slug(i)
    line = []
    if slug:
        response = autocomplete(slug)
        if response.status == 200:
            response = json.loads(response.read())
            if response["data"] != []:
                line.append(i)
                line.append(slug)
                line.append(response["data"][0]["properties"]["id"])  # canonical_place_id
                autocomplete_successes.append(i)
            else:
                autocomplete_failures.append(i)
    if line:  # only write rows that actually got a match
        csvwriter.writerow(line)
outfile.close()
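# quick tally of how the autocomplete lookups went, using the two lists built above
print('urls with an autocomplete match: %d' % len(autocomplete_successes))
print('urls with no autocomplete match: %d' % len(autocomplete_failures))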