import re
import csv

# select file to parse
list_directory = '/Users/ali/Documents/Roadtrippers/Hackday/May2021/'
csv_name = 'hackday_websites_to_parse.csv'
csv_path = list_directory + csv_name

# create a list for the imported data
l = []

# import the file into a nested list
with open(csv_path, 'rb') as csvfile:
    file_reader = csv.reader(csvfile)
    for row in file_reader:
        l.append(row)
# flatten the nested list; the website text is in the third csv column
websites = [item[2] for item in l]

# create a list for found urls
found_urls = []

# parse urls with some regex found on the internet
url_pattern = r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+'
for i in websites:
    # replace the %0A and %20 escapes with spaces to reduce false positives
    found_urls.append(re.findall(url_pattern, re.sub('%20', ' ', re.sub('%0A', ' ', i.lower()))))
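# example (hypothetical, already %20/%0A-cleaned text, not from the real csv):
# re.findall(url_pattern, 'visit www.example.com or http://other-site.org/camping page')
# -> ['www.example.com', 'http://other-site.org/camping']
# number-only strings like '123.123' also match; the filters below deal with those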
# flatten the nested list of matches
all_urls = [item for sublist in found_urls for item in sublist]

# get rid of some low-hanging fruit

# too short
def is_too_short(url):
    # 7 characters, so we don't miss short real sites like rec.gov;
    # anything truly shorter is probably a large organization (i.e. no hidden gems) or non-US
    return len(url) < 7
# test cases
# is_too_short(all_urls[100])
# is_too_short(all_urls[0])
# no letters
# they might be real websites, but we don't want to waste human time trying to figure out
# whether 123.123 has any camping relevance
def has_no_letters(url):
    return len(re.findall('[a-zA-Z]', url)) == 0
# test cases
# has_no_letters(all_urls[100])
# has_no_letters(all_urls[0])
# has_useless_domain
# facebook is a terrible source, and we don't need to check out stuff that's already on RT, etc.
useless_domains = ['facebook\.com', 'paypal\.com', 'gmail\.com', 'youtube\.com', 'google\.com', 'roadtrippers\.com',
                   'paypalobjects\.com', 'constantcontact\.com', 'shutterstock\.com', 'twitter\.com', 'vrbo\.com',
                   'myspace\.com', 'linkedin\.com', 'goodreads\.com', 'etsy\.com', 'amazon\.com', 'ebay\.com', 'ihg\.com',
                   'instagram\.com', 'express\.com', 'pinterest\.com', 'manta\.com', 'goarm\.com', 'army\.mil',
                   'flickr\.com', 'fs\.usda\.gov', 'localedge\.com', 'local\.mysanantonio\.com']
def has_useless_domain(url):
    useless_domain_detected = False
    for i in useless_domains:
        if len(re.findall(i, url)) > 0:
            useless_domain_detected = True
    return useless_domain_detected
# test cases
# has_useless_domain(all_urls[175])
# has_useless_domain(all_urls[0])
# other filters to add later (hedged sketches follow below)
# def has_bad_starting_character(url):
#     starts with a negative
#     starts with a number
#
# def has_erroneous_punctuation(url):
#     2 dots in a row
#
# def is_probably_an_image(url):
#     .gif, .jpeg
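# hedged sketches of those planned filters: guesses at the intended rules,
# not wired into bad_url() below
def has_bad_starting_character(url):
    # starts with a hyphen ("negative") or a digit
    return bool(re.match('[-0-9]', url))

def has_erroneous_punctuation(url):
    # two dots in a row
    return '..' in url

def is_probably_an_image(url):
    # ends in a common image extension (urls were already lowercased above)
    return bool(re.search('\.(gif|jpe?g)$', url))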
def bad_url(url):
    return is_too_short(url) or has_no_letters(url) or has_useless_domain(url)
# test cases
# bad_url(all_urls[0])
# bad_url(all_urls[1])
# bad_url(all_urls[175])
# bad_url(all_urls[199])

# create a list for filtered urls
filtered_urls = []
for i in all_urls:
    if not bad_url(i):
        filtered_urls.append(i)

# dedupe
urls_to_test = list(dict.fromkeys(filtered_urls))
# this narrowed it down by ~43% --> actually more after domain pruning

# see if these places are in RT
# theoretically, many of them should be in RT if the website attribute was at all accurate...
#   but the whole point of this is that we do not think they're all accurate
# additionally, many of these urls were just one of as many as 20 urls listed in a single place's website attribute
# if the url/slug has a match in Places, check whether the id returned matches the id (or ids) from the list of place ids:
#   if yes, offer it up as a potential new url
#   if no, this is a potential duplicated listing
# if the url/slug does not have a match in Places, it could be a hidden gem
# (a hypothetical sketch of that triage follows)
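# hypothetical sketch of that triage; the per-row list of known place ids isn't
# loaded in this script, so this is illustration only
def classify_match(matched_place_id, known_place_ids):
    if matched_place_id is None:
        return 'potential hidden gem'          # no match in Places at all
    if matched_place_id in known_place_ids:
        return 'potential new url for place'   # autocomplete agrees with an id we already have
    return 'potential duplicated listing'      # autocomplete matched a different place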
# generate slugs

# clean up urls
def remove_leading_http(url):
    return re.sub('^https*', '', url)
def remove_leading_colon(url):
    return re.sub('^:', '', url)
def remove_leading_slashes(url):
    return re.sub('^/+', '', url)
def remove_leading_www(url):
    return re.sub('^www\.', '', url)
def remove_prefixes(url):
    return remove_leading_www(remove_leading_slashes(remove_leading_colon(remove_leading_http(url))))
def find_domain_and_tld(url):
    return re.sub('/.*', '', url)
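# example (hypothetical url):
# remove_prefixes('https://www.example.com/path/page') -> 'example.com/path/page'
# find_domain_and_tld('example.com/path/page')         -> 'example.com'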
# find the most common domains
import collections
domains = []
for i in urls_to_test:
    domains.append(find_domain_and_tld(remove_prefixes(i)))
counter = collections.Counter(domains)
print(counter.most_common())

# these are common enough that they probably have a specific url formula that's worth processing
slug_domains = ['foursquare.com', 'yelp.com', 'dps-siteplatform.com', 'urbanspoon.com', 'tripadvisor.com', 'local.yahoo.com']
# collect the urls of these domains to figure out which ones may need special treatment
domain_urls = []
for i in urls_to_test:
    if find_domain_and_tld(remove_prefixes(i)) in slug_domains:
        domain_urls.append(remove_prefixes(i))
# come up with the special treatment
# each site-specific helper returns None when the url doesn't match its expected pattern
def foursquare_slug(url_noprefix):
    if re.search('foursquare\.com/v/', url_noprefix):
        return re.sub('/.*', '', re.sub('foursquare\.com/v/', '', url_noprefix))

def yelp_slug(url_noprefix):
    if re.search('yelp\.com/biz/', url_noprefix):
        return re.sub('yelp\.com/biz/', '', url_noprefix)

def dps_slug(url_noprefix):
    if re.search('dps-siteplatform\.com/', url_noprefix):
        return re.sub('-[0-9]+$', '', re.sub('dps-siteplatform\.com/', '', url_noprefix))

def urbanspoon_slug(url_noprefix):
    if re.search('urbanspoon\.com/r/.+/restaurant/', url_noprefix):
        return re.sub('/', '-', re.sub('urbanspoon\.com/r/.+/restaurant/', '', url_noprefix))

def tripadvisor_slug(url_noprefix):
    if re.search('tripadvisor\.com/.+-reviews-', url_noprefix):
        return re.sub('\.html.*$', '', re.sub('tripadvisor\.com/.+-reviews-', '', url_noprefix))

def yahoo_slug(url_noprefix):
    if re.search('local\.yahoo\.com/info-[0-9]+-', url_noprefix):
        return re.sub('local\.yahoo\.com/info-[0-9]+-', '', url_noprefix)

def generic_slug(url_noprefix):
    return re.sub('\.[a-zA-Z]+$', '', url_noprefix)
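# examples (made-up urls, already lowercased and prefix-stripped as they would be here):
# foursquare_slug('foursquare.com/v/joes-diner/4b8c1d2e')           -> 'joes-diner'
# yelp_slug('yelp.com/biz/joes-diner-portland')                     -> 'joes-diner-portland'
# dps_slug('dps-siteplatform.com/joes-diner-12345')                 -> 'joes-diner'
# urbanspoon_slug('urbanspoon.com/r/23/1496/restaurant/joes-diner') -> 'joes-diner'
# tripadvisor_slug('tripadvisor.com/restaurant_review-g1-d2-reviews-joes_diner-portland.html') -> 'joes_diner-portland'
# yahoo_slug('local.yahoo.com/info-12345678-joes-diner-portland')   -> 'joes-diner-portland'
# generic_slug('joesdiner.com')                                     -> 'joesdiner'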
def url_to_slug(url):
    domain = find_domain_and_tld(remove_prefixes(url))
    slug = None
    if domain in slug_domains:
        if domain == 'foursquare.com':
            slug = foursquare_slug(remove_prefixes(url))
        elif domain == 'yelp.com':
            slug = yelp_slug(remove_prefixes(url))
        elif domain == 'dps-siteplatform.com':
            slug = dps_slug(remove_prefixes(url))
        elif domain == 'urbanspoon.com':
            slug = urbanspoon_slug(remove_prefixes(url))
        elif domain == 'tripadvisor.com':
            slug = tripadvisor_slug(remove_prefixes(url))
        elif domain == 'local.yahoo.com':
            slug = yahoo_slug(remove_prefixes(url))
    else:
        # everything else just gets its domain minus the tld
        slug = generic_slug(domain)
    if slug:
        # normalize: dots and slashes become dashes
        slug = re.sub('/', '-', re.sub('\.', '-', slug))
    return slug
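# examples (hypothetical urls):
# url_to_slug('http://www.yelp.com/biz/joes-diner-portland') -> 'joes-diner-portland'
# url_to_slug('http://www.joesdiner.com/about-us')           -> 'joesdiner'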
# select directory for autocomplete results
autocomplete_directory = list_directory + 'autocomplete/'

import requests
import json
import hmac
import hashlib
import httplib  # Python 2; this is http.client in Python 3

# grab the api keys (one per line in the secrets file)
s = []
with open('/Users/ali/.pythonsecrets', 'rb') as secrets:
    for row in secrets:
        s.append(row.replace('\n', ''))

# create parameters to be used to connect
# keys
clientId = s[0]
secretKey = s[1]
def autocomplete(slug):
    # connection
    conn = httplib.HTTPSConnection('api.boone.ai')
    # endpoint
    endpoint_start = '/api/v2/autocomplete?input='
    endpoint_end = '&locations=country:US|country:CA'
    endpoint = endpoint_start + slug + endpoint_end
    # sign the endpoint with the secret key and send the signature in the hmac header
    signature = hmac.new(secretKey, endpoint, hashlib.sha256).hexdigest()
    headers = {"Content-Type": "application/json",
               "RT-ORG-APP-CLIENT-ID": clientId,
               "RT-ORG-APP-HMAC": signature}
    conn.request("GET", endpoint, None, headers)
    response = conn.getresponse()
    return response
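# possible hardening (an assumption, not something this api is known to require):
# the slug can still contain characters like '?', '=' or '%' left over from the
# extraction regex, so url-encoding it before building the endpoint may be safer, e.g.
#   import urllib
#   endpoint = endpoint_start + urllib.quote(slug, safe='') + endpoint_end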
summary_file = list_directory + 'summary.csv'
outfile = open(summary_file, 'w')
csvwriter = csv.writer(outfile)
header = ['url', 'slug', 'canonical_place_id']
csvwriter.writerow(header)

autocomplete_successes = []
autocomplete_failures = []
for i in urls_to_test:
    print(i)
    slug = url_to_slug(i)
    line = []
    if slug:
        response = autocomplete(slug)
        if response.status == 200:
            response = json.loads(response.read())
            if response["data"] != []:
                line.append(i)
                line.append(slug)
                line.append(response["data"][0]["properties"]["id"])  # canonical_place_id
                autocomplete_successes.append(i)
            else:
                autocomplete_failures.append(i)
    if line:  # only write rows that actually got a match
        csvwriter.writerow(line)
outfile.close()
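# quick tally of how the autocomplete lookups went, using the two lists built above
print('urls with an autocomplete match: %d' % len(autocomplete_successes))
print('urls with no autocomplete match: %d' % len(autocomplete_failures))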