Skip to content

Instantly share code, notes, and snippets.

@jkopelioff
Created January 13, 2016 05:43
Show Gist options
  • Select an option

  • Save jkopelioff/15191cf5b9fd0ad5736a to your computer and use it in GitHub Desktop.

Select an option

Save jkopelioff/15191cf5b9fd0ad5736a to your computer and use it in GitHub Desktop.
<?php
/**
 * Scrapes the OpenTable Los Angeles restaurant listing page, then each
 * restaurant's profile page, and inserts one document per restaurant into
 * the MongoDB collection "restaurants.losangeles" of the "opencorkage"
 * database.
 *
 * Profile page HTML is cached in Redis, keyed by the profile URL, so a
 * re-run does not fetch pages it already has. After every real network
 * fetch the script sleeps for a random 1-10 seconds.
 */
include('../predis/lib/Predis/Autoloader.php');

/**
 * Parse an HTML string into a DOMDocument.
 *
 * Parser warnings from malformed markup are collected by libxml and
 * discarded instead of being printed.
 *
 * @param mixed $html Raw HTML; anything that is not a non-blank string is rejected.
 * @return DOMDocument|null The parsed document, or null when there is nothing to parse
 *                          or parsing failed.
 */
function scrape_load_html($html)
{
    if (!is_string($html) || trim($html) === '') {
        return null;
    }

    $previous = libxml_use_internal_errors(true);
    $document = new DOMDocument();
    $loaded = $document->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $loaded ? $document : null;
}

/**
 * Find the first descendant with the given tag name inside the element
 * that has the given id.
 *
 * @param DOMDocument $document Parsed page.
 * @param string      $id       Id of the container element.
 * @param string      $tag      Tag name of the wanted descendant.
 * @return DOMElement|null Null when the container or the descendant is missing.
 */
function scrape_first_descendant(DOMDocument $document, $id, $tag)
{
    $container = $document->getElementById($id);
    if (!$container) {
        return null;
    }

    // getElementsByTagName() always returns a list object (possibly empty),
    // so the emptiness check has to be made on item(0), not on the list.
    return $container->getElementsByTagName($tag)->item(0);
}

/**
 * Text of the first descendant with the given tag inside the element with
 * the given id, or "" when either is missing.
 *
 * @param DOMDocument $document Parsed page.
 * @param string      $id       Id of the container element.
 * @param string      $tag      Tag name of the wanted descendant.
 * @return string
 */
function scrape_descendant_text(DOMDocument $document, $id, $tag)
{
    $node = scrape_first_descendant($document, $id, $tag);
    return $node ? $node->nodeValue : "";
}

/**
 * Text of the element with the given id, or "" when it is missing.
 *
 * @param DOMDocument $document Parsed page.
 * @param string      $id       Id of the element.
 * @return string
 */
function scrape_element_text(DOMDocument $document, $id)
{
    $node = $document->getElementById($id);
    return $node ? $node->nodeValue : "";
}

$m = new MongoClient(); // connect
$collection = $m->opencorkage->selectCollection("restaurants.losangeles");
$base_url = 'http://www.opentable.com/';
$simple_info = 'httphandlers/RestaurantInfoLiteNew.ashx?rid='; // not used below
$restaurant_listings = 'los-angeles-restaurant-listings';
Predis\Autoloader::register();
$redis = new Predis\Client();

$page = $base_url.$restaurant_listings;
echo "Retrieving... ".$page;
$dom = scrape_load_html(file_get_contents($page));
if ($dom === null) {
    // Without the listing page there is nothing to iterate over.
    echo "\nCould not retrieve or parse ".$page."\n";
    exit(1);
}

$xpath = new DomXPath($dom);
$nodes = $xpath->query("//div[contains(normalize-space(@class),'rinfo')]");
// DOMNodeList only became Countable in PHP 7.2; ->length works everywhere.
$count_total = $nodes->length;
$count = 1;

foreach ($nodes as $node) {
    $link = $xpath->query("a[contains(normalize-space(@class),'r')]", $node)->item(0);
    if (!$link) {
        // No anchor means no name and no profile URL; nothing was fetched,
        // so no throttling is needed before moving on.
        echo "Skipping ".$count." of ".$count_total."... no restaurant link found\n";
        $count++;
        continue;
    }

    $name = $link->nodeValue;
    $rid = $node->getAttribute('rid');
    $href = $link->getAttribute('href');
    $simple_page = $base_url.$href;

    $html_text = $redis->get($simple_page);
    $throttle = true;
    if (empty($html_text)) {
        echo "Retrieving ".$count." of ".$count_total."... ".$simple_page."\n";
        $html_text = file_get_contents($simple_page);
        // Only cache real content, so a failed download is retried on the next run.
        if (!empty($html_text)) {
            $redis->set($simple_page, $html_text);
        }
    } else {
        echo "From Cache ".$count." of ".$count_total."... ".$simple_page."\n";
        $throttle = false;
    }

    $dom_s = scrape_load_html($html_text);
    if ($dom_s === null) {
        echo "Skipping ".$count." of ".$count_total."... could not retrieve or parse ".$simple_page."\n";
    } else {
        $img = scrape_first_descendant($dom_s, 'ProfileOverview_linkRestaurantImage', 'img');
        $image = $img ? $img->getAttribute('src') : "";

        $restaurant = array(
            "name" => $name,
            "description" => preg_replace('/<[^>]*>/', "", scrape_element_text($dom_s, 'DescriptionBlock')),
            "rid" => $rid,
            "ot_url" => $simple_page,
            // Leave the field empty when there is no image, rather than
            // storing the bare site URL.
            "image" => ($image !== "") ? $base_url.$image : "",
            "dining_style" => scrape_descendant_text($dom_s, 'ProfileOverview_DiningStyle', 'span'),
            "website" => scrape_descendant_text($dom_s, 'ProfileOverview_Website', 'a'),
            "phone" => scrape_descendant_text($dom_s, 'ProfileOverview_Phone', 'span'),
            "hours" => scrape_descendant_text($dom_s, 'ProfileOverview_HoursOfOperation', 'span'),
            "payment" => scrape_descendant_text($dom_s, 'ProfileOverview_Payment', 'span'),
            "cuisine" => scrape_descendant_text($dom_s, 'ProfileOverview_AllCuisines', 'span'),
            "dress_code" => scrape_descendant_text($dom_s, 'ProfileOverview_DressCode', 'span'),
            "walkins" => scrape_descendant_text($dom_s, 'ProfileOverview_AcceptsWalkins', 'span'),
            "offers" => scrape_descendant_text($dom_s, 'ProfileOverview_Offers', 'span'),
            "price" => scrape_element_text($dom_s, 'ProfileOverview_lblPriceText'),
            "address" => scrape_element_text($dom_s, 'ProfileOverview_lblAddressText'),
            "neighborhood" => scrape_element_text($dom_s, 'ProfileOverview_lblNeighborhoodText')
        );
        $collection->insert($restaurant);
    }

    if ($throttle) {
        $rand_num = rand(1,10);
        echo "Sleeping for ".$rand_num." sec\n";
        //Sleep random interval so opentable doesn't trip
        sleep($rand_num);
    }
    $count++;
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment