-
-
Save scrapehero/bc34513e2ea72dc0890ad47fbd8a1a4f to your computer and use it in GitHub Desktop.
| import json | |
| import requests | |
| from lxml import html | |
| from collections import OrderedDict | |
| import argparse | |
def parse(source, destination, date):
    """Scrape one-way flight listings from Expedia for a route and date.

    Args:
        source: departure airport/city code, e.g. 'nyc'.
        destination: arrival airport/city code, e.g. 'mia'.
        date: departure date in MM/DD/YYYY format.

    Returns:
        A list of flight dicts sorted by ascending ticket price, or a dict
        with an 'error' key if the page could not be processed after five
        attempts.
    """
    # URL and headers are loop-invariant; build them once outside the retries.
    url = ("https://www.expedia.com/Flights-Search?trip=oneway"
           "&leg1=from:{0},to:{1},departure:{2}TANYT"
           "&passengers=adults:1,children:0,seniors:0,infantinlap:Y"
           "&options=cabinclass%3Aeconomy&mode=search"
           "&origref=www.expedia.com").format(source, destination, date)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    for _attempt in range(5):
        try:
            # NOTE(review): verify=False disables TLS certificate
            # verification; keep only if an intercepting proxy requires it.
            response = requests.get(url, headers=headers, verify=False)
            parser = html.fromstring(response.text)
            json_data_xpath = parser.xpath("//script[@id='cachedResultsJson']//text()")
            if not json_data_xpath:
                # Raise explicitly instead of json.loads('') so the retry
                # loop sees a clear, specific failure.
                raise ValueError("cachedResultsJson script tag not found")
            raw_json = json.loads(json_data_xpath[0])
            flight_data = json.loads(raw_json["content"])
            flights = []
            for leg in flight_data['legs'].values():
                # Default to 0.0 (not '') so a single leg without a price
                # cannot crash the whole parse in the format call below.
                exact_price = leg.get('price', {}).get('totalPriceAsDecimal', 0.0) or 0.0
                departure_loc = leg.get('departureLocation', {})
                arrival_loc = leg.get('arrivalLocation', {})
                departure = "{0}, {1}".format(
                    departure_loc.get('airportLongName', ''),
                    departure_loc.get('airportCity', ''))
                arrival = "{0}, {1}".format(
                    arrival_loc.get('airportLongName', ''),
                    arrival_loc.get('airportCity', ''))
                no_of_stops = leg.get("stops", "")
                # Expedia reports 0 stops for direct flights.
                if no_of_stops == 0:
                    stop = "Nonstop"
                else:
                    stop = str(no_of_stops) + ' Stop'
                duration = leg.get('duration', {})
                total_flight_duration = "{0} days {1} hours {2} minutes".format(
                    duration.get('numOfDays', ''),
                    duration.get('hours', ''),
                    duration.get('minutes', ''))
                timeline_entries = leg.get('timeline', [])
                # Guard against an empty timeline; the original indexed [0]
                # unconditionally, raising an uncaught IndexError.
                carrier = timeline_entries[0].get('carrier', {}) if timeline_entries else {}
                airline_name = leg.get('carrierSummary', {}).get('airlineName', '')
                if not airline_name:
                    # Fall back to the operating carrier when the summary
                    # omits the airline name.
                    airline_name = carrier.get('operatedBy', '')
                timings = []
                for timeline in timeline_entries:
                    # Only departure segments carry the airport/time pairs
                    # we report; layover entries are skipped.
                    if 'departureAirport' in timeline:
                        timings.append({
                            'departure_airport': timeline['departureAirport'].get('longName', ''),
                            'departure_time': timeline['departureTime'].get('time', ''),
                            'arrival_airport': timeline.get('arrivalAirport', {}).get('longName', ''),
                            'arrival_time': timeline.get('arrivalTime', {}).get('time', ''),
                        })
                flights.append({
                    'stops': stop,
                    'ticket price': "{0:.2f}".format(exact_price),
                    'departure': departure,
                    'arrival': arrival,
                    'flight duration': total_flight_duration,
                    'airline': airline_name,
                    'plane': carrier.get('plane', ''),
                    'timings': timings,
                    'plane code': carrier.get('planeCode', ''),
                })
            # Sort numerically: the original sorted the formatted string,
            # which orders lexicographically ("1000.00" < "203.00").
            return sorted(flights, key=lambda f: float(f['ticket price']))
        except ValueError:
            print("Retrying...")
    # The original returned this error dict from inside the loop, which
    # made the 5-attempt retry unreachable after the first failure.
    return {"error": "failed to process the page"}
if __name__ == "__main__":
    # Command-line entry point: scrape one route/date and dump the
    # results to a JSON file named after the route.
    cli = argparse.ArgumentParser()
    cli.add_argument('source', help='Source airport code')
    cli.add_argument('destination', help='Destination airport code')
    cli.add_argument('date', help='MM/DD/YYYY')
    args = cli.parse_args()
    print("Fetching flight details")
    scraped_data = parse(args.source, args.destination, args.date)
    print("Writing data to output file")
    output_name = '%s-%s-flight-results.json' % (args.source, args.destination)
    with open(output_name, 'w') as fp:
        json.dump(scraped_data, fp, indent=4)
Hi, thank you for your code. I just found that the flights scraped from the XML are different from the actual website — for example: PVG-LAX on 05/01/2019. There are 7 nonstop flights on the web but I can only get the first 2. Do you have any idea how this happened?
@pisaller - make sure that you're comparing "apples to apples", i.e. the website by default is using following values while performing search:
passengers=adults:1,children:0,seniors:0,infantinlap:Y
options=cabinclass:economy,nopenalty:N
while the script is using:
passengers=adults:1,children:0,seniors:0,infantinlap:Y
options=cabinclass:economy
Hi,
The code works well, but doesn't scrape data for all the flights that the website shows. For instance, if I run the following command:
python expedia.py nyc mia 11/08/2019
the last result the scraper returns is a Delta flight (worth $203 at the time of writing). The script generates the following URL based on the inputs:
As you can see, the website has twice the number of flights returned by the script. Am I missing something here? I would like to get data for all flights on a particular route.
Thanks for your help.
I made a go version of this here https://github.com/4d55397500/hack-flight-search
It does not work anymore; the initial search URL returns no JSON data somehow.
Same here too. No json data returned. Is there any workaround?
Hi,
I am getting these errors.. is it because I am running Python 3.6.3?