diff options
author | Ilya Zverev <ilya@zverev.info> | 2016-05-31 12:49:59 +0300 |
---|---|---|
committer | Ilya Zverev <ilya@zverev.info> | 2016-05-31 12:49:59 +0300 |
commit | 6022f55ab630637453fd391effcf0cad6f5ae6bb (patch) | |
tree | 8e8b3d4702e74f3616c81613592f7108f63fffc3 /tools | |
parent | fb768dcab81653470e0ca4dfde6c650bfa8c487e (diff) | |
parent | ff1fc714c7e4b47c224113158a591f3db2e46656 (diff) |
Merge pull request #3302 from syershov/MAPSME-1232
[booking] Process data from booking.com
Diffstat (limited to 'tools')
-rwxr-xr-x | tools/python/booking_hotels.py | 204 |
1 files changed, 204 insertions, 0 deletions
diff --git a/tools/python/booking_hotels.py b/tools/python/booking_hotels.py new file mode 100755 index 0000000000..a0ba6d3f94 --- /dev/null +++ b/tools/python/booking_hotels.py @@ -0,0 +1,204 @@ +#!/usr/bin/python +# coding: utf8 +from __future__ import print_function + +from collections import namedtuple, defaultdict +from datetime import datetime +import argparse +import base64 +import json +import logging +import os +import pickle +import time +import urllib2 + +# init logging +logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s') + +Hotel = namedtuple('Hotel', + ['id', 'lat', 'lon', 'name', 'address', + 'stars', 'priceCategory', 'ratingBooking', + 'ratingUser', 'descUrl']) + +class BookingApi: + def __init__(self, login, password): + self.login = login + self.password = password + self.baseConfig = { + "headers": { + "Content-Type": "application/json", + "Authorization": "Basic " + base64.encodestring( + "{login}:{password}".format(login=self.login, password=self.password)).replace('\n', '') + }, + "url": 'https://distribution-xml.booking.com/json/bookings'} + self.checkMinute = 0 + self.requestPerMinute = 0 + self.requestLimit = 15 # request per minute + + def call(self, function, params=None): + self.requestPerMinute += 1 + now = datetime.utcnow() + + if self.requestPerMinute >= self.requestLimit: + waittime = 60 - now.second + logging.warning("Limit for request per minute exceeded. Waiting for: {0} sec.".format(waittime)) + time.sleep(waittime) + now = datetime.utcnow() + + if self.checkMinute != now.minute: + self.requestPerMinute = 0 + self.checkMinute = now.minute + + payload = '' + try: + p = "" if not params else '?' + "&".join( + ["{key}={value}".format(key=k, value=v) for (k, v) in params.iteritems()]) + url = "{base}.{func}{params}".format(base=self.baseConfig["url"], func=function, params=p) + logging.debug("{0} {1} API call:{2}".format(self.checkMinute, self.requestPerMinute, url)) + request = urllib2.Request(url, None, self.baseConfig["headers"]) + stream = urllib2.urlopen(request) + payload = stream.read() + data = json.loads(payload) + if isinstance(data, dict) and 'ruid' in data: + logging.error('Api call failed with error: {0} Code: {1}'.format(data['message'], data['code'])) + return None + return data + + except Exception as e: + logging.error('Error: {0} Context: {1}'.format(e, payload)) + return None + + +def make_record(src, rate): + return Hotel( + unicode(src['hotel_id']), + unicode(src['location']['latitude']), + unicode(src['location']['longitude']), + unicode(src['name']), + unicode(src['address']), + unicode(src['class']), + unicode(rate), + unicode(src['ranking']), + unicode(src['review_score']), + unicode(src['url']) + ) + + +def download(user, password, path): + ''' + Downloads all hotels from booking.com and stores them in a bunch of .pkl files. + ''' + api = BookingApi(user, password) + + maxrows = 1000 + countries = api.call("getCountries", dict(languagecodes='en')) + for country in countries: + countrycode = country['countrycode'] + logging.info(u'Download[{0}]: {1}'.format(countrycode, country['name'])) + + allhotels = [] + while True: + hotels = api.call('getHotels', + dict(new_hotel_type=1, offset=len(allhotels), rows=maxrows, countrycodes=countrycode)) + + # Check for error. + if not hotels: + exit(1) + + allhotels.append(hotels) + + # If hotels in answer less then maxrows, we reach end of data. + if len(hotels) < maxrows: + break + + logging.info('Num of hotels: {0}'.format(len(allhotels))) + filename = os.path.join(path, + '{0} - {1}.pkl'.format(country['area'].encode('utf8'), country['name'].encode('utf8'))) + with open(filename, 'wb') as fd: + pickle.dump(allhotels, fd, pickle.HIGHEST_PROTOCOL) + + +def translate(source, output): + ''' + Reads *.pkl files and produces a single list of hotels as tab separated values. + ''' + files = [filename for filename in os.listdir(source) if filename.endswith('.pkl')] + + data = [] + for filename in files: + logging.info('Processing {0}'.format(filename)) + with open(filename, 'rb') as fd: + data += pickle.load(fd) + + # Dict of dicts city_id -> { currency -> [prices] } + cities = defaultdict(lambda: defaultdict(list)) + + def valid(hotel): + return 'city_id' in hotel and 'currencycode' in hotel and 'minrate' in hotel and hotel['minrate'] is not None + + # Collect prices + for hotel in data: + if valid(hotel): + cities[hotel['city_id']][hotel['currencycode']].append(float(hotel['minrate'])) + + # Replaces list of prices by a median price. + for city in cities: + for cur in cities[city]: + cities[city][cur] = sorted(cities[city][cur])[len(cities[city][cur]) / 2] + + # Price rate ranges, relative to the median price for a city + rates = (0.7, 1.3) + + with open(output, 'w') as fd: + for hotel in data: + rate = 0 + if valid(hotel): + avg = cities[hotel['city_id']][hotel['currencycode']] + price = float(hotel['minrate']) + rate = 1 + # Find a range that contains the price + while rate <= len(rates) and price > avg * rates[rate - 1]: + rate += 1 + cur = make_record(hotel, rate) + l = [e.encode('utf8') for e in cur] + print('\t'.join(l), file=fd) + + +def process_options(): + parser = argparse.ArgumentParser(description='Download and process booking hotels.') + parser.add_argument("-v", "--verbose", action="store_true", dest="verbose") + parser.add_argument("-q", "--quiet", action="store_false", dest="verbose") + + parser.add_argument("--password", dest="password", help="Booking.com account password") + parser.add_argument("--user", dest="user", help="Booking.com account user name") + + parser.add_argument("--path", dest="path", help="Path to data files") + parser.add_argument("--output", dest="output", help="Name and destination for output file") + + parser.add_argument("--download", action="store_true", dest="download", default=False) + parser.add_argument("--translate", action="store_true", dest="translate", default=False) + + options = parser.parse_args() + + if not options.download and not options.translate: + parser.print_help() + + if options.translate and not options.output: + print("--output isn't set") + parser.print_help() + exit() + + return options + + +def main(): + options = process_options() + if options.download: + download(options.user, options.password, options.path) + if options.translate: + translate(options.path, options.output) + + +if __name__ == "__main__": + main() |