Last active
December 16, 2015 10:49
-
-
Save XiaoyuShan/5422669 to your computer and use it in GitHub Desktop.
These are the different methods run from Main.py to get the right price by finding the closest XPath distance.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Currency symbols recognised at the start of a token, mapped to their codes.
# Written with escapes so the module needs no source-encoding declaration.
_SYMBOL_TO_CODE = {
    u'\u20ac': 'EUR',  # euro sign
    u'\xa5': 'JPY',    # yen sign
    u'\xa3': 'GBP',    # pound sign
    u'$': 'USD',
}
# Spelled-out names that should be reported as a code.
_CODE_ALIASES = {'Euro': 'EUR'}

# A token made only of digits, dots and commas (at least three characters).
_BARE_NUMBER = re.compile(r'^[0-9.,]{3,}$')
# One leading character (normally a currency symbol) followed by such a number.
_PREFIXED_NUMBER = re.compile(r'^.[0-9.,]{3,}$')
# A token that starts with one of the known currency codes/names.
_CODE_PREFIX = re.compile(r'^(EUR|Euro|RMB|GBP|JPY|AUD|USD)')


def currency_get(price):
    """Return the currency code found in the text of a price element.

    The text is split on whitespace and every token is inspected:

    * a bare number ("12.50") carries no currency and is skipped;
    * a number with one leading character ("$12.50") yields the code for
      that character when it is a known symbol;
    * a token starting with a known code or name ("USD", "Euro") yields
      that code, with "Euro" normalised to "EUR";
    * any other token starting with a known symbol ("$", "$5") yields the
      code for that symbol.

    When several tokens carry a currency, the last one wins.

    :param price: text of a price element (text or UTF-8 bytes), or None.
    :returns: the currency code, or "" when no currency is recognised.
    """
    currency = ""
    if not price:
        return currency
    if isinstance(price, bytes):
        # Work on text so that multi-byte symbols are a single character.
        price = price.decode('utf-8', 'replace')

    for token in price.split():
        if _BARE_NUMBER.search(token):
            continue
        if _PREFIXED_NUMBER.search(token):
            # Unknown leading characters leave the current result untouched.
            currency = _SYMBOL_TO_CODE.get(token[:1], currency)
            continue
        code = _CODE_PREFIX.search(token)
        if code:
            currency = _CODE_ALIASES.get(code.group(1), code.group(1))
        elif token[:1] in _SYMBOL_TO_CODE:
            currency = _SYMBOL_TO_CODE[token[:1]]
    return currency
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #from bottle import route, run | |
| import lxml.html as HTML | |
| import lxml.etree as etree | |
| import difflib | |
| from difflib import SequenceMatcher | |
| import urllib2 | |
| import cookielib | |
| from pprint import pprint | |
| import re | |
| from price_check import price_check | |
| from price_get import price_get | |
| from xpath_filter import xpath_filter | |
| from currency_get import currency_get | |
| #@route('/xpath/<weburl:path>,<imgurl:path>') | |
| #run (host='localhost',port = 8080, debug = True) | |
def _neighbour_xpaths(xpath):
    """Return the XPaths searched around *xpath*, nearest first.

    In order: the element's siblings, its parent, the parent's siblings and
    the grandparent.
    """
    parent = xpath[:xpath.rfind('/')]
    return [
        xpath[:xpath.rfind('/') + 1] + "*",
        parent,
        parent[:parent.rfind('/') + 1] + "*",
        parent[:parent.rfind('/')],
    ]


def _first_price(htree, xpaths):
    """Return the first price found in the text of elements under *xpaths*.

    Returns None when no element text yields a price.
    """
    for xpath in xpaths:
        if not xpath:
            # A shallow element has no grandparent path; nothing to query.
            continue
        for element in htree.xpath(xpath):
            found = price_get(element.text)
            if found not in ("No price", None):
                return found
    return None


def get_price(weburl):
    """Fetch *weburl* and collect candidate prices with their XPaths.

    Elements whose text matches one of the patterns from ``xpath_filter()``
    are located; only the first pattern that matches anything is used.  For
    each such element outside a ``script`` path, the currency is read from
    its text.  When the text is only a currency marker (``price_check``),
    the amount is looked up in the neighbouring elements, nearest first;
    otherwise it is read from the text itself.

    NOTE(review): the block nesting was reconstructed from a listing that
    had lost its indentation -- confirm against the original file.

    :param weburl: URL of the page to scan.
    :returns: a list of ``[currency, price, xpath]`` lists; currency and
        price are "" when nothing was recognised.
    """
    opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    response = opener.open(weburl)
    try:
        page = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    htree = etree.ElementTree(HTML.fromstring(page))

    # Search for elements whose text contains a currency marker.
    keyresults = []
    for key in xpath_filter():
        key_locations = htree.xpath(
            "//*[re:match(text(),'" + key + "')]",
            namespaces={"re": "http://exslt.org/regular-expressions"})
        keyresults.extend(htree.getpath(element) for element in key_locations)
        if keyresults:
            break

    # Get price and currency for every candidate element.
    final_list = []
    for truexpath in keyresults:
        if 'script' in truexpath:
            continue
        final_currency = ""
        final_price = ""
        for element in htree.xpath(truexpath):
            text = element.text
            if text is None:
                continue
            final_currency = currency_get(text)
            if price_check(text):
                # Only a currency marker here: take the nearest amount.
                # Stopping at the first hit keeps a farther neighbour from
                # overwriting a closer one.
                nearby = _first_price(htree, _neighbour_xpaths(truexpath))
                if nearby is not None:
                    final_price = nearby
            else:
                found = price_get(text)
                if found in ("No price", None):
                    break
                final_price = found
        final_list.append([final_currency, str(final_price), truexpath])
    return final_list
| ''' | |
| test cases: | |
| url="http://www.oipolloi.com/apc-cable-knit-pullover-off-white" | |
| url="http://www.urbanoutfitters.com/urban/catalog/productdetail.jsp?id=26503334&parentid=M_APP_SHORTSSWIM_SHORTS" | |
| url = "http://www.urbanexcess.com/c-1190-sweatshirts.aspx" | |
| url="http://www.freepeople.com/whats-new-intimates/" | |
| url = "http://www.manufactum.com/sweaters-c193633/" | |
| url = "http://www.etsy.com" | |
| url ="http://www.urbanoutfitters.com/urban/catalog/category.jsp?id=BRANDS&brand=rothco" | |
| url="http://www.zara.com/webapp/wcs/stores/servlet/category/us/en/zara-nam-S2013/358056/Trousers" | |
| url="http://www.zara.com/webapp/wcs/stores/servlet/product/us/en/zara-nam-S2013/358080/1232023/JACKET+WITH+PATCHES" | |
| url="http://www.etsy.com/listing/106835119/micro-stoneware-bowl-white-fine-bone?ref=fp_treasury_1" | |
| get_price(url) | |
| ''' | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys,re,urllib2,cookielib,urllib | |
| from lxml import etree | |
| import lxml.html as HTML | |
| import lxml.etree as etree | |
| import difflib | |
| from pprint import pprint | |
| from difflib import SequenceMatcher | |
| import re | |
| # get image xpath | |
def xpath_imgrequest(imgurl, weburl):
    """Return the XPath of the ``<img>`` on *weburl* that shows *imgurl*.

    The page is fetched and each candidate fragment from
    ``xpath_search(imgurl, weburl)`` is tried in order, first against the
    ``src`` attribute of every image and then, if nothing matched, against
    ``data-original``.  The first candidate that matches at least one image
    decides the result.

    :param imgurl: URL (or URL fragment) of the image to locate.
    :param weburl: URL of the page that contains the image.
    :returns: the XPath of the first matching ``<img>``, or None when no
        candidate matches.
    """
    opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    response = opener.open(weburl)
    try:
        page = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    htree = etree.ElementTree(HTML.fromstring(page))

    candidates = xpath_search(imgurl, weburl)
    for attribute in ('src', 'data-original'):
        # The fragment is passed as an XPath variable so that quotes inside
        # the URL cannot break the expression.
        query = '//img[contains(@%s, $fragment)]' % attribute
        for candidate in candidates:
            matches = htree.xpath(query, fragment=candidate)
            if matches:
                return htree.getpath(matches[0])
    return None
def xpath_search(imgurl, weburl):
    """Build the fragments of *imgurl* to look for in image attributes.

    Pages reference the same image in different forms (absolute, relative,
    scheme-less), so several variants are produced, most specific first:

    1. the image URL itself;
    2. the part after the first ".com";
    3. the directory part, up to and including the last "/";
    4. the URL without its "http:" / "https:" scheme;
    5. the URL from one character before the point where it stops sharing
       a prefix with *weburl*.

    A variant is left out when it does not apply (marker not found, nothing
    shared), when it is empty, or when it repeats an earlier variant.

    :param imgurl: URL (or URL fragment) of the image.
    :param weburl: URL of the page the image appears on.
    :returns: a list of candidate strings; *imgurl* is always the first.
    """
    derived = []

    dot_com = imgurl.find('.com')
    if dot_com != -1:
        derived.append(imgurl[dot_com + len('.com'):])

    # rfind() gives -1 without a slash, which makes this slice empty.
    derived.append(imgurl[:imgurl.rfind('/') + 1])

    for scheme in ('http:', 'https:'):
        if imgurl.startswith(scheme):
            derived.append(imgurl[len(scheme):])
            break

    shared = 0
    for img_char, web_char in zip(imgurl, weburl):
        if img_char != web_char:
            break
        shared += 1
    if shared:
        # Step back one character to keep the separator before the tail.
        derived.append(imgurl[shared - 1:])

    # An empty fragment would match every image, so drop it; the image URL
    # itself is always kept so the result is never empty.
    candidates = [imgurl]
    for candidate in derived:
        if candidate and candidate not in candidates:
            candidates.append(candidate)
    return candidates
# Sample page URLs (weburlN) and image URLs (imgurlN) for trying xpath_imgrequest by hand; only the assignments run.
| imgurl1="http://www.mrporter.com/images/products/329468/329468_mrp_in_m2.jpg" | |
| weburl1="http://www.mrporter.com/Shop/List/Paul_Smith_British_Classics?cm_sp=homepage-_-paulsmithm4-_-020413" | |
| weburl2="http://www.urbanexcess.com/c-1190-sweatshirts.aspx" | |
| imgurl2="http://www.urbanexcess.com/images/PRODUCT/icon/01526-320_1.jpg" | |
| imgurl3 = "http://cdn.photojojo.net/store/awesomeness/productImages/spring-break-camera-strap-8984_600.0000001362858205.jpg" | |
| weburl3 = "http://photojojo.com/store/awesomeness/spring-break-camera-strap/" | |
| weburl4 = "http://www.oipolloi.com/apc-cable-knit-pullover-off-white" | |
| imgurl4 = "http://www.oipolloi.com/cache/images/gallery/36236-1--450-auto.jpg" | |
| weburl5 = "http://unionmadegoods.com/SPRING_COLLECTIONS_486.html" | |
| # Unused path fragment, presumably a product page on the same site: /Universal_Works_Rose_Print_Button_Down_in_Shirt_in_Orange_8749.html?reframed=1 | |
| imgurl5 = "cdn-fsg/unionmade/Images/Products/Rose_Print_Button_Down_in_Shirt_in_Orange_0.jpg" | |
| # Next pair: presumably for the candidate cut where imgurl first differs from weburl (see xpath_search) -- TODO confirm | |
| weburl6 = "https://canoeonline.net/shop/category/furniture" | |
| imgurl6 = "/img/products/thumbnails/882.jpg" | |
| # Next pair: presumably for the directory candidate built with rfind('/'); imgurl7 ends in "/" -- TODO confirm | |
| weburl7 = "http://www.lagarconne.com/store/category.htm?sid=24" | |
| imgurl7 = "http://www.lagarconne.com/data/item/19179/imgalt/" | |
| # Next pair: presumably for the candidate with "http:" removed; imgurl8 is scheme-less -- TODO confirm | |
| weburl8 = "http://www.stevenalan.com/womens-clothing-tops-and-blouses/womens-clothing-tops-and-blouses,default,sc.html" | |
| imgurl8 = "//s7d9.scene7.com/is/image/StevenAlan/S13_3_WST0193_B113_PD?$redesigngrid$" | |
| # Next pair: presumably for the @data-original attribute lookup in xpath_imgrequest -- TODO confirm | |
| weburl9 = "http://www.anthropologie.com/anthro/category/lounge+%26+intimates/clothes-loungewear.jsp" | |
| imgurl9 = "http://images.anthropologie.com/is/image/Anthropologie/26250498_066_b?$$RD2012_category_item$$" | |
| # Next pairs: presumably for the candidate taken after ".com" -- TODO confirm | |
| weburl10 = "http://www.urbanexcess.com/c-1176-jeans-chinos.aspx" | |
| imgurl10 = "/images/PRODUCT/icon/8412622_1.jpg" | |
| weburl11 = "http://www.sixpack.fr/shop/" | |
| imgurl11 = "http://www.sixpack.fr/shop/img/p/1680-3591-medium.jpg" | |
| # Next pairs: labelled "unknown" by the author; which candidate (if any) applies is not recorded | |
| weburl12 = "http://www.supremenewyork.com/previews/springsummer2013/top-sweaters/s-s-checkered-henley" | |
| imgurl12 = "http://d2flb1n945r21v.cloudfront.net/production/uploaded/preview/60956/0-KN5_yellow.jpg-zoom_1361187988.jpg" | |
| weburl13 = "http://needsupply.com/mens/brands/billykirk" | |
| imgurl13 = "http://cdn.needsupply.com/media/catalog/product/cache/1/small_image/220x282/e9607dc71bc010050ca2ae6f644b84c1/1/0/1001176_1.jpg" | |
| # Disabled example call: imgxpath = xpath_imgrequest(imgurl1,weburl1) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from get_price_test import get_price | |
| from imagexpath_get import xpath_imgrequest | |
| from xpath_compare import xpath_compare | |
| from get_price_test import price_get | |
| from get_price_test import currency_get | |
weburl2 = "http://www.manufactum.com/sweaters-c193633/"
imgurl2 = "http://images.manufactum.de/manufactum/thumbs_188/84498_1.jpg"


def main_process(weburl, imgurl):
    """Print the price, currency and XPath of the prices nearest an image.

    Candidate prices come from ``get_price(weburl)`` and the image position
    from ``xpath_imgrequest(imgurl, weburl)``.  ``xpath_compare`` selects
    which price XPaths to report; for each of them the price, the currency
    and the XPath are printed on separate lines.

    :param weburl: URL of the page to scan.
    :param imgurl: URL of the product image on that page.
    """
    price_by_xpath = {}
    currency_by_xpath = {}
    price_xpaths = []
    # Each entry is indexed as [currency, price, xpath].
    for entry in get_price(weburl):
        price_xpath = entry[2]
        price_by_xpath[price_xpath] = entry[1]
        currency_by_xpath[price_xpath] = entry[0]
        price_xpaths.append(price_xpath)

    image_xpath = xpath_imgrequest(imgurl, weburl)
    for xpath in xpath_compare(image_xpath, price_xpaths):
        print(price_by_xpath[xpath])
        print(currency_by_xpath[xpath])
        print(xpath)


main_process(weburl2, imgurl2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Currency symbols: euro, yen, pound and dollar.  Written with escapes so the
# module needs no source-encoding declaration and no unicode() conversion.
_SYMBOL = u"[\u20ac\xa5\xa3$]"
_CODES = u"(EUR|Euro|RMB|GBP|JPY|CAD|AUD|USD)"

# Texts that name a currency but carry no amount.
_MARKER_ONLY = (
    re.compile(u"^" + _SYMBOL + u"$"),                  # symbol alone
    re.compile(u"^" + _CODES + u"$"),                   # code alone
    re.compile(u"^" + _SYMBOL + u"." + _CODES + u"$"),  # symbol, one char, code
)


def price_check(price):
    """Tell whether *price* is only a currency marker, without an amount.

    The text counts as a bare marker when it is exactly a currency symbol
    ("$"), exactly a currency code ("USD"), or a symbol, any single
    character and a code ("$ USD").  A True result means the amount has to
    be looked for in another element.

    :param price: text of a candidate price element (text or UTF-8 bytes),
        or None.
    :returns: True for a bare currency marker, False otherwise (including
        None and empty text).
    """
    if price is None:
        return False
    if isinstance(price, bytes):
        # Work on text so that multi-byte symbols are a single character.
        price = price.decode('utf-8', 'replace')
    return any(pattern.search(price) for pattern in _MARKER_ONLY)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# A token made only of digits, dots and commas (at least three characters).
_BARE_AMOUNT = re.compile(r'^[0-9.,]{3,}$')
# One leading character (normally a currency symbol) followed by such a token.
_PREFIXED_AMOUNT = re.compile(r'^.[0-9.,]{3,}$')
_DIGIT = re.compile(r'[0-9]')


def price_get(pri):
    """Extract the amount from the text of a price element.

    The text is split on whitespace.  A token counts as an amount when it
    consists of at least three digits, dots or commas, optionally preceded
    by a single character that is dropped ("$12.50" gives "12.50").  Tokens
    without any digit, such as "...", are ignored.  When several tokens
    qualify, the last one wins.

    :param pri: text of a price element, or None.
    :returns: the amount as a string, or "No price" when none is found.
    """
    price = "No price"
    if pri is None:
        return price
    for token in pri.split():
        if _BARE_AMOUNT.search(token):
            amount = token
        elif _PREFIXED_AMOUNT.search(token):
            amount = token[1:]
        else:
            continue
        # Punctuation alone is not a price.
        if _DIGIT.search(amount):
            price = amount
    return price
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
price_get.py
This file contains one function price_get.
price_get(): The input is a list of prices. An "if" statement splits the input prices.
A "for" loop then checks three different patterns. It checks the text around the price itself to determine the candidate format of the price. The variable names confused me; also, there should be some explanation for each "re" statement.
It doesn't handle the "else" condition, so it is hard to know the reason for a null output.
price_check.py
This file contains one function price_check.
price_check(): The input is a list of candidate prices. An "if" chain goes through all the "re" statements.
There should be a basic explanation for each "re" statement.
xpath_compare.py
This file contains one function xpath_compare.
xpath_compare(): The inputs are the image XPath (a string) and a list of all price/status XPaths; the output is the list of price/status XPaths with the closest XPath distance.
Neither of the two "if" statements in the function has an "else" branch, so a null output cannot be traced. The two "for" loops have a similar structure and could be merged into one loop to improve performance.
xpath_filter.py
This file contains one function xpath_filter.
It parses candidate prices to determine which should be real prices.
currency_get.py
This file contains one function currency_get.
currency_get(): The input is a string. The output is the currency symbol of the string.
It doesn't handle the "else" condition, so it is hard to know the reason for a null output.