Skip to content

Instantly share code, notes, and snippets.

@XiaoyuShan
Last active December 16, 2015 10:49
Show Gist options
  • Select an option

  • Save XiaoyuShan/5422669 to your computer and use it in GitHub Desktop.

Select an option

Save XiaoyuShan/5422669 to your computer and use it in GitHub Desktop.
These are the helper functions that Main.py runs to find the right price: the price whose XPath is closest to the XPath of the product image.
import re
def currency_get(price):
    """Return the currency code found in a price string.

    The text is split on whitespace and every token is inspected:

    * a bare number (``12.99``) carries no currency information;
    * a number with a one-character prefix (``$12.99``) is looked up by
      that prefix;
    * a token starting with a known code or name (``USD``, ``Euro``)
      yields that code, normalised through the alias table;
    * a lone currency symbol (``$``) is looked up directly.

    When several tokens carry a currency, the last one wins.

    Args:
        price: Text taken from a page element, e.g. ``"12.99 USD"``.

    Returns:
        A currency code such as ``"USD"``, or ``""`` when nothing is
        recognised (including ``None`` or empty input).
    """
    # Unicode escapes keep the table independent of the source encoding.
    symbol_map = {
        u'\u20ac': 'EUR',
        u'\xa5': 'JPY',
        u'\xa3': 'GBP',
        u'$': 'USD',
    }
    # Spelled-out names that should be reported as their code.
    code_aliases = {'Euro': 'EUR'}

    currency = ""
    if not price:
        return currency

    for token in price.split():
        # Plain amount: nothing to learn about the currency.
        if re.search(r'^[0-9.,]{3,}$', token):
            continue

        # One leading character followed by an amount, e.g. "$12.99".
        if re.search(r'^.[0-9.,]{3,}$', token):
            currency = symbol_map.get(token[:1], currency)
            continue

        # Token starting with a currency code; report only the code itself,
        # not whatever trails it (e.g. "USD12.99" -> "USD").
        code = re.search(r'^(EUR|Euro|RMB|GBP|JPY|CAD|AUD|USD)', token)
        if code is not None:
            currency = code_aliases.get(code.group(1), code.group(1))
            continue

        # Stand-alone currency symbol.
        if token in symbol_map:
            currency = symbol_map[token]

    return currency
#from bottle import route, run
import lxml.html as HTML
import lxml.etree as etree
import difflib
from difflib import SequenceMatcher
import urllib2
import cookielib
from pprint import pprint
import re
from price_check import price_check
from price_get import price_get
from xpath_filter import xpath_filter
from currency_get import currency_get
#@route('/xpath/<weburl:path>,<imgurl:path>')
#run (host='localhost',port = 8080, debug = True)
def _first_price_near(htree, xpaths):
    """Return the first price found among the elements selected by *xpaths*.

    The XPaths are evaluated in the given order and the search stops at the
    first element whose text yields a price; ``None`` means nothing matched.
    """
    for xpath in xpaths:
        for element in htree.xpath(xpath):
            price = price_get(element.text)
            if price is not None and price != "No price":
                return price
    return None


def get_price(weburl):
    """Collect a price and currency guess for each currency marker on a page.

    The page at *weburl* is downloaded and parsed; the patterns returned by
    ``xpath_filter()`` are tried in order until one of them matches the text
    of at least one element.  For every matched element outside a
    ``script`` path the element's own text is inspected first.  When that
    text is only a currency marker (``price_check`` is true), the amount is
    searched for among the element's siblings, its parent, the parent's
    siblings and the grandparent, nearest first.

    Args:
        weburl: URL of the page to scan.

    Returns:
        A list of ``[currency, price, xpath]`` lists, one per matched
        element.  ``currency`` and ``price`` are ``""`` when nothing was
        recognised.
    """
    from contextlib import closing

    opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    # closing() releases the response even if read() raises.
    with closing(opener.open(weburl)) as response:
        page = response.read()
    htree = etree.ElementTree(HTML.fromstring(page))

    # Locate elements whose text matches a currency pattern; stop at the
    # first pattern that yields anything.
    exslt = {"re": "http://exslt.org/regular-expressions"}
    key_xpaths = []
    for key in xpath_filter():
        # The pattern is passed as an XPath variable so that a quote inside
        # it cannot break the expression.
        matches = htree.xpath("//*[re:match(text(), $pattern)]",
                              pattern=key, namespaces=exslt)
        key_xpaths.extend(htree.getpath(node) for node in matches)
        if key_xpaths:
            break

    final_list = []
    for truexpath in key_xpaths:
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether matches inside <script> were dropped or reported with an
        # empty price; they are dropped here -- confirm with the author.
        if 'script' in truexpath:
            continue

        parent = truexpath[:truexpath.rfind('/')]
        grandparent = parent[:parent.rfind('/')]
        # Nearest neighbours first; empty paths cannot be evaluated.
        nearby = [path for path in (parent + "/*", parent,
                                    grandparent + "/*", grandparent) if path]

        final_currency = ""
        final_price = ""
        for element in htree.xpath(truexpath):
            text = element.text
            if text is None:
                continue
            final_currency = currency_get(text)
            if price_check(text):
                # The element holds only the currency marker, so the amount
                # must live in a neighbouring element.
                found = _first_price_near(htree, nearby)
                if found is not None:
                    final_price = found
            else:
                own_price = price_get(text)
                if own_price is None or own_price == "No price":
                    break
                final_price = own_price

        final_list.append([final_currency, str(final_price), truexpath])
    return final_list
# Disabled manual test cases: the URLs and the get_price(url) call below sit
# inside a string literal, so none of this runs.
'''
test cases:
url="http://www.oipolloi.com/apc-cable-knit-pullover-off-white"
url="http://www.urbanoutfitters.com/urban/catalog/productdetail.jsp?id=26503334&parentid=M_APP_SHORTSSWIM_SHORTS"
url = "http://www.urbanexcess.com/c-1190-sweatshirts.aspx"
url="http://www.freepeople.com/whats-new-intimates/"
url = "http://www.manufactum.com/sweaters-c193633/"
url = "http://www.etsy.com"
url ="http://www.urbanoutfitters.com/urban/catalog/category.jsp?id=BRANDS&brand=rothco"
url="http://www.zara.com/webapp/wcs/stores/servlet/category/us/en/zara-nam-S2013/358056/Trousers"
url="http://www.zara.com/webapp/wcs/stores/servlet/product/us/en/zara-nam-S2013/358080/1232023/JACKET+WITH+PATCHES"
url="http://www.etsy.com/listing/106835119/micro-stoneware-bowl-white-fine-bone?ref=fp_treasury_1"
get_price(url)
'''
import sys,re,urllib2,cookielib,urllib
from lxml import etree
import lxml.html as HTML
import lxml.etree as etree
import difflib
from pprint import pprint
from difflib import SequenceMatcher
import re
# get image xpath
def xpath_imgrequest(imgurl, weburl):
    """Return the XPath of the ``<img>`` on *weburl* that shows *imgurl*.

    The page is downloaded and parsed, then every candidate fragment from
    ``xpath_search(imgurl, weburl)`` is compared against the ``src``
    attribute of the page's images.  If no ``src`` contains any fragment,
    the same fragments are tried against ``data-original``.

    Args:
        imgurl: URL (or URL fragment) of the image to locate.
        weburl: URL of the page expected to contain the image.

    Returns:
        The lxml path of the first matching ``<img>`` element, or ``None``
        when no image matches.
    """
    from contextlib import closing

    opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    # closing() releases the response even if read() raises.
    with closing(opener.open(weburl)) as response:
        page = response.read()
    htree = etree.ElementTree(HTML.fromstring(page))

    fragments = xpath_search(imgurl, weburl)
    for attribute in ('src', 'data-original'):
        for fragment in fragments:
            # contains(x, '') is true for every node, so an empty fragment
            # would "match" the first image on the page.
            if not fragment:
                continue
            # The fragment is passed as an XPath variable so that quotes in
            # the URL cannot break the expression.
            images = htree.xpath(
                '//img[contains(@%s, $fragment)]' % attribute,
                fragment=fragment)
            if images:
                return htree.getpath(images[0])
    return None
def xpath_search(imgurl, weburl):
    """Build the list of URL fragments used to look an image up on a page.

    Pages reference the same image in different ways (absolute, relative,
    scheme-relative), so several variants of *imgurl* are produced, most
    specific first:

    1. the URL as given;
    2. everything after the first ``.com``;
    3. everything up to and including the last ``/``;
    4. everything after the first ``http:``;
    5. the part that follows the prefix shared with *weburl*, keeping the
       one character before the first difference.

    A variant is left out when its marker is absent from *imgurl*, when it
    would be empty, or when it repeats an earlier variant.

    Args:
        imgurl: URL (or URL fragment) of the image.
        weburl: URL of the page the image is expected on.

    Returns:
        A list of non-empty, distinct candidate strings.
    """
    candidates = [imgurl]

    # str.find() returns -1 when the marker is missing; slicing with that
    # value would produce an arbitrary tail of the URL.
    com_at = imgurl.find('.com')
    if com_at != -1:
        candidates.append(imgurl[com_at + 4:])

    candidates.append(imgurl[:imgurl.rfind('/') + 1])

    scheme_at = imgurl.find('http:')
    if scheme_at != -1:
        candidates.append(imgurl[scheme_at + 5:])

    # Length of the prefix that imgurl and weburl have in common.
    shared = 0
    for img_char, web_char in zip(imgurl, weburl):
        if img_char != web_char:
            break
        shared += 1
    if 0 < shared < len(imgurl):
        candidates.append(imgurl[shared - 1:])

    # Drop empty and repeated fragments while keeping the order.
    unique = []
    for candidate in candidates:
        if candidate and candidate not in unique:
            unique.append(candidate)
    return unique
# Sample (page URL, image URL) pairs for trying xpath_imgrequest by hand.
# The short notes between pairs appear to name the lookup variant that the
# pair following each note is meant to exercise -- TODO confirm with author.
imgurl1="http://www.mrporter.com/images/products/329468/329468_mrp_in_m2.jpg"
weburl1="http://www.mrporter.com/Shop/List/Paul_Smith_British_Classics?cm_sp=homepage-_-paulsmithm4-_-020413"
weburl2="http://www.urbanexcess.com/c-1190-sweatshirts.aspx"
imgurl2="http://www.urbanexcess.com/images/PRODUCT/icon/01526-320_1.jpg"
imgurl3 = "http://cdn.photojojo.net/store/awesomeness/productImages/spring-break-camera-strap-8984_600.0000001362858205.jpg"
weburl3 = "http://photojojo.com/store/awesomeness/spring-break-camera-strap/"
weburl4 = "http://www.oipolloi.com/apc-cable-knit-pullover-off-white"
imgurl4 = "http://www.oipolloi.com/cache/images/gallery/36236-1--450-auto.jpg"
weburl5 = "http://unionmadegoods.com/SPRING_COLLECTIONS_486.html"
#/Universal_Works_Rose_Print_Button_Down_in_Shirt_in_Orange_8749.html?reframed=1
imgurl5 = "cdn-fsg/unionmade/Images/Products/Rose_Print_Button_Down_in_Shirt_in_Orange_0.jpg"
# Variant: cut imgurl at the first position where it differs from weburl.
weburl6 = "https://canoeonline.net/shop/category/furniture"
imgurl6 = "/img/products/thumbnails/882.jpg"
# Variant: imgurl truncated after its last '/' (rfind('/')).
weburl7 = "http://www.lagarconne.com/store/category.htm?sid=24"
imgurl7 = "http://www.lagarconne.com/data/item/19179/imgalt/"
# Variant: imgurl with the leading "http:" removed.
weburl8 = "http://www.stevenalan.com/womens-clothing-tops-and-blouses/womens-clothing-tops-and-blouses,default,sc.html"
imgurl8 = "//s7d9.scene7.com/is/image/StevenAlan/S13_3_WST0193_B113_PD?$redesigngrid$"
# Variant: image referenced through the data-original attribute, not src.
weburl9 = "http://www.anthropologie.com/anthro/category/lounge+%26+intimates/clothes-loungewear.jsp"
imgurl9 = "http://images.anthropologie.com/is/image/Anthropologie/26250498_066_b?$$RD2012_category_item$$"
# Variant: imgurl with everything up to and including ".com" removed.
weburl10 = "http://www.urbanexcess.com/c-1176-jeans-chinos.aspx"
imgurl10 = "/images/PRODUCT/icon/8412622_1.jpg"
weburl11 = "http://www.sixpack.fr/shop/"
imgurl11 = "http://www.sixpack.fr/shop/img/p/1680-3591-medium.jpg"
# "unknown" in the original notes -- presumably no variant was identified
# for the pairs below.
weburl12 = "http://www.supremenewyork.com/previews/springsummer2013/top-sweaters/s-s-checkered-henley"
imgurl12 = "http://d2flb1n945r21v.cloudfront.net/production/uploaded/preview/60956/0-KN5_yellow.jpg-zoom_1361187988.jpg"
weburl13 = "http://needsupply.com/mens/brands/billykirk"
imgurl13 = "http://cdn.needsupply.com/media/catalog/product/cache/1/small_image/220x282/e9607dc71bc010050ca2ae6f644b84c1/1/0/1001176_1.jpg"
# Disabled example call:
#imgxpath = xpath_imgrequest(imgurl1,weburl1)
from get_price_test import get_price
from imagexpath_get import xpath_imgrequest
from xpath_compare import xpath_compare
from get_price_test import price_get
from get_price_test import currency_get
# Sample page URL and image URL handed to main_process at the end of this
# script.
weburl2 = "http://www.manufactum.com/sweaters-c193633/"
imgurl2 = "http://images.manufactum.de/manufactum/thumbs_188/84498_1.jpg"
def main_process(weburl, imgurl):
    """Print the price, currency and XPath closest to an image on a page.

    Price candidates are collected with ``get_price(weburl)``, the image is
    located with ``xpath_imgrequest(imgurl, weburl)``, and
    ``xpath_compare`` selects the candidate XPaths closest to the image.
    For each selected XPath the price, the currency and the XPath itself
    are printed on separate lines.

    Nothing is printed when the page yields no price candidates or the
    image cannot be found.

    Args:
        weburl: URL of the page to analyse.
        imgurl: URL (or URL fragment) of the product image on that page.
    """
    prices = {}
    currencies = {}
    price_xpaths = []
    # Each result is [currency, price, xpath].
    for result in get_price(weburl):
        xpath = result[2]
        prices[xpath] = result[1]
        currencies[xpath] = result[0]
        price_xpaths.append(xpath)
    if not price_xpaths:
        return

    imgxpath = xpath_imgrequest(imgurl, weburl)
    # xpath_imgrequest returns None when no <img> matches; there is nothing
    # to measure distances against in that case.
    if imgxpath is None:
        return

    for xpath in xpath_compare(imgxpath, price_xpaths):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(prices[xpath])
        print(currencies[xpath])
        print(xpath)
# Top-level call: runs the lookup for the sample page/image pair whenever
# this module is executed or imported.
main_process(weburl2, imgurl2)
import re
def price_check(price):
    """Tell whether *price* is only a currency marker with no amount.

    The text (ignoring surrounding whitespace) counts as a bare marker when
    it is exactly one of:

    * a single currency symbol, e.g. ``$``;
    * a currency code or name, e.g. ``USD`` or ``Euro``;
    * a symbol, any one character, then a code, e.g. ``$ USD``.

    Args:
        price: Text of a page element, or ``None``.

    Returns:
        ``True`` when the text is a bare currency marker, so the amount has
        to be looked for elsewhere; ``False`` otherwise, including for
        ``None`` and empty input.
    """
    if not price:
        return False

    # Unicode escapes keep the pattern independent of the source encoding.
    # The class lists only the symbols themselves: a comma inside [...] is a
    # literal comma and would make "," count as a currency symbol.
    symbol = u"[\u20ac\xa5\xa3$]"
    code = u"(?:EUR|Euro|RMB|GBP|JPY|CAD|AUD|USD)"
    marker_only = u"^(?:%s|%s|%s.%s)$" % (symbol, code, symbol, code)

    return re.search(marker_only, price.strip()) is not None
import re
def price_get(pri):
    """Extract the amount from a price string.

    The text is split on whitespace.  A token is taken as the amount when
    it consists of at least three digits, dots or commas (``12.99``), or
    when it is such a run preceded by exactly one other character
    (``$12.99``), in which case that character is dropped.  A token must
    contain at least one digit, so punctuation such as ``...`` is ignored.
    When several tokens qualify, the last one wins.

    Args:
        pri: Text of a page element, or ``None``.

    Returns:
        The amount as a string, or ``"No price"`` when no token qualifies
        (including ``None`` or empty input).
    """
    # NOTE(review): on Python 2 a byte string holding a multi-byte symbol
    # (e.g. UTF-8 "€") has more than one prefix byte and will not match the
    # one-character-prefix form -- presumably callers pass unicode text.
    price = "No price"
    if not pri:
        return price

    for token in pri.split():
        if re.search(r'^[0-9.,]{3,}$', token):
            candidate = token
        elif re.search(r'^.[0-9.,]{3,}$', token):
            candidate = token[1:]
        else:
            continue
        if any(char.isdigit() for char in candidate):
            price = candidate
    return price
@qz267
Copy link

qz267 commented Apr 20, 2013

price_get.py
This file contains one function price_get.
price_get(): The input is a string containing candidate prices. An if statement guards the splitting of the input into tokens.
A for loop then checks each token against three regular expressions, looking at the price itself and the characters around it to determine the candidate format of the price. The variable names confused me, and the "re" patterns should come with some explanation.
It doesn't handle the "else" condition, so it is hard to tell why the output is null.

price_check.py
This file contains one function price_check.
price_check(): The input is a candidate price string. An if/elif chain goes through the results of all the "re" statements.
Each "re" statement should have a basic explanation.

xpath_compare.py
This file contains one function xpath_compare.
xpath_compare(): The inputs are the image XPath (a string) and the list of all price/status XPaths; the output is the list of price/status XPaths with the closest XPath distance.
Neither of the two if statements in the function has an else branch, so a null output cannot be traced. The two for loops have a similar structure and could be merged into one loop to improve performance.

xpath_filter.py
This file contains one function xpath_filter.
It parses candidate prices to determine which of them are real prices.

currency_get.py
This file contains one function currency_get.
currency_get(): The input is a string. The output is the currency symbol found in the string.
It doesn't handle the "else" condition, so it is hard to tell why the output is null.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment