Gist by @ysc8620, created May 8, 2013.
/.idea
*backup
*.bak
*tpl.php
/.*
*.zip
*.pyc
{"comment": ["\u4eba\u5230\u4e2d\u5e74\u7684\u7b80\u5357\u4fca\u662f\u4e2a\u4e0d\u592a\u6210\u529f\u7684\u751f\u610f\u4eba\uff0c\u62e5\u6709\u4e00\u4e2a\u5178\u578b\u7684\u4e2d\u4ea7\u9636\u7ea7\u5bb6\u5ead\uff0c\u4ed6\u4e0e\u59bb\u5b50\u548c\u4e24\u4e2a\u5b69\u5b50\uff0c\u4ee5\u53ca\u5cb3\u6bcd\u4f4f\u5728\u53f0\u5317\u4e00\u95f4\u666e\u901a\u516c\u5bd3\u623f\u5b50\u91cc\u3002\u59bb\u5b50\u662f\u4e00\u4e2a\u8106\u5f31\u7684\u5973\u4eba\uff0c\u56e0\u4e3a\u6bcd\u4eb2\u7684\u75c5\u800c\u5fc3\u529b\u4ea4\u7601\u3002\u5c0f\u513f\u5b50\u513f\u5b50\u53ea\u670910\u5c81\u5374\u975e\u5e38\u65e9\u719f\uff0c\u559c\u6b22\u62cd\u6444\u4eba\u7684\u80cc\u5f71\u548c\u63d0\u95ee\u54f2\u5b66\u95ee\u9898\u3002\u5927\u5973\u513f\u662f\u4e00\u4e2a\u97f3\u4e50\u5b66\u751f\uff0c\u56e0\u9677\u5165\u4e86\u9519\u8bef\u7684\u7231\u60c5\u800c\u5f00\u59cb\u5c1d\u5230\u4eba\u751f\u7684\u82e6\u6da9\u3002\u4e00\u6b21\u5728\u5c0f\u8205\u5b50\u7684\u5a5a\u793c\u4e0a\uff0c\u7b80\u5357\u4fca\u9047\u5230\u4e86\u5e74\u8f7b\u65f6\u7684\u5973\u53cb\uff0c\u91cd\u65b0\u71c3\u8d77\u4e86\u4e45\u8fdd\u7684\u7231\u60c5\u2026\u2026", "2000\u5e74\uff0c\u662f\u4e9a\u6d32\u7535\u5f71\u5927\u4e30\u6536\u7684\u4e00\u5e74\uff0c5\u6708\u4e3e\u884c\u7684\u621b\u7eb3\u7535\u5f71\u8282\u51e0\u4e4e\u6210\u4e86\u201c\u4e9a\u6d32\u7535\u5f71\u7684\u8282\u65e5\u201d\uff0c\u5728\u8fd9\u6b21\u7535\u5f71\u8282\u4e0a\uff0c\u300a\u82b1\u6837\u5e74\u534e\u300b\u83b7\u5f97\u4e86\u6700\u4f73\u5f71\u7247\u3001\u6700\u4f73\u7537\u4e3b\u89d2\u4e24\u9879\u5927\u5956\uff0c\u5bfc\u6f14\u738b\u5bb6\u536b\u5927\u51fa\u98ce\u5934\uff0c\u800c\u300a\u4e00\u4e00\u300b\u7684\u5bfc\u6f14\u6768\u5fb7\u660c\u751a\u81f3\u6bd4\u738b\u5bb6\u536b\u66f4\u52a0\u5f15\u4eba\u6ce8\u76ee\uff0c\u56e0\u4e3a\u4ed6\u83b7\u5f97\u4e86\u5c5e\u4e8e\u5bfc\u6f14\u7684\u6700\u9ad8\u8363\u8a89\u2014\u2014\u6700\u4f73\u5bfc\u6f14\u5956\u3002\u4f17\u591a\u7684\u89c2\u4f17\u4e3a\u4ed6\u7684\u8fd9\u90e8\u590d\u6742\u3001\u7ec6\u81f4\u800c\u4f18\u96c5\u7684\u5f71\u7247\u800c\u503e\u5012\uff0c\u5e76\u5bf9\u534e\u8bed\u7535\u5f71\u4ea7\u751f\u4e86\u6781\u5927\u5174\u8da3\u3002\u300a\u4e00\u4e00\u300b\u4e5f\u6210\u529f\u5730\u8fdb\u5165\u4e86\u7f8e\u56fd\u5e02\u573a\uff0c\u6210\u4e3a\u88ab\u7f8e\u56fd\u666e\u901a\u89c2\u4f17\u6240\u770b\u5230\u7684\u7b2c\u4e00\u90e8\u6768\u5fb7\u660c\u5bfc\u6f14\u7684\u5f71\u7247\uff0c\u6210\u4e3a\u4ed6\u7684\u7535\u5f71\u5927\u6b65\u8fc8\u8fdb\u66f4\u5e7f\u9614\u7684\u56fd\u9645\u5e02\u573a\u7684\u7b2c\u4e00\u6b65\u3002"], "title": "\u4e00\u4e00", "url": "http://www.ffdy.cc/movie/10450.html", "leading": ["\u5434\u5ff5\u771f", "\u91d1\u71d5\u73b2", "Issei Ogata", "Kelly Lee (II)", "Jonathan Chang", "Hsi-Sheng Chen", "Su-Yun Ko", "Michael Tao", "\u8427\u6dd1\u614e", "Adrian Lin", "Pang Chang Yu", "Ru-Yun Tang", "Shu-Yuan Hsu", "Hsin-Yi Tseng", "\u9648\u4ee5\u6587", "Tang Congsheng"], "area": "\u4e2d\u56fd\u53f0\u6e7e", "detail_pic": "http://img.kankanba.com/cs/250X350/2/cbe3d833e70d0a44b26ff5cf639fdcc2.jpg", "director": ["\u6768\u5fb7\u660c"], "show_day": "2000-05-14 \u6cd5\u56fd", "type": ["\u5267\u60c5"]}
__author__ = 'ShengYue'
<?xml version="1.0" encoding="utf-8"?>
<!-- cate.xml: crawler configuration, loaded by spiderling.py below -->
<root>
  <site siteName="ffdy" url="http://www.ffdy.cc/" daily="0.3" log="ffdy.log" error="ffdy_error.log" charset="utf-8">
    <linkRules>
      <rule type="reg" value="(type/movie|movie)" />
    </linkRules>
    <targets>
      <target name="info">
        <urlRules>
          <rule type="reg" value=".*/movie/(\d+).html" />
        </urlRules>
        <model dataType="array">
          <field name="title">
            <parsers>
              <parser type="text" xpath="//h1/text()" />
            </parsers>
          </field>
          <field name="url">
            <parsers>
              <parser type="pageurl" xpath="//h1/text()" />
            </parsers>
          </field>
          <field name="detail_pic">
            <parsers>
              <parser type="text" xpath="//div[@class='detail_pic']/span/img/@src" />
            </parsers>
          </field>
          <field name="director">
            <parsers>
              <parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='导演:']/../td[last()]/a/text()" code="u" />
            </parsers>
          </field>
          <field name="leading">
            <parsers>
              <parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='主演:']/../td[last()]/a/text()" code="u" />
            </parsers>
          </field>
          <field name="type">
            <parsers>
              <parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='类型:']/../td[last()]/a/text()" code="u" />
            </parsers>
          </field>
          <field name="area">
            <parsers>
              <parser type="text" xpath="//div[@class='detail_intro']/table/tr/td[text()='国家/地区:']/../td[last()]/a/text()" code="u" />
            </parsers>
          </field>
          <field name="show_day">
            <parsers>
              <parser type="text" xpath="//div[@class='detail_intro']/table/tr/td[text()='上映日期:']/../td[last()]/text()" code="u" />
            </parsers>
          </field>
          <field name="comment">
            <parsers>
              <parser type="html" xpath="//div[@class='filmcontents']" reg="u" />
            </parsers>
          </field>
        </model>
      </target>
    </targets>
  </site>
</root>
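Each <field> above binds a name to a parser rule, and model/match.py applies the rule's xpath to the fetched page, shaping the result by its type (text, array, pageurl, or html). A minimal standalone sketch of that dispatch, using an inline stand-in page instead of a live fetch:

# -*- coding: utf-8 -*-
# Sketch: apply one <parser> rule from cate.xml to a page.
import lxml.html

html = u'<html><body><h1> Yi Yi </h1></body></html>'  # stand-in for a fetched page
page = lxml.html.fromstring(html)

rule = {'type': 'text', 'xpath': '//h1/text()'}       # the "title" field above
hits = page.xpath(rule['xpath'])
if rule['type'] == 'text':
    value = hits[0].strip() if hits else ''
elif rule['type'] == 'array':
    value = [h.strip() for h in hits if h.strip()]
print value  # -> Yi Yi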
Wed, 10 Apr 2013 20:10:52 log.py[line:21] INFO site crawl finished
# -*- coding: utf-8 -*-
# frame.py: wxPython front end for the goods scraper (frozen by mysetup.py below).
__author__ = 'ShengYue'
import wx
from lxml import etree
from index import main


class DemoFrame(wx.Frame):
    def __init__(self):
        self.cateList = []
        wx.Frame.__init__(self, None, -1, u"load goods", size=(400, 200))
        self.draw()

    def draw(self):
        self.panel = wx.Panel(self, -1)
        wx.StaticText(self.panel, -1, u"Enter URL:", (15, 15))
        wx.StaticText(self.panel, -1, u"Category:", (15, 50))
        sampleList = self.getCate()
        self.getCateList(sampleList)
        self.cate = wx.ComboBox(self.panel, -1, self.cateList[0], (80, 50), wx.DefaultSize, self.cateList)
        self.text = wx.TextCtrl(self.panel, -1, value='', pos=(80, 15), size=(300, 24))
        self.button = wx.Button(self.panel, -1, u"Fetch", pos=(15, 90))
        self.Bind(wx.EVT_BUTTON, self.OnClick, self.button)

    def OnClick(self, event):
        self.button.SetLabel(u'Fetching...')
        self.button.Enable(False)
        index = main()
        ok = index.init(self.text.GetValue(), self.cate.GetValue())
        if ok:
            self.button.SetLabel(u'Fetch')
            self.button.Enable(True)

    def getCateList(self, cate):
        # Flatten the nested category list into self.cateList.
        for s in cate:
            if isinstance(s, list):
                self.getCateList(s)
            else:
                self.cateList.append(s)

    def getCate(self, xpath=None, p=''):
        # Walk a category file with /root/cate nodes, labelling children "parent->child".
        ret = []
        if xpath is None:
            xtree = etree.parse('cate.xml')
            xpath = xtree.xpath('/root/cate')
        for cate in xpath:
            row = cate.getchildren()
            ret.append(p + cate.get('name'))
            if row:
                ret.append(self.getCate(row, p + cate.get('name') + '->'))
        return ret

app = wx.PySimpleApp()
frame = DemoFrame()
frame.Show()
app.MainLoop()
# -*- coding: utf-8 -*-
# index.py: scrape a single product page and write it out as an import CSV.
__author__ = 'ShengYue'
from lxml import etree
from model.curl import curl
import csv
import re

# Column headers for the goods-import CSV.
header = ("*:通用商品类型", "bn:商品货号", "ibn:规格货号", "col:分类", "col:品牌", "col:市场价", "col:成本价", "col:销售价", "col:商品名称",
          "col:上架", "col:规格", "price:普通会员", "price:高级会员", "price:VIP会员", "col:缩略图", "col:图片文件", "col:商品简介",
          "col:详细介绍", "col:重量", "col:单位", "col:库存", "col:货位", "col:大图片", "col:小图片")


class main:
    def init(self, url, cate):
        self.curl = curl()
        html = self.curl.read(url)
        data = {}
        xtree = etree.HTML(html)
        # Title
        title = xtree.xpath('//h1')
        data['name'] = title[0].text.strip()
        # Sale price
        price = xtree.xpath('//span[@id="ECS_SHOPPRICE"]')
        data['price'] = price[0].text.strip()
        # Original (market) price
        oldprice = xtree.xpath('//span[@class="xline"]')
        oldprice = re.findall(re.compile(r'[\d.]*'), oldprice[0].text.strip())
        data['oldprice'] = oldprice[1]
        # Brand (unused)
        #brand = xtree.xpath('//*[@id="ECS_FORMBUY"]/div/div[3]/span[2]/a')
        #data['brand'] = brand[0].text.strip()
        # Item number
        huohao = xtree.xpath('//*[@id="ECS_FORMBUY"]/div/p/span[2]')
        data['ibn'] = huohao[0].text.strip()
        # Large image: download it and keep the local path
        bimg = xtree.xpath('//*[@id="thumg"]')
        imgurl = bimg[0].get('src').strip()
        data['bimg'] = self.curl.down(imgurl)
        # The small image reuses the same file
        data['simg'] = data['bimg']
        # Detail description: collapse whitespace and rename 'src2' attributes to 'src'
        dest = xtree.xpath('//div[@class="deszone"]/div[@class="zones"]')
        des = etree.tostring(dest[0], encoding='utf-8')
        reg = re.compile(r'\s', re.I)
        data['des'] = reg.sub(' ', des).replace('src2', 'src')
        # Download every image referenced in the description and rewrite its URL
        imgreg = re.compile(r"<img\b[^<>]*?\bsrc[2\s\t\r\n]*=[\s\t\r\n]*['\"]?[\s\t\r\n]*([^\s\t\r\n'\"<>]*)[^<>]*?/?[\s\t\r\n]*>")
        ilist = imgreg.findall(data['des'])
        for img in ilist:
            try:
                print u'downloading ' + img
                new = self.curl.down(img)
                data['des'] = data['des'].replace(img, new)
            except:
                print u'download failed ' + img
        # Assemble the CSV row in header order
        row = (self.curl.mdcode('通用商品类型'), self.curl.mdcode(data['ibn']), '',
               self.curl.mdcode(cate), '',
               self.curl.mdcode(data['oldprice']), self.curl.mdcode(data['price']), self.curl.mdcode(data['price']),
               self.curl.mdcode(data['name']), 'Y', '',
               self.curl.mdcode(data['price']), self.curl.mdcode(data['price']), self.curl.mdcode(data['price']),
               self.curl.mdcode(data['simg']), self.curl.mdcode(data['bimg']), '',
               self.curl.mdcode(data['des']), '0.000', '', '', '',
               self.curl.mdcode(data['bimg']), self.curl.mdcode(data['simg']))
        fop = open('tmp.csv', 'wb')
        writer = csv.writer(fop)
        writer.writerow(header)
        writer.writerow(row)
        print u'done'
        fop.close()
        return True

#mai = main()
#mai.init()
#curls = curl()
#curls.down('http://www.msex.com/static/upload/1303121657296625.jpg',{})
# -*- coding: utf-8 -*-
# model/curl.py: thin urllib2 wrapper for fetching pages and downloading images.
__author__ = 'ShengYue'
import urllib2
import time
import random
import os
import os.path
import urllib


class curl:
    # Link table
    urlList = {}
    req = None

    # Character-encoding handling: try the likely encodings in turn.
    def mdcode(self, data):
        for c in ('utf-8', 'gbk', 'gb2312'):
            try:
                return data.decode(c)
            except UnicodeError:
                pass
        return data  # fall back to the raw bytes if nothing decodes

    def getBaseUrl(self, base_url, link):
        print ''  # stub, not implemented

    def read(self, url, config=None):
        try:
            url = urllib.unquote(url)
            header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0'}
            self.req = urllib2.Request(url, headers=header)
            # Add any extra request headers
            for key in (config or {}):
                self.req.add_header(key, config[key])
            res = urllib2.urlopen(self.req)
            html = res.read()
            res.close()
            return self.mdcode(html)
        except:
            print u'failed to fetch HTML'
            return ''

    def getFileName(self):
        # Timestamp plus two random two-digit suffixes.
        return time.strftime('%y%m%d%H%M', time.localtime(time.time())) + '-' + \
               str(random.randint(10, 99)) + '-' + str(random.randint(10, 99))

    def down(self, url):
        ext = os.path.splitext(url)[-1]
        socket = urllib2.urlopen(url)
        data = socket.read()
        fileName = self.getFileName() + ext
        if not os.path.isdir('./images'):
            os.makedirs('./images')
        with open('./images/' + fileName, "wb") as jpg:
            jpg.write(data)
        socket.close()
        return '/uploads/images/' + fileName
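Taken together, curl.read returns the page as unicode (mdcode tries utf-8, gbk, then gb2312) and curl.down mirrors an image into ./images and returns the rewritten path. A usage sketch; the poster URL is hypothetical:

# Sketch: fetch a page and mirror one image locally (hypothetical image URL).
from model.curl import curl

c = curl()
html = c.read('http://www.ffdy.cc/')             # unicode page text, or '' on failure
local = c.down('http://www.ffdy.cc/poster.jpg')  # hypothetical
print local                                      # e.g. /uploads/images/1305082210-42-17.jpg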
# -*- coding: utf-8 -*-
# model/db.py: MySQL persistence for harvested links and movie records.
__author__ = 'ShengYue'
import MySQLdb
import hashlib

db_host = '127.0.0.1'
db_user = 'root'
db_passwd = 'LEsc2008'
db_dbname = 'python'
db_port = 3306


class db:
    def __init__(self):
        try:
            self.conn = MySQLdb.connect(host=db_host, user=db_user, passwd=db_passwd, port=db_port,
                                        use_unicode=True, charset='utf8')
            self.cur = self.conn.cursor()
            '''Create the database if it does not already exist'''
            #count = self.cur.execute("create database if not exists %s", db_dbname)
            self.conn.select_db(db_dbname)
            #self.cur.execute("SET NAMES utf8")
        except MySQLdb.Error, e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])

    '''
    Fetch one unvisited link for the given site
    '''
    def get_url(self, web_name):
        self.cur.execute("SELECT * FROM links WHERE web_name = %s AND status=0", (web_name,))
        return self.cur.fetchone()

    '''
    Persist a link
    '''
    def add_url(self, link, web_name):
        md5 = hashlib.md5(link).hexdigest()
        print link
        self.cur.execute("INSERT INTO links(`link`, `web_name`, `md5`)VALUES(%s, %s, %s)", [link, web_name, md5])
        self.conn.commit()

    '''
    Check whether a link has been seen already
    '''
    def check_url(self, link):
        md5 = hashlib.md5(link).hexdigest()
        return self.cur.execute("SELECT * FROM links WHERE `md5`=%s", (md5,))

    def update_url(self, id):
        self.cur.execute("UPDATE links SET status = 1 WHERE id=%s", (id,))
        self.conn.commit()
        return True

    def add_star(self, director):
        # Insert a person if unseen; return their id either way.
        count = self.cur.execute("SELECT id FROM star WHERE name=%s", (director,))
        if count == 0:
            self.cur.execute("INSERT INTO star(name)VALUES(%s)", (director,))
            id = self.conn.insert_id()
            self.conn.commit()
            return str(id)
        else:
            star = self.cur.fetchone()
            return str(star[0])

    def addData(self, data):
        ### Directors: store comma-separated star ids
        director = ''
        try:
            for daoyan in data['director']:
                director += ',' + self.add_star(daoyan)
            director = director.strip(',')
        except:
            director = ''
        ### Leading actors
        leading = ''
        try:
            for lead in data['leading']:
                leading += ',' + self.add_star(lead)
            leading = leading.strip(',')
        except:
            leading = ''
        ### Synopsis
        comment = ''
        try:
            for comm in data['comment']:
                comment += comm
        except:
            comment = ''
        # Title, image, link, and the rest of the record
        insertData = [data['title'], data['detail_pic'], data['url'], director, leading,
                      data['area'], data['show_day'], comment]
        self.add_movie(insertData)

    def add_movie(self, insertData):
        self.cur.execute("INSERT INTO movie(`title`,`img`,`url`,`director`,`leading`,`area`,`show_day`,`comment`)"
                         "VALUES(%s, %s, %s, %s, %s, %s, %s, %s)", insertData)
        self.conn.commit()

    '''
    Close the database
    '''
    def close(self):
        try:
            self.cur.close()
            self.conn.close()
        except:
            pass
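The gist never shows the table definitions, but the queries above pin down the columns: links(id, link, web_name, md5, status), star(id, name), and the movie row written by add_movie. A hedged reconstruction of the schema, with all types and lengths being assumptions:

# Sketch: schema inferred from the queries above; every type/length is an assumption.
schema = [
    """CREATE TABLE IF NOT EXISTS links (
         id INT AUTO_INCREMENT PRIMARY KEY,
         link TEXT, web_name VARCHAR(64), md5 CHAR(32),
         status TINYINT DEFAULT 0)""",
    """CREATE TABLE IF NOT EXISTS star (
         id INT AUTO_INCREMENT PRIMARY KEY,
         name VARCHAR(255))""",
    """CREATE TABLE IF NOT EXISTS movie (
         id INT AUTO_INCREMENT PRIMARY KEY,
         title VARCHAR(255), img VARCHAR(255), url VARCHAR(255),
         director VARCHAR(255), `leading` VARCHAR(255),
         area VARCHAR(64), show_day VARCHAR(64), comment TEXT)""",
]
d = db()
for ddl in schema:
    d.cur.execute(ddl)
d.conn.commit()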
# -*- coding: utf-8 -*-
# model/log.py: minimal file-based logging helpers.
__author__ = 'ShengYue'
import logging


class log:
    @staticmethod
    def read(file):
        try:
            fopen = open(file, 'r')
            data = fopen.read()
            fopen.close()
            return data
        except:
            pass

    @staticmethod
    def write(file, logs):
        try:
            logging.basicConfig(level=logging.DEBUG,
                                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                                datefmt='%a, %d %b %Y %H:%M:%S', filename=file, filemode='w')
            logging.info(logs)
        except Exception, e:
            print '-----------', e
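One caveat worth spelling out: logging.basicConfig only configures the root logger on its first call, so later log.write calls cannot switch between ffdy.log and ffdy_error.log, and filemode='w' truncates the file it does open. A sketch of per-file loggers that keeps the same line format:

# Sketch: one logger per target file, configured once, same format as above.
import logging

def get_file_logger(path):
    logger = logging.getLogger(path)
    if not logger.handlers:  # configure each file's logger only once
        handler = logging.FileHandler(path)
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            '%a, %d %b %Y %H:%M:%S'))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger

get_file_logger('ffdy.log').info(u'site crawl finished')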
# -*- coding: utf-8 -*-
# model/match.py: clean fetched HTML and extract links/fields by rule.
__author__ = 'ShengYue'
import lxml
import lxml.etree
import lxml.html
from lxml.html.clean import Cleaner
import re


class match:
    '''
    Clean up the HTML and build an XPath-able tree
    '''
    def __init__(self, html, url):
        cleaner = Cleaner(style=True, scripts=True, page_structure=False, safe_attrs_only=False)
        html = cleaner.clean_html(html)
        del cleaner
        self.etree = lxml.html.fromstring(html)
        self.etree.make_links_absolute(base_url=url, resolve_base_href=True)

    '''
    Collect every link on the page that matches the link rules
    '''
    def get_all_links(self, link_match, url):
        links = []
        all_links = self.etree.xpath('//a')
        for match in link_match:
            regLink = re.compile(url + match.get('value'))
            for a in all_links:
                try:
                    href = a.get('href')
                except:
                    continue
                if regLink.match(href) is not None:
                    links.append(href)
        del all_links
        return links

    '''
    Extract every configured field from the page
    '''
    def get_match_info(self, match, url=None):
        try:
            data = {}
            for param in match:
                name = param.get('name')
                ntree = lxml.html.fromstring(lxml.etree.tostring(param))
                node = ntree.xpath('//parsers/parser')[0]
                xpath = node.get('xpath')
                infoxpath = self.etree.xpath(xpath)
                try:
                    nodetype = node.get('type')
                    if nodetype == 'text':
                        data[name] = infoxpath[0].strip()
                    elif nodetype == 'array':
                        arr = []
                        for item in infoxpath:
                            if item.strip() == '':
                                continue
                            arr.append(item.strip())
                        data[name] = arr
                    elif nodetype == 'pageurl':
                        data[name] = url
                    elif nodetype == 'html':
                        # Serialize the matched node, then strip all tags except a small whitelist.
                        infohtml = lxml.etree.tostring(infoxpath[0], encoding="utf-8", method="html")
                        infohtml = infohtml.strip()
                        reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b|\bspan\b|\bimg\b))+\b\s*[^>]*>|[\s\r\n\t]+')
                        infohtml = reg.sub(' ', infohtml).strip()
                        data[name] = infohtml
                except:
                    data[name] = ''
                    print name, u'could not be extracted'
                    continue
        except:
            print xpath, u'could not be extracted'
        return data

    def match_tiantang(self, match, url):
        # Variant of get_match_info with a smaller tag whitelist.
        try:
            data = {}
            for param in match:
                name = param.get('name')
                ntree = lxml.html.fromstring(lxml.etree.tostring(param))
                node = ntree.xpath('//parsers/parser')[0]
                xpath = node.get('xpath')
                infoxpath = self.etree.xpath(xpath)
                try:
                    nodetype = node.get('type')
                    if nodetype == 'text':
                        data[name] = infoxpath[0].strip()
                    elif nodetype == 'array':
                        arr = []
                        for item in infoxpath:
                            if item.strip() == '':
                                continue
                            arr.append(item.strip())
                        data[name] = arr
                    elif nodetype == 'pageurl':
                        data[name] = url
                    elif nodetype == 'html':
                        infohtml = lxml.etree.tostring(infoxpath[0], encoding="utf-8", method="html")
                        infohtml = infohtml.strip()
                        reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b))+\b\s*[^>]*>|[\s\r\n\t]+')
                        infohtml = reg.sub(' ', infohtml).strip()
                        data[name] = infohtml
                except:
                    data[name] = ''
                    print name, u'could not be extracted'
                    continue
        except:
            #log.write('system.log', xpath + u' could not be extracted')
            print xpath, u'could not be extracted'
        return data

    '''
    Extract every configured field, tolerating empty XPath results
    '''
    def get_match_info_test(self, match, url=None):
        try:
            data = {}
            for param in match:
                name = param.get('name')
                ntree = lxml.html.fromstring(lxml.etree.tostring(param))
                node = ntree.xpath('//parsers/parser')[0]
                xpath = node.get('xpath')
                infoxpath = self.etree.xpath(xpath)
                try:
                    nodetype = node.get('type')
                    if nodetype == 'text':
                        if infoxpath != []:
                            data[name] = infoxpath[0].strip()
                        else:
                            data[name] = ''
                    elif nodetype == 'array':
                        arr = []
                        if infoxpath == []:
                            data[name] = arr
                        else:
                            for item in infoxpath:
                                if item.strip() == '':
                                    continue
                                arr.append(item.strip())
                            data[name] = arr
                    elif nodetype == 'pageurl':
                        data[name] = url
                    elif nodetype == 'html':
                        if infoxpath == []:
                            data[name] = ''
                        else:
                            infohtml = lxml.etree.tostring(infoxpath[0], encoding="utf-8", method="html")
                            infohtml = infohtml.strip()
                            reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b))+\b\s*[^>]*>|[\s\r\n\t]+')
                            infohtml = reg.sub(' ', infohtml).strip()
                            data[name] = infohtml
                except:
                    data[name] = ''
                    print name, u'could not be extracted'
                    continue
        except:
            #log.write('system.log', xpath + u' could not be extracted')
            print xpath, u'could not be extracted'
        return data

    # print self.etree.xpath('//h1/text()')[0]
    # print self.etree.xpath('//h1/em/text()')[0]
    # com = self.etree.xpath("//div[@class='filmcontents']/node()/text()|//div[@class='filmcontents']/text()")
    # s = ''
    # for c in com:
    #     s = s + c
    # print s
    # # Rule study
    # d = self.etree.xpath(u"//div[@class='detail_intro']/table/tr/td[text()='上映日期:']/../td[last()]/text()")
    # print d[0]
    def close(self):
        del self.etree
Mon, 08 Apr 2013 22:17:20 log.py[line:25] INFO site crawl finished
# mysetup.py
from distutils.core import setup
import py2exe

setup(options={"py2exe": {"dll_excludes": ["MSVCP90.dll", 'lxml.dll']}},
      windows=[{"script": "frame.py"}])
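py2exe plugs into distutils as a build command, so freezing the GUI is the standard invocation:

# Build, from a shell:
#   python mysetup.py py2exe
# The frozen frame.exe (plus its DLLs) lands under .\dist.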
# -*- coding: utf-8 -*-
# spiderling.py: the crawler itself, driven by the cate.xml configuration.
__author__ = 'ShengYue'
from lxml import etree
from model.db import db
from model.curl import curl
from model.match import match
from model.log import log
import re
import time

'''
Spider
'''
class spiderling:
    def __init__(self, config):
        self.i = 0
        try:
            configtree = etree.ElementTree(file=config)
            # Read the site attributes
            sites = configtree.xpath('//site')
            site = sites[0]
            self.url = site.get('url')
            self.site_name = site.get('siteName')
            self.daily = float(site.get('daily'))
            self.log = site.get('log')
            self.errlog = site.get('error')
            self.linkRule = configtree.xpath('//linkRules/rule')
            self.infoUrlRule = configtree.xpath('//urlRules/rule')
            self.infoRule = configtree.xpath('//targets/target/model/field')
        except:
            log.write('error.log', u'failed to read the config file')
        self.db = db()

    def run(self, url):
        # Crawl delay; note that run() recurses once per page, so a very long
        # crawl can eventually hit Python's recursion limit.
        time.sleep(self.daily)
        if url is None:
            info = self.db.get_url(self.site_name)
            if info is None:
                log.write(self.log, u'site crawl finished')
                return 0
            self.db.update_url(info[0])
            url = info[1]
        gurl = curl()
        html = gurl.read(url)
        try:
            if html.strip() == '':
                return self.run(None)  # skip empty pages
        except Exception, e:
            log.write(self.log, url + u' failed to fetch html: ' + str(e))
            return self.run(None)
        self.xtree = match(html, url)
        links = self.xtree.get_all_links(self.linkRule, self.url)
        '''Persist the harvested links'''
        for link in links:
            if self.db.check_url(link) == 0:
                self.db.add_url(link, self.site_name)
        '''If the current link is a detail page, extract the configured fields'''
        regInfoLink = re.compile(self.infoUrlRule[0].get('value'))
        if regInfoLink.match(url) is not None:
            self.i = self.i + 1
            data = self.xtree.get_match_info(self.infoRule, url)
            self.db.addData(data)
            #
            # file_object = open(str(self.i) + 'id.txt', 'w')
            # file_object.write(json.dumps(data))
            # file_object.close()
            #
            #print json.dumps(data)
        else:
            print u'not a detail page, skipping parse'
        self.run(None)

    def close(self):
        try:
            self.xtree.close()
        except:
            pass
        try:
            self.db.close()
        except:
            pass

sp = spiderling('cate.xml')
sp.run(sp.url)
#sp.run('http://www.ffdy.cc/movie/35622.html')
sp.close()
#import sqlite3  # sqlite experiment, kept commented out
#cx = sqlite3.connect("d:\\test.db")
#cu = cx.cursor()
##cu.execute("""create table catalog ( id integer primary key, pid integer, name varchar(10) UNIQUE )""")
##cu.execute(u"insert into catalog values(2, 0, '哈哈')")
##cu.execute(u"insert into catalog values(3, 0, '我是中国')")
##cx.commit()
#cu.execute("select * from catalog")
#d = cu.fetchall()
#for s in d:
#    print s[2]
#cu.close()
#cx.close()
# -*- coding: utf-8 -*-
# Test variant of the spider: dumps the matched HTML instead of storing it.
__author__ = 'ShengYue'
from lxml import etree
from model.db import db
from model.curl import curl
from model.match import match
import re
import time
import lxml
import lxml.etree

'''
Spider (test build)
'''
class spiderling:
    def __init__(self, config):
        self.i = 0
        configtree = etree.ElementTree(file=config)
        site = configtree.xpath('//site')
        self.url = site[0].get('url')
        self.site_name = site[0].get('siteName')
        self.linkRule = configtree.xpath('//linkRules/rule')
        self.infoUrlRule = configtree.xpath('//urlRules/rule')
        self.infoRule = configtree.xpath('//targets/target/model/field')
        self.db = db()

    def run(self, url):
        time.sleep(0.3)
        if url is None:
            info = self.db.get_url(self.site_name)
            if info is None:
                print u'crawl finished'
                return 0
            self.db.update_url(info[0])
            url = info[1]
        gurl = curl()
        html = gurl.read(url)
        try:
            if html.strip() == '':
                return self.run(None)
        except:
            return self.run(None)
        self.xtree = match(html, url)
        # Debug: print the synopsis block before and after tag stripping, then stop.
        d = self.xtree.etree.xpath("//div[@class='filmcontents']")
        sd = etree.tostring(d[0], encoding="utf-8", method="html")
        sd = sd.strip()
        print sd
        print '================================='
        reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b))+\b\s*[^>]*>|[\s\r\n\t]+')
        ds = reg.sub(' ', sd).strip()
        print ds
        return
        # Unreachable while the early return above is in place.
        links = self.xtree.get_all_links(self.linkRule, self.url)
        '''Persist the harvested links'''
        for link in links:
            if self.db.check_url(link) == 0:
                self.db.add_url(link, self.site_name)
        '''If the current link is a detail page, extract the configured fields'''
        regInfoLink = re.compile(self.infoUrlRule[0].get('value'))
        if regInfoLink.match(url) is not None:
            self.i = self.i + 1
            print u'detail page, parsing', str(self.i)
            data = self.xtree.get_match_info_test(self.infoRule, url)
            print u'inserting data', url
            self.db.addData(data)
        else:
            print u'not a detail page, skipping parse'
        self.run(None)

    def close(self):
        self.xtree.close()
        self.db.close()

#sp = spiderling('cate.xml')
#sp.run(sp.url)
#sp.run('http://www.ffdy.cc/movie/35622.html')
#sp.close()

# Standalone test against a dytt8.net detail page.
url = 'http://www.dytt8.net/html/gndy/dyzz/20130407/41866.html'
curls = curl()
html = curls.read(url, {})
xtree = match(html, url)
content = xtree.etree.xpath('//div[@id="Zoom"]')
infohtml = lxml.etree.tostring(content[0], encoding="utf-8", method="html")
infohtml = infohtml.strip()
reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b|\bspan\b|\bimg\b|\ba\b))+\b\s*[^>]*>')
infohtml = reg.sub(' ', infohtml).strip()
pattern = re.compile(u'◎年  代 ([^<]*)')  # matches the "year" line in the page text
ds = pattern.search(html)
print
if ds is None:  # search() returns None on no match, not []
    print u'not found'
else:
    print ds.group(0)
print infohtml