Gist by @ysc8620, created May 8, 2013.
/.idea
*backup
*.bak
*tpl.php
/.*
*.zip
*.pyc
{"comment": ["\u4eba\u5230\u4e2d\u5e74\u7684\u7b80\u5357\u4fca\u662f\u4e2a\u4e0d\u592a\u6210\u529f\u7684\u751f\u610f\u4eba\uff0c\u62e5\u6709\u4e00\u4e2a\u5178\u578b\u7684\u4e2d\u4ea7\u9636\u7ea7\u5bb6\u5ead\uff0c\u4ed6\u4e0e\u59bb\u5b50\u548c\u4e24\u4e2a\u5b69\u5b50\uff0c\u4ee5\u53ca\u5cb3\u6bcd\u4f4f\u5728\u53f0\u5317\u4e00\u95f4\u666e\u901a\u516c\u5bd3\u623f\u5b50\u91cc\u3002\u59bb\u5b50\u662f\u4e00\u4e2a\u8106\u5f31\u7684\u5973\u4eba\uff0c\u56e0\u4e3a\u6bcd\u4eb2\u7684\u75c5\u800c\u5fc3\u529b\u4ea4\u7601\u3002\u5c0f\u513f\u5b50\u513f\u5b50\u53ea\u670910\u5c81\u5374\u975e\u5e38\u65e9\u719f\uff0c\u559c\u6b22\u62cd\u6444\u4eba\u7684\u80cc\u5f71\u548c\u63d0\u95ee\u54f2\u5b66\u95ee\u9898\u3002\u5927\u5973\u513f\u662f\u4e00\u4e2a\u97f3\u4e50\u5b66\u751f\uff0c\u56e0\u9677\u5165\u4e86\u9519\u8bef\u7684\u7231\u60c5\u800c\u5f00\u59cb\u5c1d\u5230\u4eba\u751f\u7684\u82e6\u6da9\u3002\u4e00\u6b21\u5728\u5c0f\u8205\u5b50\u7684\u5a5a\u793c\u4e0a\uff0c\u7b80\u5357\u4fca\u9047\u5230\u4e86\u5e74\u8f7b\u65f6\u7684\u5973\u53cb\uff0c\u91cd\u65b0\u71c3\u8d77\u4e86\u4e45\u8fdd\u7684\u7231\u60c5\u2026\u2026", "2000\u5e74\uff0c\u662f\u4e9a\u6d32\u7535\u5f71\u5927\u4e30\u6536\u7684\u4e00\u5e74\uff0c5\u6708\u4e3e\u884c\u7684\u621b\u7eb3\u7535\u5f71\u8282\u51e0\u4e4e\u6210\u4e86\u201c\u4e9a\u6d32\u7535\u5f71\u7684\u8282\u65e5\u201d\uff0c\u5728\u8fd9\u6b21\u7535\u5f71\u8282\u4e0a\uff0c\u300a\u82b1\u6837\u5e74\u534e\u300b\u83b7\u5f97\u4e86\u6700\u4f73\u5f71\u7247\u3001\u6700\u4f73\u7537\u4e3b\u89d2\u4e24\u9879\u5927\u5956\uff0c\u5bfc\u6f14\u738b\u5bb6\u536b\u5927\u51fa\u98ce\u5934\uff0c\u800c\u300a\u4e00\u4e00\u300b\u7684\u5bfc\u6f14\u6768\u5fb7\u660c\u751a\u81f3\u6bd4\u738b\u5bb6\u536b\u66f4\u52a0\u5f15\u4eba\u6ce8\u76ee\uff0c\u56e0\u4e3a\u4ed6\u83b7\u5f97\u4e86\u5c5e\u4e8e\u5bfc\u6f14\u7684\u6700\u9ad8\u8363\u8a89\u2014\u2014\u6700\u4f73\u5bfc\u6f14\u5956\u3002\u4f17\u591a\u7684\u89c2\u4f17\u4e3a\u4ed6\u7684\u8fd9\u90e8\u590d\u6742\u3001\u7ec6\u81f4\u800c\u4f18\u96c5\u7684\u5f71\u7247\u800c\u503e\u5012\uff0c\u5e76\u5bf9\u534e\u8bed\u7535\u5f71\u4ea7\u751f\u4e86\u6781\u5927\u5174\u8da3\u3002\u300a\u4e00\u4e00\u300b\u4e5f\u6210\u529f\u5730\u8fdb\u5165\u4e86\u7f8e\u56fd\u5e02\u573a\uff0c\u6210\u4e3a\u88ab\u7f8e\u56fd\u666e\u901a\u89c2\u4f17\u6240\u770b\u5230\u7684\u7b2c\u4e00\u90e8\u6768\u5fb7\u660c\u5bfc\u6f14\u7684\u5f71\u7247\uff0c\u6210\u4e3a\u4ed6\u7684\u7535\u5f71\u5927\u6b65\u8fc8\u8fdb\u66f4\u5e7f\u9614\u7684\u56fd\u9645\u5e02\u573a\u7684\u7b2c\u4e00\u6b65\u3002"], "title": "\u4e00\u4e00", "url": "http://www.ffdy.cc/movie/10450.html", "leading": ["\u5434\u5ff5\u771f", "\u91d1\u71d5\u73b2", "Issei Ogata", "Kelly Lee (II)", "Jonathan Chang", "Hsi-Sheng Chen", "Su-Yun Ko", "Michael Tao", "\u8427\u6dd1\u614e", "Adrian Lin", "Pang Chang Yu", "Ru-Yun Tang", "Shu-Yuan Hsu", "Hsin-Yi Tseng", "\u9648\u4ee5\u6587", "Tang Congsheng"], "area": "\u4e2d\u56fd\u53f0\u6e7e", "detail_pic": "http://img.kankanba.com/cs/250X350/2/cbe3d833e70d0a44b26ff5cf639fdcc2.jpg", "director": ["\u6768\u5fb7\u660c"], "show_day": "2000-05-14 \u6cd5\u56fd", "type": ["\u5267\u60c5"]}
__author__ = 'ShengYue'
<?xml version="1.0" encoding="utf-8"?>
<!-- cate.xml: crawler configuration, loaded by spiderling.py below -->
<root>
  <site siteName="ffdy" url="http://www.ffdy.cc/" daily="0.3" log="ffdy.log" error="ffdy_error.log" charset="utf-8">
    <linkRules>
      <rule type="reg" value="(type/movie|movie)" />
    </linkRules>
    <targets>
      <target name="info">
        <urlRules>
          <rule type="reg" value=".*/movie/(\d+).html" />
        </urlRules>
        <model dataType="array">
          <field name="title">
            <parsers>
              <parser type="text" xpath="//h1/text()" />
            </parsers>
          </field>
          <field name="url">
            <parsers>
              <parser type="pageurl" xpath="//h1/text()" />
            </parsers>
          </field>
          <field name="detail_pic">
            <parsers>
              <parser type="text" xpath="//div[@class='detail_pic']/span/img/@src" />
            </parsers>
          </field>
          <field name="director">
            <parsers>
              <parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='导演:']/../td[last()]/a/text()" code="u" />
            </parsers>
          </field>
          <field name="leading">
            <parsers>
              <parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='主演:']/../td[last()]/a/text()" code="u" />
            </parsers>
          </field>
          <field name="type">
            <parsers>
              <parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='类型:']/../td[last()]/a/text()" code="u" />
            </parsers>
          </field>
          <field name="area">
            <parsers>
              <parser type="text" xpath="//div[@class='detail_intro']/table/tr/td[text()='国家/地区:']/../td[last()]/a/text()" code="u" />
            </parsers>
          </field>
          <field name="show_day">
            <parsers>
              <parser type="text" xpath="//div[@class='detail_intro']/table/tr/td[text()='上映日期:']/../td[last()]/text()" code="u" />
            </parsers>
          </field>
          <field name="comment">
            <parsers>
              <parser type="html" xpath="//div[@class='filmcontents']" reg="u" />
            </parsers>
          </field>
        </model>
      </target>
    </targets>
  </site>
</root>
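Each <field> above binds a name to a parser rule, and model/match.py applies the rule's xpath to the fetched page, shaping the result by its type (text, array, pageurl, or html). A minimal standalone sketch of that dispatch, using an inline stand-in page instead of a live fetch:

# -*- coding: utf-8 -*-
# Sketch: apply one <parser> rule from cate.xml to a page.
import lxml.html

html = u'<html><body><h1> Yi Yi </h1></body></html>'  # stand-in for a fetched page
page = lxml.html.fromstring(html)

rule = {'type': 'text', 'xpath': '//h1/text()'}       # the "title" field above
hits = page.xpath(rule['xpath'])
if rule['type'] == 'text':
    value = hits[0].strip() if hits else ''
elif rule['type'] == 'array':
    value = [h.strip() for h in hits if h.strip()]
print value  # -> Yi Yi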
Wed, 10 Apr 2013 20:10:52 log.py[line:21] INFO site crawl finished
# -*- coding: utf-8 -*-
# frame.py: wxPython front end for the goods scraper (frozen by mysetup.py below).
__author__ = 'ShengYue'
import wx
from lxml import etree
from index import main


class DemoFrame(wx.Frame):
    def __init__(self):
        self.cateList = []
        wx.Frame.__init__(self, None, -1, u"load goods", size=(400, 200))
        self.draw()

    def draw(self):
        self.panel = wx.Panel(self, -1)
        wx.StaticText(self.panel, -1, u"Enter URL:", (15, 15))
        wx.StaticText(self.panel, -1, u"Category:", (15, 50))
        sampleList = self.getCate()
        self.getCateList(sampleList)
        self.cate = wx.ComboBox(self.panel, -1, self.cateList[0], (80, 50), wx.DefaultSize, self.cateList)
        self.text = wx.TextCtrl(self.panel, -1, value='', pos=(80, 15), size=(300, 24))
        self.button = wx.Button(self.panel, -1, u"Fetch", pos=(15, 90))
        self.Bind(wx.EVT_BUTTON, self.OnClick, self.button)

    def OnClick(self, event):
        self.button.SetLabel(u'Fetching...')
        self.button.Enable(False)
        index = main()
        ok = index.init(self.text.GetValue(), self.cate.GetValue())
        if ok:
            self.button.SetLabel(u'Fetch')
            self.button.Enable(True)

    def getCateList(self, cate):
        # Flatten the nested category list into self.cateList.
        for s in cate:
            if isinstance(s, list):
                self.getCateList(s)
            else:
                self.cateList.append(s)

    def getCate(self, xpath=None, p=''):
        # Walk a category file with /root/cate nodes, labelling children "parent->child".
        ret = []
        if xpath is None:
            xtree = etree.parse('cate.xml')
            xpath = xtree.xpath('/root/cate')
        for cate in xpath:
            row = cate.getchildren()
            ret.append(p + cate.get('name'))
            if row:
                ret.append(self.getCate(row, p + cate.get('name') + '->'))
        return ret

app = wx.PySimpleApp()
frame = DemoFrame()
frame.Show()
app.MainLoop()
# -*- coding: utf-8 -*-
# index.py: scrape a single product page and write it out as an import CSV.
__author__ = 'ShengYue'
from lxml import etree
from model.curl import curl
import csv
import re

# Column headers for the goods-import CSV.
header = ("*:通用商品类型", "bn:商品货号", "ibn:规格货号", "col:分类", "col:品牌", "col:市场价", "col:成本价", "col:销售价", "col:商品名称",
          "col:上架", "col:规格", "price:普通会员", "price:高级会员", "price:VIP会员", "col:缩略图", "col:图片文件", "col:商品简介",
          "col:详细介绍", "col:重量", "col:单位", "col:库存", "col:货位", "col:大图片", "col:小图片")


class main:
    def init(self, url, cate):
        self.curl = curl()
        html = self.curl.read(url)
        data = {}
        xtree = etree.HTML(html)
        # Title
        title = xtree.xpath('//h1')
        data['name'] = title[0].text.strip()
        # Sale price
        price = xtree.xpath('//span[@id="ECS_SHOPPRICE"]')
        data['price'] = price[0].text.strip()
        # Original (market) price
        oldprice = xtree.xpath('//span[@class="xline"]')
        oldprice = re.findall(re.compile(r'[\d.]*'), oldprice[0].text.strip())
        data['oldprice'] = oldprice[1]
        # Brand (unused)
        #brand = xtree.xpath('//*[@id="ECS_FORMBUY"]/div/div[3]/span[2]/a')
        #data['brand'] = brand[0].text.strip()
        # Item number
        huohao = xtree.xpath('//*[@id="ECS_FORMBUY"]/div/p/span[2]')
        data['ibn'] = huohao[0].text.strip()
        # Large image: download it and keep the local path
        bimg = xtree.xpath('//*[@id="thumg"]')
        imgurl = bimg[0].get('src').strip()
        data['bimg'] = self.curl.down(imgurl)
        # The small image reuses the same file
        data['simg'] = data['bimg']
        # Detail description: collapse whitespace and rename 'src2' attributes to 'src'
        dest = xtree.xpath('//div[@class="deszone"]/div[@class="zones"]')
        des = etree.tostring(dest[0], encoding='utf-8')
        reg = re.compile(r'\s', re.I)
        data['des'] = reg.sub(' ', des).replace('src2', 'src')
        # Download every image referenced in the description and rewrite its URL
        imgreg = re.compile(r"<img\b[^<>]*?\bsrc[2\s\t\r\n]*=[\s\t\r\n]*['\"]?[\s\t\r\n]*([^\s\t\r\n'\"<>]*)[^<>]*?/?[\s\t\r\n]*>")
        ilist = imgreg.findall(data['des'])
        for img in ilist:
            try:
                print u'downloading ' + img
                new = self.curl.down(img)
                data['des'] = data['des'].replace(img, new)
            except:
                print u'download failed ' + img
        # Assemble the CSV row in header order
        row = (self.curl.mdcode('通用商品类型'), self.curl.mdcode(data['ibn']), '',
               self.curl.mdcode(cate), '',
               self.curl.mdcode(data['oldprice']), self.curl.mdcode(data['price']), self.curl.mdcode(data['price']),
               self.curl.mdcode(data['name']), 'Y', '',
               self.curl.mdcode(data['price']), self.curl.mdcode(data['price']), self.curl.mdcode(data['price']),
               self.curl.mdcode(data['simg']), self.curl.mdcode(data['bimg']), '',
               self.curl.mdcode(data['des']), '0.000', '', '', '',
               self.curl.mdcode(data['bimg']), self.curl.mdcode(data['simg']))
        fop = open('tmp.csv', 'wb')
        writer = csv.writer(fop)
        writer.writerow(header)
        writer.writerow(row)
        print u'done'
        fop.close()
        return True

#mai = main()
#mai.init()
#curls = curl()
#curls.down('http://www.msex.com/static/upload/1303121657296625.jpg',{})
# -*- coding: utf-8 -*-
# model/curl.py: thin urllib2 wrapper for fetching pages and downloading images.
__author__ = 'ShengYue'
import urllib2
import time
import random
import os
import os.path
import urllib


class curl:
    # Link table
    urlList = {}
    req = None

    # Character-encoding handling: try the likely encodings in turn.
    def mdcode(self, data):
        for c in ('utf-8', 'gbk', 'gb2312'):
            try:
                return data.decode(c)
            except UnicodeError:
                pass
        return data  # fall back to the raw bytes if nothing decodes

    def getBaseUrl(self, base_url, link):
        print ''  # stub, not implemented

    def read(self, url, config=None):
        try:
            url = urllib.unquote(url)
            header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0'}
            self.req = urllib2.Request(url, headers=header)
            # Add any extra request headers
            for key in (config or {}):
                self.req.add_header(key, config[key])
            res = urllib2.urlopen(self.req)
            html = res.read()
            res.close()
            return self.mdcode(html)
        except:
            print u'failed to fetch HTML'
            return ''

    def getFileName(self):
        # Timestamp plus two random two-digit suffixes.
        return time.strftime('%y%m%d%H%M', time.localtime(time.time())) + '-' + \
               str(random.randint(10, 99)) + '-' + str(random.randint(10, 99))

    def down(self, url):
        ext = os.path.splitext(url)[-1]
        socket = urllib2.urlopen(url)
        data = socket.read()
        fileName = self.getFileName() + ext
        if not os.path.isdir('./images'):
            os.makedirs('./images')
        with open('./images/' + fileName, "wb") as jpg:
            jpg.write(data)
        socket.close()
        return '/uploads/images/' + fileName
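Taken together, curl.read returns the page as unicode (mdcode tries utf-8, gbk, then gb2312) and curl.down mirrors an image into ./images and returns the rewritten path. A usage sketch; the poster URL is hypothetical:

# Sketch: fetch a page and mirror one image locally (hypothetical image URL).
from model.curl import curl

c = curl()
html = c.read('http://www.ffdy.cc/')             # unicode page text, or '' on failure
local = c.down('http://www.ffdy.cc/poster.jpg')  # hypothetical
print local                                      # e.g. /uploads/images/1305082210-42-17.jpg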
# -*- coding: utf-8 -*-
# model/db.py: MySQL persistence for harvested links and movie records.
__author__ = 'ShengYue'
import MySQLdb
import hashlib

db_host = '127.0.0.1'
db_user = 'root'
db_passwd = 'LEsc2008'
db_dbname = 'python'
db_port = 3306


class db:
    def __init__(self):
        try:
            self.conn = MySQLdb.connect(host=db_host, user=db_user, passwd=db_passwd, port=db_port,
                                        use_unicode=True, charset='utf8')
            self.cur = self.conn.cursor()
            '''Create the database if it does not already exist'''
            #count = self.cur.execute("create database if not exists %s", db_dbname)
            self.conn.select_db(db_dbname)
            #self.cur.execute("SET NAMES utf8")
        except MySQLdb.Error, e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])

    '''
    Fetch one unvisited link for the given site
    '''
    def get_url(self, web_name):
        self.cur.execute("SELECT * FROM links WHERE web_name = %s AND status=0", (web_name,))
        return self.cur.fetchone()

    '''
    Persist a link
    '''
    def add_url(self, link, web_name):
        md5 = hashlib.md5(link).hexdigest()
        print link
        self.cur.execute("INSERT INTO links(`link`, `web_name`, `md5`)VALUES(%s, %s, %s)", [link, web_name, md5])
        self.conn.commit()

    '''
    Check whether a link has been seen already
    '''
    def check_url(self, link):
        md5 = hashlib.md5(link).hexdigest()
        return self.cur.execute("SELECT * FROM links WHERE `md5`=%s", (md5,))

    def update_url(self, id):
        self.cur.execute("UPDATE links SET status = 1 WHERE id=%s", (id,))
        self.conn.commit()
        return True

    def add_star(self, director):
        # Insert a person if unseen; return their id either way.
        count = self.cur.execute("SELECT id FROM star WHERE name=%s", (director,))
        if count == 0:
            self.cur.execute("INSERT INTO star(name)VALUES(%s)", (director,))
            id = self.conn.insert_id()
            self.conn.commit()
            return str(id)
        else:
            star = self.cur.fetchone()
            return str(star[0])

    def addData(self, data):
        ### Directors: store comma-separated star ids
        director = ''
        try:
            for daoyan in data['director']:
                director += ',' + self.add_star(daoyan)
            director = director.strip(',')
        except:
            director = ''
        ### Leading actors
        leading = ''
        try:
            for lead in data['leading']:
                leading += ',' + self.add_star(lead)
            leading = leading.strip(',')
        except:
            leading = ''
        ### Synopsis
        comment = ''
        try:
            for comm in data['comment']:
                comment += comm
        except:
            comment = ''
        # Title, image, link, and the rest of the record
        insertData = [data['title'], data['detail_pic'], data['url'], director, leading,
                      data['area'], data['show_day'], comment]
        self.add_movie(insertData)

    def add_movie(self, insertData):
        self.cur.execute("INSERT INTO movie(`title`,`img`,`url`,`director`,`leading`,`area`,`show_day`,`comment`)"
                         "VALUES(%s, %s, %s, %s, %s, %s, %s, %s)", insertData)
        self.conn.commit()

    '''
    Close the database
    '''
    def close(self):
        try:
            self.cur.close()
            self.conn.close()
        except:
            pass
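The gist never shows the table definitions, but the queries above pin down the columns: links(id, link, web_name, md5, status), star(id, name), and the movie row written by add_movie. A hedged reconstruction of the schema, with all types and lengths being assumptions:

# Sketch: schema inferred from the queries above; every type/length is an assumption.
schema = [
    """CREATE TABLE IF NOT EXISTS links (
         id INT AUTO_INCREMENT PRIMARY KEY,
         link TEXT, web_name VARCHAR(64), md5 CHAR(32),
         status TINYINT DEFAULT 0)""",
    """CREATE TABLE IF NOT EXISTS star (
         id INT AUTO_INCREMENT PRIMARY KEY,
         name VARCHAR(255))""",
    """CREATE TABLE IF NOT EXISTS movie (
         id INT AUTO_INCREMENT PRIMARY KEY,
         title VARCHAR(255), img VARCHAR(255), url VARCHAR(255),
         director VARCHAR(255), `leading` VARCHAR(255),
         area VARCHAR(64), show_day VARCHAR(64), comment TEXT)""",
]
d = db()
for ddl in schema:
    d.cur.execute(ddl)
d.conn.commit()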
# -*- coding: utf-8 -*-
# model/log.py: minimal file-based logging helpers.
__author__ = 'ShengYue'
import logging


class log:
    @staticmethod
    def read(file):
        try:
            fopen = open(file, 'r')
            data = fopen.read()
            fopen.close()
            return data
        except:
            pass

    @staticmethod
    def write(file, logs):
        try:
            logging.basicConfig(level=logging.DEBUG,
                                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                                datefmt='%a, %d %b %Y %H:%M:%S', filename=file, filemode='w')
            logging.info(logs)
        except Exception, e:
            print '-----------', e
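One caveat worth spelling out: logging.basicConfig only configures the root logger on its first call, so later log.write calls cannot switch between ffdy.log and ffdy_error.log, and filemode='w' truncates the file it does open. A sketch of per-file loggers that keeps the same line format:

# Sketch: one logger per target file, configured once, same format as above.
import logging

def get_file_logger(path):
    logger = logging.getLogger(path)
    if not logger.handlers:  # configure each file's logger only once
        handler = logging.FileHandler(path)
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            '%a, %d %b %Y %H:%M:%S'))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger

get_file_logger('ffdy.log').info(u'site crawl finished')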
# -*- coding: utf-8 -*-
# model/match.py: clean fetched HTML and extract links/fields by rule.
__author__ = 'ShengYue'
import lxml
import lxml.etree
import lxml.html
from lxml.html.clean import Cleaner
import re


class match:
    '''
    Clean up the HTML and build an XPath-able tree
    '''
    def __init__(self, html, url):
        cleaner = Cleaner(style=True, scripts=True, page_structure=False, safe_attrs_only=False)
        html = cleaner.clean_html(html)
        del cleaner
        self.etree = lxml.html.fromstring(html)
        self.etree.make_links_absolute(base_url=url, resolve_base_href=True)

    '''
    Collect every link on the page that matches the link rules
    '''
    def get_all_links(self, link_match, url):
        links = []
        all_links = self.etree.xpath('//a')
        for match in link_match:
            regLink = re.compile(url + match.get('value'))
            for a in all_links:
                try:
                    href = a.get('href')
                except:
                    continue
                if regLink.match(href) is not None:
                    links.append(href)
        del all_links
        return links

    '''
    Extract every configured field from the page
    '''
    def get_match_info(self, match, url=None):
        try:
            data = {}
            for param in match:
                name = param.get('name')
                ntree = lxml.html.fromstring(lxml.etree.tostring(param))
                node = ntree.xpath('//parsers/parser')[0]
                xpath = node.get('xpath')
                infoxpath = self.etree.xpath(xpath)
                try:
                    nodetype = node.get('type')
                    if nodetype == 'text':
                        data[name] = infoxpath[0].strip()
                    elif nodetype == 'array':
                        arr = []
                        for item in infoxpath:
                            if item.strip() == '':
                                continue
                            arr.append(item.strip())
                        data[name] = arr
                    elif nodetype == 'pageurl':
                        data[name] = url
                    elif nodetype == 'html':
                        # Serialize the matched node, then strip all tags except a small whitelist.
                        infohtml = lxml.etree.tostring(infoxpath[0], encoding="utf-8", method="html")
                        infohtml = infohtml.strip()
                        reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b|\bspan\b|\bimg\b))+\b\s*[^>]*>|[\s\r\n\t]+')
                        infohtml = reg.sub(' ', infohtml).strip()
                        data[name] = infohtml
                except:
                    data[name] = ''
                    print name, u'could not be extracted'
                    continue
        except:
            print xpath, u'could not be extracted'
        return data

    def match_tiantang(self, match, url):
        # Variant of get_match_info with a smaller tag whitelist.
        try:
            data = {}
            for param in match:
                name = param.get('name')
                ntree = lxml.html.fromstring(lxml.etree.tostring(param))
                node = ntree.xpath('//parsers/parser')[0]
                xpath = node.get('xpath')
                infoxpath = self.etree.xpath(xpath)
                try:
                    nodetype = node.get('type')
                    if nodetype == 'text':
                        data[name] = infoxpath[0].strip()
                    elif nodetype == 'array':
                        arr = []
                        for item in infoxpath:
                            if item.strip() == '':
                                continue
                            arr.append(item.strip())
                        data[name] = arr
                    elif nodetype == 'pageurl':
                        data[name] = url
                    elif nodetype == 'html':
                        infohtml = lxml.etree.tostring(infoxpath[0], encoding="utf-8", method="html")
                        infohtml = infohtml.strip()
                        reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b))+\b\s*[^>]*>|[\s\r\n\t]+')
                        infohtml = reg.sub(' ', infohtml).strip()
                        data[name] = infohtml
                except:
                    data[name] = ''
                    print name, u'could not be extracted'
                    continue
        except:
            #log.write('system.log', xpath + u' could not be extracted')
            print xpath, u'could not be extracted'
        return data

    '''
    Extract every configured field, tolerating empty XPath results
    '''
    def get_match_info_test(self, match, url=None):
        try:
            data = {}
            for param in match:
                name = param.get('name')
                ntree = lxml.html.fromstring(lxml.etree.tostring(param))
                node = ntree.xpath('//parsers/parser')[0]
                xpath = node.get('xpath')
                infoxpath = self.etree.xpath(xpath)
                try:
                    nodetype = node.get('type')
                    if nodetype == 'text':
                        if infoxpath != []:
                            data[name] = infoxpath[0].strip()
                        else:
                            data[name] = ''
                    elif nodetype == 'array':
                        arr = []
                        if infoxpath == []:
                            data[name] = arr
                        else:
                            for item in infoxpath:
                                if item.strip() == '':
                                    continue
                                arr.append(item.strip())
                            data[name] = arr
                    elif nodetype == 'pageurl':
                        data[name] = url
                    elif nodetype == 'html':
                        if infoxpath == []:
                            data[name] = ''
                        else:
                            infohtml = lxml.etree.tostring(infoxpath[0], encoding="utf-8", method="html")
                            infohtml = infohtml.strip()
                            reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b))+\b\s*[^>]*>|[\s\r\n\t]+')
                            infohtml = reg.sub(' ', infohtml).strip()
                            data[name] = infohtml
                except:
                    data[name] = ''
                    print name, u'could not be extracted'
                    continue
        except:
            #log.write('system.log', xpath + u' could not be extracted')
            print xpath, u'could not be extracted'
        return data

    # print self.etree.xpath('//h1/text()')[0]
    # print self.etree.xpath('//h1/em/text()')[0]
    # com = self.etree.xpath("//div[@class='filmcontents']/node()/text()|//div[@class='filmcontents']/text()")
    # s = ''
    # for c in com:
    #     s = s + c
    # print s
    # # Rule study
    # d = self.etree.xpath(u"//div[@class='detail_intro']/table/tr/td[text()='上映日期:']/../td[last()]/text()")
    # print d[0]
    def close(self):
        del self.etree
Mon, 08 Apr 2013 22:17:20 log.py[line:25] INFO site crawl finished
# mysetup.py
from distutils.core import setup
import py2exe

setup(options={"py2exe": {"dll_excludes": ["MSVCP90.dll", 'lxml.dll']}},
      windows=[{"script": "frame.py"}])
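py2exe plugs into distutils as a build command, so freezing the GUI is the standard invocation:

# Build, from a shell:
#   python mysetup.py py2exe
# The frozen frame.exe (plus its DLLs) lands under .\dist.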
# -*- coding: utf-8 -*-
# spiderling.py: the crawler itself, driven by the cate.xml configuration.
__author__ = 'ShengYue'
from lxml import etree
from model.db import db
from model.curl import curl
from model.match import match
from model.log import log
import re
import time

'''
Spider
'''
class spiderling:
    def __init__(self, config):
        self.i = 0
        try:
            configtree = etree.ElementTree(file=config)
            # Read the site attributes
            sites = configtree.xpath('//site')
            site = sites[0]
            self.url = site.get('url')
            self.site_name = site.get('siteName')
            self.daily = float(site.get('daily'))
            self.log = site.get('log')
            self.errlog = site.get('error')
            self.linkRule = configtree.xpath('//linkRules/rule')
            self.infoUrlRule = configtree.xpath('//urlRules/rule')
            self.infoRule = configtree.xpath('//targets/target/model/field')
        except:
            log.write('error.log', u'failed to read the config file')
        self.db = db()

    def run(self, url):
        # Crawl delay; note that run() recurses once per page, so a very long
        # crawl can eventually hit Python's recursion limit.
        time.sleep(self.daily)
        if url is None:
            info = self.db.get_url(self.site_name)
            if info is None:
                log.write(self.log, u'site crawl finished')
                return 0
            self.db.update_url(info[0])
            url = info[1]
        gurl = curl()
        html = gurl.read(url)
        try:
            if html.strip() == '':
                return self.run(None)  # skip empty pages
        except Exception, e:
            log.write(self.log, url + u' failed to fetch html: ' + str(e))
            return self.run(None)
        self.xtree = match(html, url)
        links = self.xtree.get_all_links(self.linkRule, self.url)
        '''Persist the harvested links'''
        for link in links:
            if self.db.check_url(link) == 0:
                self.db.add_url(link, self.site_name)
        '''If the current link is a detail page, extract the configured fields'''
        regInfoLink = re.compile(self.infoUrlRule[0].get('value'))
        if regInfoLink.match(url) is not None:
            self.i = self.i + 1
            data = self.xtree.get_match_info(self.infoRule, url)
            self.db.addData(data)
            #
            # file_object = open(str(self.i) + 'id.txt', 'w')
            # file_object.write(json.dumps(data))
            # file_object.close()
            #
            #print json.dumps(data)
        else:
            print u'not a detail page, skipping parse'
        self.run(None)

    def close(self):
        try:
            self.xtree.close()
        except:
            pass
        try:
            self.db.close()
        except:
            pass

sp = spiderling('cate.xml')
sp.run(sp.url)
#sp.run('http://www.ffdy.cc/movie/35622.html')
sp.close()
#import sqlite3  # sqlite experiment, kept commented out
#cx = sqlite3.connect("d:\\test.db")
#cu = cx.cursor()
##cu.execute("""create table catalog ( id integer primary key, pid integer, name varchar(10) UNIQUE )""")
##cu.execute(u"insert into catalog values(2, 0, '哈哈')")
##cu.execute(u"insert into catalog values(3, 0, '我是中国')")
##cx.commit()
#cu.execute("select * from catalog")
#d = cu.fetchall()
#for s in d:
#    print s[2]
#cu.close()
#cx.close()
# -*- coding: utf-8 -*-
# Test variant of the spider: dumps the matched HTML instead of storing it.
__author__ = 'ShengYue'
from lxml import etree
from model.db import db
from model.curl import curl
from model.match import match
import re
import time
import lxml
import lxml.etree

'''
Spider (test build)
'''
class spiderling:
    def __init__(self, config):
        self.i = 0
        configtree = etree.ElementTree(file=config)
        site = configtree.xpath('//site')
        self.url = site[0].get('url')
        self.site_name = site[0].get('siteName')
        self.linkRule = configtree.xpath('//linkRules/rule')
        self.infoUrlRule = configtree.xpath('//urlRules/rule')
        self.infoRule = configtree.xpath('//targets/target/model/field')
        self.db = db()

    def run(self, url):
        time.sleep(0.3)
        if url is None:
            info = self.db.get_url(self.site_name)
            if info is None:
                print u'crawl finished'
                return 0
            self.db.update_url(info[0])
            url = info[1]
        gurl = curl()
        html = gurl.read(url)
        try:
            if html.strip() == '':
                return self.run(None)
        except:
            return self.run(None)
        self.xtree = match(html, url)
        # Debug: print the synopsis block before and after tag stripping, then stop.
        d = self.xtree.etree.xpath("//div[@class='filmcontents']")
        sd = etree.tostring(d[0], encoding="utf-8", method="html")
        sd = sd.strip()
        print sd
        print '================================='
        reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b))+\b\s*[^>]*>|[\s\r\n\t]+')
        ds = reg.sub(' ', sd).strip()
        print ds
        return
        # Unreachable while the early return above is in place.
        links = self.xtree.get_all_links(self.linkRule, self.url)
        '''Persist the harvested links'''
        for link in links:
            if self.db.check_url(link) == 0:
                self.db.add_url(link, self.site_name)
        '''If the current link is a detail page, extract the configured fields'''
        regInfoLink = re.compile(self.infoUrlRule[0].get('value'))
        if regInfoLink.match(url) is not None:
            self.i = self.i + 1
            print u'detail page, parsing', str(self.i)
            data = self.xtree.get_match_info_test(self.infoRule, url)
            print u'inserting data', url
            self.db.addData(data)
        else:
            print u'not a detail page, skipping parse'
        self.run(None)

    def close(self):
        self.xtree.close()
        self.db.close()

#sp = spiderling('cate.xml')
#sp.run(sp.url)
#sp.run('http://www.ffdy.cc/movie/35622.html')
#sp.close()

# Standalone test against a dytt8.net detail page.
url = 'http://www.dytt8.net/html/gndy/dyzz/20130407/41866.html'
curls = curl()
html = curls.read(url, {})
xtree = match(html, url)
content = xtree.etree.xpath('//div[@id="Zoom"]')
infohtml = lxml.etree.tostring(content[0], encoding="utf-8", method="html")
infohtml = infohtml.strip()
reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b|\bspan\b|\bimg\b|\ba\b))+\b\s*[^>]*>')
infohtml = reg.sub(' ', infohtml).strip()
pattern = re.compile(u'◎年  代 ([^<]*)')  # matches the "year" line in the page text
ds = pattern.search(html)
print
if ds is None:  # search() returns None on no match, not []
    print u'not found'
else:
    print ds.group(0)
print infohtml