- change `thread_id` in the code and run it via `scrapy runspider -L INFO crawler.py`
- change the filebase and the "dump" cell (see `# write your own query here`; a query sketch follows the spider code below)
- run all cells
```python
#!scrapy runspider -L INFO
import scrapy
import logging
import html2text
import urllib.parse
import json
import time


class BlogSpider(scrapy.Spider):
    thread_id = 2501134746
    name = 'blogspider'
    start_urls = ['https://tieba.baidu.com/p/%d' % thread_id]
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'FEED_URI': 'thread_%d.json' % thread_id,
        # DEPTH_PRIORITY = 1 plus the two FIFO queues below switch the crawl
        # from Scrapy's default depth-first order to breadth-first order.
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'FEED_FORMAT': 'jsonlines',
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    # html2text converter used to strip markup from extracted HTML fragments
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True

    def parse(self, response):
        # total number of pages in the thread, taken from the thread header
        reply_page_num = int(response.css(
            "div.l_thread_info > ul > li.l_reply_num > span:nth-child(2)::text"
        ).extract()[0])
        logging.info("max page number: %d" % reply_page_num)
        url_parts = urllib.parse.urlparse(response.url)
        for i in range(reply_page_num, 0, -1):
            # rebuild the thread URL with the page number in the 'pn' query parameter
            url_query = urllib.parse.parse_qs(url_parts.query)
            url_query.update({'pn': i})
            request_url = url_parts._replace(
                query=urllib.parse.urlencode(url_query, doseq=True)).geturl()
            yield scrapy.Request(request_url, callback=self.parse_page)

    def parse_page(self, response):
        if response.status != 200:
            logging.warning("crawled %d '%s'" % (response.status, response.url))
        # current page number, taken from the highlighted pager entry
        page_num = int(self.h.handle(
            response.css('li.l_pager > span.tP').extract_first()).strip())
        posts = response.css('#j_p_postlist > div.l_post')
        for post in posts:
            # each post carries its metadata as JSON in its data-field attribute
            data = json.loads(post.xpath('@data-field').extract_first())
            # author = self.h.handle(post.css('div.d_author li.d_name .p_author_name').extract_first()).strip()
            level = int(self.h.handle(
                post.css('div.d_author li.l_badge .d_badge_lv').extract_first()).strip())
            # order = re.search(r"(\d+)" + "\u697c", self.h.handle(post.css('div.d_post_content_main div.core_reply'))).group(1)
            # lzl = post.css('div.core_reply_wrapper li.lzl_single_post')
            data['author']['level'] = level
            data['timestamp'] = time.time()
            data['page_num'] = page_num
            yield data
```
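Once the crawl finishes, the feed exporter leaves one JSON object per line in `thread_<thread_id>.json`. Below is a minimal sketch of the notebook's "dump"/query step; the filebase assumes the example `thread_id`, and the query only relies on the fields the spider itself adds (`author.level`, `page_num`, `timestamp`) — the remaining keys come from Tieba's `data-field` attribute and may vary.

```python
import json

filebase = 'thread_2501134746'  # hypothetical: match FEED_URI from the spider

# jsonlines feed: one JSON object (one post) per line
with open(filebase + '.json', encoding='utf-8') as f:
    posts = [json.loads(line) for line in f]

# write your own query here -- e.g. posts from authors at badge level >= 10
high_level = [p for p in posts if p['author']['level'] >= 10]
print('%d of %d posts are from level-10+ authors' % (len(high_level), len(posts)))
```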