Skip to content

Instantly share code, notes, and snippets.

@Dr-ZeeD
Created October 23, 2016 22:11
Show Gist options
  • Select an option

  • Save Dr-ZeeD/be6cf5975ecf5d1e6044351c93686ba6 to your computer and use it in GitHub Desktop.

Select an option

Save Dr-ZeeD/be6cf5975ecf5d1e6044351c93686ba6 to your computer and use it in GitHub Desktop.
Scrapy one-file script with custom logging and click.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import click
from twisted.internet import reactor
from scrapy import Item, Spider
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings
from scrapy.utils.log import configure_logging
class MyItem(Item):
    """Container describing one scraped record; fields to be declared as needed."""
class MySpider(Spider):
    """The Spider.

    BUG FIX: Scrapy requires every spider class to define a unique ``name``;
    without one, scheduling the spider with ``CrawlerRunner.crawl`` fails at
    runtime. Adding the attribute is backward-compatible.
    """

    name = 'my_spider'
class MyPipeline:
    """Item pipeline that receives each item produced by the spider."""

    # Named logger so the pipeline's output can be configured independently
    # of the spider and framework loggers (see setup_loggers in this file).
    logger = logging.getLogger('my_pipeline')

    def process_item(self, item, spider):
        """Return *item* unchanged (placeholder for real processing).

        BUG FIX: the hook Scrapy invokes on a pipeline is ``process_item``
        (singular); the original ``process_items`` would never be called by
        the framework, so items silently bypassed the pipeline.
        """
        return item

    # Backward-compatible alias for any code that called the old misspelled name.
    process_items = process_item
def setup_loggers(loggers, log_level, logging_handlers):
    """Apply *log_level* to every logger and attach each handler to it."""
    for target in loggers:
        target.setLevel(log_level)
        for handler in logging_handlers:
            target.addHandler(handler)
@click.command()
@click.option('-v', '--verbose', count=True, help='Log verbose.')
def main(verbose):
    """Run my spider, with log verbosity controlled by repeated -v flags."""
    # Take logging over from Scrapy so our own handlers are the only ones.
    configure_logging(install_root_handler=False)

    if verbose >= 2:
        module_log_level = logging.DEBUG
        spider_log_level = logging.DEBUG
        pipeline_log_level = logging.DEBUG
    elif verbose == 1:
        module_log_level = logging.INFO
        spider_log_level = logging.INFO
        pipeline_log_level = logging.INFO
    else:
        # Default: keep framework noise down, but still show spider/pipeline info.
        module_log_level = logging.WARNING
        spider_log_level = logging.INFO
        pipeline_log_level = logging.INFO

    logging_handlers = [
        logging.StreamHandler()
    ]
    formatter = logging.Formatter(
        '[%(asctime)s] [%(levelname)-8s] %(name)s - %(message)s',
        datefmt='%H:%M:%S')
    # BUG FIX: the original looped over an undefined name `handlers`
    # (NameError); the handlers created above are in `logging_handlers`.
    for handler in logging_handlers:
        handler.setFormatter(formatter)

    module_loggers = [
        logging.getLogger('scrapy'),
        logging.getLogger('twisted')
    ]
    setup_loggers(module_loggers, module_log_level, logging_handlers)

    # BUG FIX: Spider.logger is an instance-level property in Scrapy, so
    # `MySpider.logger` on the class yields the property object, not a Logger,
    # and setLevel/addHandler would fail on it. Configure the spider's logger
    # by name instead (Scrapy names a spider's logger after the spider name).
    spider_loggers = [
        logging.getLogger('my_spider')
    ]
    setup_loggers(spider_loggers, spider_log_level, logging_handlers)

    pipeline_loggers = [
        MyPipeline.logger
    ]
    setup_loggers(pipeline_loggers, pipeline_log_level, logging_handlers)

    # Register the pipeline by its import path; priority 1 runs it first.
    settings = Settings({
        'ITEM_PIPELINES': {
            '__main__.MyPipeline': 1,
        }
    })
    runner = CrawlerRunner(settings)
    runner.crawl(MySpider)
    d = runner.join()
    # Stop the reactor once all crawls have finished, success or failure.
    d.addBoth(lambda _: reactor.stop())
    reactor.run()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment