def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)
def run(self, args, opts):
    if len(args) < 1:
        raise UsageError()
    elif len(args) > 1:
        raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
    spname = args[0]
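# Illustrative sketch (not part of the excerpts above): scrapy.cmdline.execute()
# can also be driven programmatically with an explicit argv, which is equivalent
# to running "scrapy crawl example" from the shell inside a project. The spider
# name "example" is a placeholder; passing more than one spider name would make
# the crawl command above raise UsageError.
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'example'])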
class CrawlerProcess(CrawlerRunner):
    """
    A class to run multiple Scrapy crawlers in a process simultaneously.

    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
    for starting a Twisted `reactor`_ and handling shutdown signals, like the
    keyboard interrupt command Ctrl-C. It also configures top-level logging.
    """
    def _signal_shutdown(self, signum, _):
        install_shutdown_handlers(self._signal_kill)
        signame = signal_names[signum]
        logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
                    {'signame': signame})
        reactor.callFromThread(self._graceful_stop_reactor)
    def start(self, stop_after_crawl=True):
        """
        This method starts a Twisted `reactor`_, adjusts its pool size to
        :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
        on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

        If `stop_after_crawl` is True, the reactor will be stopped after all
        crawlers have finished, using :meth:`join`.

        :param boolean stop_after_crawl: whether to stop the reactor once all
            crawlers have finished
        """
        if stop_after_crawl:
            d = self.join()
            # Don't start the reactor if the deferreds are already fired
            if d.called:
                return
            d.addBoth(self._stop_reactor)
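# Illustrative sketch (separate script, not part of the module above): the usual
# way to drive CrawlerProcess from plain Python code. ExampleSpider is a made-up
# minimal spider used only for this demonstration.
import scrapy
from scrapy.crawler import CrawlerProcess


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {'title': response.css('title::text').extract_first()}


process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(ExampleSpider)  # schedule the spider; returns a Deferred
process.start()               # start the reactor; blocks until the crawl finishes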
class CrawlerRunner(object):
    """
    This is a convenient helper class that keeps track of, manages and runs
    crawlers inside an already set up Twisted `reactor`_.
    """
    crawlers = property(
        lambda self: self._crawlers,
        doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
            ":meth:`crawl` and managed by this class."
    )
    @property
    def spiders(self):
        warnings.warn("CrawlerRunner.spiders attribute is renamed to "
                      "CrawlerRunner.spider_loader.",
                      category=ScrapyDeprecationWarning, stacklevel=2)
        return self.spider_loader
    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        """
        Run a crawler with the provided arguments.

        It will call the given Crawler's :meth:`~Crawler.crawl` method, while
        keeping track of it so it can be stopped later.

        If `crawler_or_spidercls` isn't a :class:`~scrapy.crawler.Crawler`
        instance, this method will try to create one using this parameter as
        the spider class given to it.

        Returns a deferred that is fired when the crawling is finished.

        :param crawler_or_spidercls: already created crawler, or a spider class
            or spider's name inside the project to create it
        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
            :class:`~scrapy.spiders.Spider` subclass or string

        :param list args: arguments to initialize the spider

        :param dict kwargs: keyword arguments to initialize the spider
        """
        crawler = self.create_crawler(crawler_or_spidercls)
        return self._crawl(crawler, *args, **kwargs)
    def _crawl(self, crawler, *args, **kwargs):
        self.crawlers.add(crawler)
        d = crawler.crawl(*args, **kwargs)
        self._active.add(d)

        def _done(result):
            self.crawlers.discard(crawler)
            self._active.discard(d)
            return result

        return d.addBoth(_done)
    def create_crawler(self, crawler_or_spidercls):
        """
        Return a :class:`~scrapy.crawler.Crawler` object.

        * If `crawler_or_spidercls` is a Crawler, it is returned as-is.
        * If `crawler_or_spidercls` is a Spider subclass, a new Crawler
          is constructed for it.
        * If `crawler_or_spidercls` is a string, this function finds
          a spider with this name in a Scrapy project (using spider loader),
          then creates a Crawler instance for it.
        """
        if isinstance(crawler_or_spidercls, Crawler):
            return crawler_or_spidercls
        return self._create_crawler(crawler_or_spidercls)
    def stop(self):
        """
        Simultaneously stops all the crawling jobs taking place.

        Returns a deferred that is fired when they have all ended.
        """
        return defer.DeferredList([c.stop() for c in list(self.crawlers)])
    @defer.inlineCallbacks
    def join(self):
        """
        join()

        Returns a deferred that is fired when all managed :attr:`crawlers` have
        completed their executions.
        """
        while self._active:
            yield defer.DeferredList(self._active)
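# Illustrative sketch (separate script, not part of the class above): running two
# crawls with CrawlerRunner inside an externally managed Twisted reactor, then
# using join() to stop the reactor once both have finished. The spiders below are
# made-up minimal examples.
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class BlogSpider(scrapy.Spider):
    name = 'blog'
    start_urls = ['https://example.com/blog']

    def parse(self, response):
        yield {'url': response.url}


class ShopSpider(scrapy.Spider):
    name = 'shop'
    start_urls = ['https://example.com/shop']

    def parse(self, response):
        yield {'url': response.url}


configure_logging()
runner = CrawlerRunner()
runner.crawl(BlogSpider)   # each crawl() call returns a Deferred tracked by the runner
runner.crawl(ShopSpider)
d = runner.join()          # fires once every managed crawler has completed
d.addBoth(lambda _: reactor.stop())
reactor.run()              # the caller owns the reactor when using CrawlerRunner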