My first spider (s1) collects a list of ids, e.g. [1,2,3,4,5]. In a custom extension attached to s1 I compare that list with an existing list (self.old_products = [1,2,3,6,7]) and compute the difference, diff. I then pass diff to my second spider (s2) so it can process the values in diff. However, I cannot get the second spider to yield its requests. If I comment out the yield request in spider2's start_requests method, it does print(item_id) correctly. Thanks.
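
To make the comparison concrete, the diff is just a set difference over the two example lists (a minimal sketch):

old_products = [1, 2, 3, 6, 7]   # existing list kept by the extension
scraped_ids  = [1, 2, 3, 4, 5]   # ids collected by s1
diff = set(old_products) - set(scraped_ids)
print(diff)  # {6, 7} -> the two items s2 should double check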

My spiders:

import scrapy
from missing.items import MissingItem

class S1Spider(scrapy.Spider):
    name = 's1'

    custom_settings = {
        'EXTENSIONS': {
            'missing.test_ext.ext1': 500,
        }
    }

    def start_requests(self):
        item_ids = [1,2,3,4,5]
        for item_id in item_ids:
            url = 'http://quotes.toscrape.com/page/{}/'.format(item_id)
            request = scrapy.Request(url, callback=self.parse)
            request.meta['item_id'] = item_id

            yield request


    def parse(self, response):
        item = MissingItem()
        item['item_id'] = response.meta['item_id']
        yield item

class S2Spider(scrapy.Spider):
    name = 's2'

    # custom_settings = {
    #     'EXTENSIONS': {
    #         'missing.test_ext.ext2': None,
    #     }
    # }

    def __init__(self, item_ids=None, *args, **kwargs):
        super(S2Spider, self).__init__(*args, **kwargs)
        self.item_ids = item_ids
        self.logger.info('Number of items to double check: %s', len(self.item_ids))

    def start_requests(self):
        self.logger.info('Double check begin')
        for item_id in self.item_ids:
            print(item_id)
            url = 'http://quotes.toscrape.com/page/0{}/'.format(item_id)
            request = scrapy.Request(url, callback=self.parse)
            request.meta['item_id'] = item_id

            yield request

    def parse(self, response):
        item = MissingItem()
        item['item_id'] = response.meta['item_id']
        yield item
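
For completeness, missing.items.MissingItem is not shown above; it is assumed to look roughly like this (only the item_id field is used):

import scrapy

class MissingItem(scrapy.Item):
    # the only field either spider fills in
    item_id = scrapy.Field()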

My custom extensions:

from scrapy import signals
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from missing.spiders.missing_spider import S2Spider

class ext1(object):

    def __init__(self):
        self.old_products = [1,2,3,6,7]
        self.new_products = []

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()

        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        return ext

    def item_scraped(self, item):
        self.new_products.append(item['item_id'])
        print('item scraped: {}'.format(item['item_id']))

    def spider_closed(self):
        print('Number of products scraped: {}'.format(len(self.new_products)))
        diff = set(self.old_products) - set(self.new_products)
        print('Diff: {}'.format(diff))

        if len(diff):
            runner = CrawlerRunner(get_project_settings())

            d = runner.crawl(S2Spider, item_ids=list(diff))
            d.addBoth(lambda _: reactor.stop())
            # reactor.run()

class ext2(object):

    def __init__(self):
        self.new_products = []

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()

        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        return ext

    def item_scraped(self, item):
        self.new_products.append(item['item_id'])
        print('item s2 scraped: {}'.format(item['item_id']))

    def spider_closed(self):
        print('Number of products s2 scraped: {}'.format(len(self.new_products)))
        print(self.new_products)
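
For reference, what the spider_closed handler above tries to reproduce is essentially the documented CrawlerRunner pattern for chaining crawls from a standalone script. A minimal sketch, assuming the project layout above (the hard-coded item_ids is only illustrative, since in my case the diff is not known until s1 has finished):

# run_both.py -- illustrative sketch, not part of the project
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from missing.spiders.missing_spider import S1Spider, S2Spider

configure_logging()
runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl():
    # first crawl: s1 (its extension records which ids were scraped)
    yield runner.crawl(S1Spider)
    # second crawl: s2, fed the ids that still need checking
    yield runner.crawl(S2Spider, item_ids=[6, 7])  # placeholder for the real diff
    reactor.stop()

crawl()
reactor.run()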

My spider2 log:

2017-04-24 07:14:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole']
2017-04-24 07:14:16 [s2] INFO: Number of items to double check: 2
2017-04-24 07:14:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-04-24 07:14:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2017-04-24 07:14:16 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2017-04-24 07:14:16 [scrapy.core.engine] INFO: Spider opened
2017-04-24 07:14:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-04-24 07:14:16 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6024
2017-04-24 07:14:16 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1347,
 'downloader/request_count': 6,
 'downloader/request_method_count/GET': 6,
 'downloader/response_bytes': 12648,
 'downloader/response_count': 6,
 'downloader/response_status_count/200': 5,
 'downloader/response_status_count/404': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2017, 4, 24, 7, 14, 16, 614533),
 'item_scraped_count': 5,
 'log_count/DEBUG': 13,
 'log_count/INFO': 14,
 'response_received_count': 6,
 'scheduler/dequeued': 5,
 'scheduler/dequeued/memory': 5,
 'scheduler/enqueued': 5,
 'scheduler/enqueued/memory': 5,
 'start_time': datetime.datetime(2017, 4, 24, 7, 14, 15, 219602)}
2017-04-24 07:14:16 [scrapy.core.engine] INFO: Spider closed (finished)
2017-04-24 07:14:16 [s2] INFO: Double check begin