首页 文章

Scrapy不使用mongodb?

提问于
浏览
0
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.http import TextResponse
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from selenium import webdriver
import time


class Product(scrapy.Item):
    """Container for one scraped Snapdeal product review."""
    title = scrapy.Field()          # headline of the review
    link = scrapy.Field()           # link field (not populated by the spider below)
    data = scrapy.Field()           # generic payload field (not populated by the spider below)
    name_reviewer = scrapy.Field()  # display name of the reviewer
    date = scrapy.Field()           # review date (extraction currently commented out in spider)
    model_name = scrapy.Field()     # product/model name taken from the page header
    rating = scrapy.Field()         # star rating (extraction currently commented out in spider)
    review = scrapy.Field()         # review body text
    url_print = scrapy.Field()      # URL field (not populated by the spider below)


class FooSpider(CrawlSpider):
    """Crawl Snapdeal product review pages and emit one Product per review.

    Review listings are paginated; ``start_urls`` enumerates pages 1-24 of
    the reviews for every product URL listed in ``ids``.
    """
    name = "snap_reviews"
    allowed_domains = ["snapdeal.com"]

    # Build the paginated review URLs at class-definition time.
    ids = ['http://www.snapdeal.com/product/micromax-a114-canvas-22-black/1485635784']
    url = []
    for id in ids:
        for i in range(1, 25):
            url.append(id + '/reviews?page=' + str(i) + '&vsrc=rcnt')
    start_urls = url

    def __init__(self, *args, **kwargs):
        super(FooSpider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        # Selenium driver, presumably to render JS-loaded reviews —
        # note parse() below extracts from the Scrapy response, so the
        # rendered page_source is currently unused (TODO confirm intent).
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(60)  # wait up to 60s for elements to appear

    def parse(self, response):
        """Parse one review page and return the list of Product items on it.

        :param response: Scrapy response for one ``/reviews?page=N`` URL.
        :returns: list of Product items, one per review block on the page.
        """
        self.browser.get(response.url)
        sites = response.xpath('//div[@class="reviewareain"]/div/div')
        items = []

        model = response.xpath('//span[contains(@class,"section-head customer_review_tab")]/text()').extract()[0].lstrip()
        for site in sites:
            item = Product()
            item['model_name'] = model
            item['name_reviewer'] = site.xpath('.//span[contains(@class,"_reviewUserName")]/text()').extract()[0]
            item['review'] = site.xpath('.//div[contains(@class, "user-review")]/p/text()').extract()[0]
            item['title'] = site.xpath('.//div[contains(@class,"user-review")]/div[@class="head"]/text()').extract()[0]
            items.append(item)
        # BUG FIX: `return items` was indented inside the loop, so at most
        # the first review of every page was emitted. Return after the loop.
        return items

管道

import pymongo
from pymongo import MongoClient
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):
    """Scrapy item pipeline that writes every scraped item to MongoDB.

    Connection parameters (host, port, database, collection) come from the
    project settings (MONGODB_* keys defined in settings.py).
    """

    def __init__(self):
        # One client/collection per pipeline instance, resolved from settings.
        connection = pymongo.MongoClient(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]
        # BUG FIX: removed a stray, misindented class-body line
        # (`self.collection.get(type(item)).insert(...)`) that referenced an
        # undefined `item` and would raise at import/definition time.

    def process_item(self, item, spider):
        """Insert *item* into the configured collection and pass it on.

        :param item: the scraped scrapy Item.
        :param spider: the spider that produced the item (unused).
        :returns: the item, so later pipelines can process it.
        """
        # BUG FIX: pymongo's insert mutates the document to add an '_id'
        # key; a scrapy Item does not accept that assignment, raising
        # "TypeError: 'str' object does not support item assignment".
        # Converting to a plain dict first fixes the reported error.
        self.collection.insert(dict(item))
        log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION'],
            settings['MONGODB_HOST'],
            settings['MONGODB_PORT']))
        return item

设置

# -*- coding: utf-8 -*-

# Scrapy settings for snapdeal_review_13jul project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

# Project identity and spider discovery.
BOT_NAME = 'snapdeal_review_13jul'

SPIDER_MODULES = ['snapdeal_review_13jul.spiders']
NEWSPIDER_MODULE = 'snapdeal_review_13jul.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'snapdeal_review_13jul (+http://www.yourdomain.com)'

# Enable the MongoDB pipeline (priority 300) and point it at the target DB.
ITEM_PIPELINES = {'snapdeal_review_13jul.pipelines.MongoDBPipeline':300}
MONGODB_HOST = 'localhost' # Change in prod
MONGODB_PORT = 27017 # Change in prod
MONGODB_DATABASE = "snapdeal_reviews" # Change in prod
MONGODB_COLLECTION = "snap_r_1"
# Credentials are currently unused by MongoDBPipeline above; fill in for
# an authenticated deployment.
MONGODB_USERNAME = "" # Change in prod
MONGODB_PASSWORD = "" # Change in prod

错误:

Traceback (most recent call last):
  File "/usr/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
    return process_chain(self.methods[methodname], obj, *args)
  File "/usr/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
    d.callback(input)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 382, in callback
    self._startRunCallbacks(result)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 490, in _startRunCallbacks
    self._runCallbacks()
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/home/nikhil/Desktop/Scrapers/mouth/mouth/pipelines.py", line 22, in process_item
    self.collection.insert(item)
  File "/home/nikhil/.local/lib/python2.7/site-packages/pymongo/collection.py", line 1926, in insert
    check_keys, manipulate, write_concern)
  File "/home/nikhil/.local/lib/python2.7/site-packages/pymongo/collection.py", line 430, in _insert
    gen(), check_keys, self.codec_options, sock_info)
  File "/home/nikhil/.local/lib/python2.7/site-packages/pymongo/collection.py", line 405, in gen
    doc['_id'] = ObjectId()
exceptions.TypeError: 'str' object does not support item assignment

我的代码之前可以正常运行，但现在运行时会抛出上述错误。

1 回答

  • 0

    尝试将项目转换为dict,如下所示:

    self.collection.insert(dict(item))
    

    看看是否有效 .

相关问题