Почему scrapy не сохраняет данные в mongodb?

Мой основной файл:

import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request

class Product(scrapy.Item):
    """Container for the data scraped about a single review.

    The spider in this file fills ``name``, ``title``, ``date`` and ``link``
    from the review listing and ``data`` from the review's own page; the
    remaining fields are declared but not populated by the code shown here.
    """

    # Populated from the review listing page.
    name = scrapy.Field()
    title = scrapy.Field()  # was declared twice; the duplicate was redundant
    date = scrapy.Field()
    link = scrapy.Field()

    # Populated from the individual review page.
    data = scrapy.Field()

    # Declared for later use; not set by the visible spider code.
    brand = scrapy.Field()
    heading = scrapy.Field()
    Model_name = scrapy.Field()

class aqaqspider(CrawlSpider):
    """Scrape reviews from a MouthShut product page.

    ``parse_start_url`` reads every review in the listing, builds a
    ``Product`` from it and requests the review's own page;
    ``anchor_page`` then adds the review text and emits the finished item.
    """

    name = "mouth_shut_new"
    allowed_domains = ["mouthshut.com"]
    start_urls = [
        "http://www.mouthshut.com/mobile-phones/Yu-Yureka-reviews-925723476",
    ]

    # NOTE(review): the body of ``rules`` was cut off in the original listing.
    # The callback name is the one this spider defines; the "-page-"
    # pagination pattern is an assumption -- confirm it against the site.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(r'.*-page-.*',)),
            callback="parse_start_url",
            follow=True,
        ),
    )

    @staticmethod
    def _first(selection, default=''):
        """Return the first extracted value of *selection*, or *default* if empty."""
        values = selection.extract()
        return values[0] if values else default

    def parse_start_url(self, response):
        """Yield one request per review found in the listing page.

        Each request targets the review's own page and carries the partially
        filled ``Product`` in ``meta['item']`` for ``anchor_page`` to finish.

        Raises:
            CloseSpider: when the page contains no reviews at all.
        """
        # Imported locally because the module never imported ``urljoin``;
        # the fallback keeps the Python 2 code base working.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        products = response.xpath('//div[@id="allreviews"]/ul/li')
        if not products:
            raise CloseSpider("No more products!")

        for product in products:
            item = Product()
            #item['Model_name'] = product.xpath('/html/body/form/div[12]/div/div[5]/div/div[1]/div[3]/ul/li[1]/h1/a/span/text()').extract()
            # A missing node yields '' instead of an IndexError, so one
            # malformed review no longer aborts the rest of the page.
            item['name'] = self._first(product.xpath('.//li[@class="profile"]/div/a/span/text()'))
            item['title'] = self._first(product.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()'))
            item['date'] = self._first(product.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()'))
            link = self._first(product.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href'))
            if not link:
                # Without a link there is no review page to fetch.
                continue

            # urljoin leaves absolute URLs untouched and resolves relative ones.
            item['link'] = urljoin(response.url, link)
            yield scrapy.Request(item['link'],
                                 meta={'item': item},
                                 callback=self.anchor_page)

    def anchor_page(self, response):
        """Attach the review text to the item passed via ``meta`` and emit it."""
        item = response.request.meta['item']
        item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
        yield item

    # yield Request(url="http://www.mouthshut.com/Product/mobileListing.aspx?cid=925602729&f1=1&view=list&nsort1=0&nsort2=2015-06-01%2016:12:23.000&ntype=3&mpad=1&ran=0.3691624044781373&dcal=Intex%20Aqua%20Xtreme" ,
                      # headers={"Referer": "http://www.mouthshut.com/mobile-phones.php", "X-Requested-With": "XMLHttpRequest"},
                      # callback=self.parse, 
                      # dont_filter=True)

Мой settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for mouth project
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#     http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'mouth'

SPIDER_MODULES = ['mouth.spiders']
NEWSPIDER_MODULE = 'mouth.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mouth (+http://www.yourdomain.com)'

# Send every scraped item through the MongoDB pipeline.
ITEM_PIPELINES = {'mouth.pipelines.MongoDBPipeline': 300}

# Connection parameters read by the MongoDB pipeline.
MONGODB_HOST = 'localhost'  # Change in prod
MONGODB_PORT = 27017  # Change in prod
MONGODB_DATABASE = "mobiles_complaints"  # Change in prod
# The pipeline reads settings['MONGODB_COLLECTION'], which was not defined here.
# NOTE(review): "reviews" is a placeholder name -- confirm the real collection.
MONGODB_COLLECTION = "reviews"  # Change in prod
MONGODB_USERNAME = ""  # Change in prod
MONGODB_PASSWORD = ""  # Change in prod

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'consumer (+http://www.yourdomain.com)'

Мой pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from scrapy.conf import settings
from scrapy import log

class MongoDBPipeline(object):
    """Pipeline object holding a handle to the configured MongoDB collection."""

    def __init__(self):
        """Connect to MongoDB and select the collection named in the settings."""
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        database = pymongo.Connection(host, port)[settings['MONGODB_DATABASE']]
        self.collection = database[settings['MONGODB_COLLECTION']]

def process_item(self, item, spider):
    """Store *item* in ``self.collection``, log the destination, return the item.

    NOTE(review): this function is defined at module level rather than inside
    ``MongoDBPipeline``, so it is not a method of the pipeline class.
    """
    # The log line below reports a write, so the write has to happen first.
    self.collection.insert(dict(item))
    log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
        settings['MONGODB_DATABASE'],
        settings['MONGODB_COLLECTION'],
        settings['MONGODB_HOST'],
        settings['MONGODB_PORT']))
    return item

Я запустил scrapy crawl mouth_shut_new, но данные не сохранились в базе данных. В выводе должно быть сообщение о том, что данные записаны в MongoDB, с указанием названия коллекции. Чего мне не хватает?

person John Dene    schedule 08.06.2015    source источник

Ответы (2)

У process_item() неправильный отступ — метод должен находиться внутри класса:

class MongoDBPipeline(object):
    """Scrapy item pipeline that writes every scraped item to MongoDB."""

    def __init__(self):
        """Connect to MongoDB and select the collection named in the settings."""
        connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        """Insert *item* into the collection, log the destination, return the item.

        Defined as a method of the pipeline class (indented under it) so that
        it belongs to ``MongoDBPipeline`` rather than to the module.
        """
        # The log line below reports a write, so the write has to happen first.
        self.collection.insert(dict(item))
        log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION'],
            settings['MONGODB_HOST'],
            settings['MONGODB_PORT']))
        return item
person alecxe    schedule 08.06.2015
Еще раз спасибо @alecxe, сэр !! - person John Dene; 09.06.2015

Вы не возвращаете элемент из функции обратного вызова (callback="parse_start_url"). Это нужно сделать следующим образом:

def parse_start_ul(self, response):
    """Yield one ``Product`` per review element found in *response*.

    NOTE(review): the name looks like a typo for ``parse_start_url`` (the
    callback named in the surrounding text) -- confirm before relying on it.
    """
    # ``products`` was never defined in the sketch; select the review nodes
    # with the same XPath the question's spider uses.
    products = response.xpath('//div[@id="allreviews"]/ul/li')

    for product in products:
        item = Product()
        yield item

person user2927218    schedule 08.06.2015
Я уже использую yield Request, чтобы вызвать другую функцию, которая переходит по следующей ссылке. - person John Dene; 08.06.2015
О, я попробовал ваш код — на моем компьютере он отлично работает. Правда, я использую pymongo.MongoClient вместо pymongo.Connection. - person user2927218; 08.06.2015
Я также перешел на pymongo.MongoClient, но все еще не смог получить данные в БД - person John Dene; 08.06.2015