I'm getting an empty exported file with Scrapy
I'm trying to write a parser to crawl a site, but something is going wrong. Can you help me figure out what's wrong? I linked the spider with items.py.
import scrapy
from dyplom.items import DyplomtwoItem
class Dyplom(scrapy.Spider):
    """Spider from the question: walks recipe listing pages and scrapes each recipe.

    ``parse`` handles the listing pages in ``start_urls`` and schedules one
    request per recipe link; ``parse_dir_contents`` fills a ``DyplomtwoItem``
    from the recipe page.
    """
    name = "dyplom"
    # First listing page. Its query string sets tags[recipe_cuisine][] to a
    # percent-encoded value, i.e. the listing is filtered by cuisine.
    start_urls = ['https://www.edimdoma.ru/retsepty?tags%5Brecipe_cuisine%5D%5B%5D=%D0%B0%D0%BC%D0%B5%D1%80%D0%B8%D0%BA%D0%B0%D0%BD%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%83%D1%85%D0%BD%D1%8F&with_ingredient=&with_ingredient_condition=and&without_ingredient=&user_ids=&field=&direction=&query=']
    # Append listing pages 2..5. This loop runs once, at class-definition time.
    # NOTE(review): these URLs leave tags[recipe_cuisine][] empty, unlike the
    # first URL -- confirm whether the unfiltered listing is intended.
    for i in range(2, 6):
        start_urls.append("https://www.edimdoma.ru/retsepty?_=1529256600422"
                          "&direction=&field=&page=" + str(i) +
                          "&query=&tags%5Brecipe_cuisine%5D%5B%5D=&user"
                          "_ids=&with_ingredient=&without_ingredient=")

    def parse(self, response):
        """Yield a request for every link found inside an ``article`` card."""
        for href in response.xpath("//article[contains(@class, 'card')]/a//@href"):
            # The href is prefixed with the scheme and host, so it is assumed
            # to be site-relative (e.g. "/retsepty/...").
            url = "https://www.edimdoma.ru" + href.extract()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Build and yield one ``DyplomtwoItem`` from a recipe page.

        Assumes ``DyplomtwoItem`` declares the fields ``id``, ``title``,
        ``image`` and ``recipe``; it is defined in ``dyplom/items.py``, which
        is not shown here.
        """
        item = DyplomtwoItem()
        # Only the first matching href is kept; the [0] index raises IndexError
        # when the XPath matches nothing on the page.
        item['id'] = response.xpath("//div[contains(@class, 'button button_print')]"
                                    "//a[contains(@class, 'drop-down_item')]/@href").extract()[0]
        # The remaining fields store the full list of matches (possibly empty).
        item['title'] = response.xpath("//h1[contains(@class, 'recipe-header_name')]"
                                       "/descendant::text()").extract()
        item['image'] = response.xpath("//div[contains(@class, 'content-media')]/img//@src").extract()
        item['recipe'] = response.xpath("//div[contains(@class, 'content-box_content')]/div[contains"
                                        "(@class, 'plain-text recipe_step_text')]/descendant::text()").extract()
        yield item
Solution 1:[1]
I included the class from your items in the scraper so that I could dissect what you did. It's all essentially the same as your items.py.
It turned out that you had a few issues with your selectors, and you weren't selecting all of the text. You needed to call getall() on the recipe selector instead of extract().
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from itemloaders import ItemLoader
#from dyplom.items import DyplomtwoItem
class DyplomItem(scrapy.Item):
    """Container for the data scraped from a single recipe page.

    ``id``, ``title`` and ``image`` carry a ``TakeFirst`` output processor,
    so an item loader reduces each of them to one value. ``recipe`` has no
    output processor and keeps every collected value as a list.
    """

    # Field names double as the item's keys; the declaration order is kept
    # as in the original so exported output is unaffected.
    id = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    image = Field(output_processor=TakeFirst())
    recipe = Field()
class Dyplom(scrapy.Spider):
    """Crawl edimdoma.ru recipe listings and emit one ``DyplomItem`` per recipe.

    ``parse`` handles the listing pages in ``start_urls`` and follows every
    recipe link; ``parse_dir_contents`` extracts the recipe data through an
    ``ItemLoader``.
    """

    name = "dyplom"

    # Listing page 1 is filtered by cuisine: tags[recipe_cuisine][] holds the
    # percent-encoded Cyrillic for "American cuisine".
    # NOTE(review): pages 2-5 leave tags[recipe_cuisine][] empty, so they are
    # presumably unfiltered listings -- confirm that this is intended.
    # The pages are built with a list expression rather than a class-body
    # ``for`` loop so no stray loop variable is left behind as a class attribute.
    start_urls = [
        'https://www.edimdoma.ru/retsepty?tags%5Brecipe_cuisine%5D%5B%5D=%D0%B0%D0%BC%D0%B5%D1%80%D0%B8%D0%BA%D0%B0%D0%BD%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%83%D1%85%D0%BD%D1%8F&with_ingredient=&with_ingredient_condition=and&without_ingredient=&user_ids=&field=&direction=&query=',
    ] + [
        "https://www.edimdoma.ru/retsepty?_=1529256600422"
        f"&direction=&field=&page={page}"
        "&query=&tags%5Brecipe_cuisine%5D%5B%5D=&user"
        "_ids=&with_ingredient=&without_ingredient="
        for page in range(2, 6)
    ]

    def parse(self, response):
        """Follow every recipe link found on a listing page.

        Yields one request per ``href`` found under an ``article`` card, with
        ``parse_dir_contents`` as the callback.
        """
        hrefs = response.xpath("//article[contains(@class, 'card')]/a//@href").getall()
        for href in hrefs:
            # response.follow resolves the href against the page URL, so both
            # site-relative and absolute links produce a valid request URL
            # (a hard-coded host prefix would corrupt absolute links).
            yield response.follow(href, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Extract one recipe from its page and yield the loaded item.

        ``id``, ``title`` and ``image`` each take the first XPath match
        (``.get()`` returns ``None`` instead of raising when nothing matches);
        ``recipe`` collects every text node of the recipe steps.
        """
        loader = ItemLoader(DyplomItem())
        loader.add_value('id', response.xpath("((//div[contains(@class, 'button button_print')])[1]//a)[1]/@href").get())
        loader.add_value('title', response.xpath("//div[@class='content-box']//h1//text()").get())
        loader.add_value('image', response.xpath("(//div[contains(@class, 'content-media')]//img/@src)[1]").get())
        # add_value accepts a list, so all step texts are added in one call
        # instead of one call per text node.
        loader.add_value('recipe', response.xpath("//div[contains(@class, 'plain-text recipe_step_text')]/descendant::text()").getall())
        yield loader.load_item()
Output:
{'id': '/retsepty/146847-skrembl-s-bekonom/print?wi=true',
'image': 'https://e3.edimdoma.ru/data/recipes/0014/6847/146847-ed4_wide.jpg?1631992625',
'recipe': ['????? ???????? ???????. ????? ????? ? ????????????, ? '
'??????????????, ? ?????? ?????.',
'? ????????? ????????? ????????? ?????.'],
'title': '??????? ? ???????'}
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Working dollar |
