'Getting each text corresponding to the each tag
I'm trying to grab some data from the left-side column of a webpage. The aim is to click on all the show more buttons using scrapy_playwright, and grab the title of each the elements belonging to the show more list. However, when I run my scraper it iterates the same header make for all of the lists. I need to get these unique for each set of lists.
Here's my scraper:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine
class ConfusedItem(scrapy.Item):
clicks = Field(output_processor = TakeFirst())
category = Field(output_processor = TakeFirst())
class ConfusedSpider(scrapy.Spider):
name = 'confused'
allowed_domains = ['x']
start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']
custom_settings = {
'User_Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
'DOWNLOAD_DELAY':0.5
}
def start_requests(self):
for url in self.start_urls:
for i in range(0, 11):
yield scrapy.Request(
url = url,
callback = self.parse,
meta= dict(
playwright = True,
playwright_include_page = True,
playwright_page_coroutines = [
PageCoroutine("click", selector=f"(//div[@class='toggle-bottom-filter'])[{i}]"),
PageCoroutine("wait_for_timeout", 5000),
]
),
)
def parse(self, response):
container = response.xpath("(//div[@id]//ul[@class='list-filter disp-bloc list-model1'])//li")
test= response.xpath("(//div[@class='elem-filter id_marque clearfix'])")
for items in container:
for values in test:
loader = ItemLoader(ConfusedItem(), selector = items)
loader.add_xpath('clicks', './/@onclick')
loader.add_value('category', values.xpath("(//h2[@class=' select-load select-off'])//text()").getall())
yield loader.load_item()
process = CrawlerProcess(
settings = {
'FEED_URI':'json_data.jl',
'FEED_FORMAT':'jsonlines'
}
)
process.crawl(ConfusedSpider)
process.start()
Output:
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}
Expected output:
{'category': 'SELLER TYPE',
'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
