'Scraping information from previous pages using LinkExtractors
I wanted to know if it is possible to scrape information from previous pages using LinkExtractors. This question is in relation to my previous question here
I have uploaded the answer to that question with a change to the xpath for country. The xpath provided, grabs the countries from the first page.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader
class ZooplasItem(scrapy.Item):
stuff = Field()
country = Field()
class ZooplasSpider(CrawlSpider):
name = 'zooplas'
allowed_domains = ['zoopla.co.uk']
start_urls = ['https://www.zoopla.co.uk/overseas/']
rules = (
Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True), # follow the countries links
Rule(LinkExtractor(restrict_css='div.paginate'), follow=True), # follow pagination links
Rule(LinkExtractor(restrict_xpaths="//a[contains(@class,'listing-result')]"), callback='parse_item', follow=True), # follow the link to actual property listing
)
def parse_item(self, response):
# here you are on the details page for each property
loader = ItemLoader(ZooplasItem(), response=response)
loader.default_output_processor = TakeFirst()
loader.add_xpath("stuff", "//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
loader.add_xpath("country","(//ul[@class='list-inline list-unstyled'])[1]//li//a//text()")
yield loader.load_item()
if __name__ == '__main__':
process = CrawlerProcess(
settings = {
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
'FEEDS': {
'zoopla.jl': {
'format': 'jsonlines'
}
}
}
)
process.crawl(ZooplasSpider)
process.start()
However, this prints out the following output:
'country':'(//ul[@class='list-inline list-unstyled'])[1]//li//a//text()'
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
