Web Scraping the Journal "El Peruano" - Python/Scrapy
I'm trying to scrape some info from the "El Peruano" journal, but I can't. At first sight, it looks like I have to:
- Put a date in a form box.
- Click the search button.
- Get all the links in order to extract the "Title", "Resolution #", and "Body" from each.
This is my code:
import scrapy
class SpiderPeruano(scrapy.Spider):
    """Spider for diariooficial.elperuano.pe (as posted in the question).

    NOTE(review): as written this spider never submits the search form:
    Scrapy sends the start_urls response to `parse` (the default callback),
    so `parse_click` is never invoked; additionally the formdata key
    "cdhasta:" carries a stray trailing colon.
    """
    name = "peruano"
    start_urls = [
        "https://diariooficial.elperuano.pe/Normas"
    ]
    custom_settings= {
        "FEED_URI": "peruano.json",
        "FEED_FORMAT": "json",
        "FEED_EXPORT_ENCODING": "utf-8"
    }
    def parse_click(self, response):
        # BUG: this method is never reached -- the start_urls response goes
        # to `parse` by default, not here.
        # (Original author's note: a button-existence check was considered
        # here but deemed unnecessary.)
        #button = response.xpath("//div[@id='busqueda']/form[@action]/button[@id='btnBuscar']").get()
        #if buttom:
        yield scrapy.FormRequest.from_response(
            response,
            formxpath= "//form[@id='space_PortalNormasLegalesN']",
            # BUG: "cdhasta:" has a stray trailing colon; the form field is "cdhasta".
            formdata={"cddesde": "08/03/2022", "cdhasta:": "08/03/2022"},
            dont_click=True,
            dont_filter=True,
            callback=self.parse
        )
    def parse(self, response):
        # Collect the edition links from the result page and follow each one.
        links = response.xpath("//div[@class='ediciones_texto']/h5/a/@href").getall()
        for link in links:
            yield response.follow(link, callback=self.parse_link)
    def parse_link(self, response):
        # Extract title, resolution number(s), and body paragraphs from a detail page.
        title = response.xpath("//div[@class='story']/h1[@class='sumilla']/text()").get()
        num = response.xpath("//div[@class='story']/h2[@class='resoluci-n']/text()").getall()
        body = response.xpath("//div[@class='story']/p/text()").getall()
        yield {
            "title": title,
            "num": num,
            "body": body
        }
#call
#scrapy crawl peruano
#url = "https://diariooficial.elperuano.pe/normas"
#Form_BOX: "//form[@action]"
#Box_desde = "//form[@action]/input[@id='cddesde']"
#Box_hasta = "//form[@action]/input[@id='cdhasta']"
#Button= "//div[@id='busqueda']/form[@action]/button[@id='btnBuscar']"
#links = "//div[@class='ediciones_texto']/h5/a/@href"
#titles= "//div[@class='story']/h1[@class='sumilla']/text()"
#resolutionNum= "//div[@class='story']/h2[@class='resoluci-n']/text()"
#body= "//div[@class='story']/p/text()"
So, I need some help to figure out what I'm doing wrong in my code, because it runs without errors but doesn't get the data.
Thanks a lot for your time and help!
Solution 1:[1]
I found two mistakes:
First:
Scrapy takes the URL from start_urls and sends the response to parse (as the default callback), but you expect it in parse_click (which submits the form). If I rename the functions, the form is submitted.
Second:
A small typo: in formdata= you use the string "cdhasta:" with a : at the end, and this caused problems.
import scrapy
class SpiderPeruano(scrapy.Spider):
    """Corrected spider for diariooficial.elperuano.pe.

    The start_urls response arrives in `parse` (Scrapy's default callback),
    which submits the date-range search form; `parse_result` follows each
    result link, and `parse_link` scrapes one norm's detail page.
    """

    name = "peruano"
    start_urls = ["https://diariooficial.elperuano.pe/Normas"]
    custom_settings = {
        "FEED_URI": "peruano.json",
        "FEED_FORMAT": "json",
        "FEED_EXPORT_ENCODING": "utf-8",
    }

    def parse(self, response):
        """Submit the search form found on the landing page."""
        print('[parse] url:', response.url)
        search_fields = {
            "cddesde": "01/03/2022",
            "cdhasta": "03/03/2022",
            "btnBuscar": "",
        }
        yield scrapy.FormRequest.from_response(
            response,
            formxpath="//form[@id='space_PortalNormasLegalesN']",
            formdata=search_fields,
            dont_click=True,
            dont_filter=True,
            #headers={'Referer':"https://diariooficial.elperuano.pe/Normas", 'X-Requested-With': 'XMLHttpRequest'},
            callback=self.parse_result,
        )

    def parse_result(self, response):
        """Follow every edition link on the search-result page."""
        print('[parse_result] url:', response.url)
        for href in response.xpath("//div[@class='ediciones_texto']/h5/a/@href").getall():
            yield response.follow(href, callback=self.parse_link)

    def parse_link(self, response):
        """Scrape title, resolution number(s), and body text from one norm page."""
        print('[parse_link] url:', response.url)
        yield {
            "title": response.xpath("//div[@class='story']/h1[@class='sumilla']/text()").get(),
            "num": response.xpath("//div[@class='story']/h2[@class='resoluci-n']/text()").getall(),
            "body": response.xpath("//div[@class='story']/p/text()").getall(),
        }
# --- run without project ---
# Run the spider as a plain script (no Scrapy project / `scrapy crawl`):
# CrawlerProcess applies the settings dict below and c.start() runs the crawl.
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    # Browser-like User-Agent so the site does not reject the requests.
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(SpiderPeruano)
c.start()
EDIT:
Meanwhile, I also tested it with requests, but I didn't try to get the links from the response in order to fetch the details.
import requests

# --- GET ---
# Fetch the landing page first (useful to confirm the site responds;
# headers are left empty here, with an optional User-Agent commented out).
headers = {
    # 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
}
url = 'https://diariooficial.elperuano.pe/Normas'
response = requests.get(url, headers=headers)
print(response)

# --- POST ---
# Query the search endpoint directly with the date range as form data.
url = 'https://diariooficial.elperuano.pe/Normas/Filtro?dateparam=03/08/2022 00:00:00'
params = {
    'cddesde': '01/03/2022',
    'cdhasta': '03/03/2022',
    # 'X-Requested-With': 'XMLHttpRequest',
    # NOTE(review): 'X-Requested-With' is an HTTP header, not a form field --
    # if needed it belongs in `headers` below, not in this dict.
}
headers = {
    # Optional headers that some endpoints require; commented out because
    # the request worked without them in this test.
    # 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    # 'Referer': "https://diariooficial.elperuano.pe/Normas",
    # 'X-Requested-With': 'XMLHttpRequest'
}
response = requests.post(url, data=params, headers=headers)
print(response)
print(response.text[:1000])
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Stack Overflow |
