Web Scraping the Journal "El Peruano" - Python/Scrapy
I'm trying to scrape some info from the "El Peruano" journal, but I can't. At first sight, it looks like I have to:
- Put a date in a form box.
- Click the search button.
- Get all the links in order to extract the "Title", "Resolution #", and "Body" from each.
This is my code:
import scrapy
class SpiderPeruano(scrapy.Spider):
    """Spider for diariooficial.elperuano.pe (as posted in the question).

    NOTE(review): as written this spider never submits the search form:
    Scrapy sends the start_urls response to `parse` (the default callback),
    so `parse_click` is never invoked; additionally the formdata key
    "cdhasta:" carries a stray trailing colon.
    """
    name = "peruano"
    start_urls = [
        "https://diariooficial.elperuano.pe/Normas"
    ]
    custom_settings= {
        "FEED_URI": "peruano.json",
        "FEED_FORMAT": "json",
        "FEED_EXPORT_ENCODING": "utf-8"
    }
    def parse_click(self, response):
        # BUG: this method is never reached -- the start_urls response goes
        # to `parse` by default, not here.
        # (Original author's note: a button-existence check was considered
        # here but deemed unnecessary.)
        #button = response.xpath("//div[@id='busqueda']/form[@action]/button[@id='btnBuscar']").get()
        #if buttom:
        yield scrapy.FormRequest.from_response(
            response,
            formxpath= "//form[@id='space_PortalNormasLegalesN']",
            # BUG: "cdhasta:" has a stray trailing colon; the form field is "cdhasta".
            formdata={"cddesde": "08/03/2022", "cdhasta:": "08/03/2022"},
            dont_click=True,
            dont_filter=True,
            callback=self.parse
        )
    def parse(self, response):
        # Collect the edition links from the result page and follow each one.
        links = response.xpath("//div[@class='ediciones_texto']/h5/a/@href").getall()
        for link in links:
            yield response.follow(link, callback=self.parse_link)
    def parse_link(self, response):
        # Extract title, resolution number(s), and body paragraphs from a detail page.
        title = response.xpath("//div[@class='story']/h1[@class='sumilla']/text()").get()
        num = response.xpath("//div[@class='story']/h2[@class='resoluci-n']/text()").getall()
        body = response.xpath("//div[@class='story']/p/text()").getall()
        yield {
            "title": title,
            "num": num,
            "body": body
        }
#call
#scrapy crawl peruano
#url = "https://diariooficial.elperuano.pe/normas"
#Form_BOX: "//form[@action]"
#Box_desde = "//form[@action]/input[@id='cddesde']"
#Box_hasta = "//form[@action]/input[@id='cdhasta']"
#Button= "//div[@id='busqueda']/form[@action]/button[@id='btnBuscar']"
#links = "//div[@class='ediciones_texto']/h5/a/@href"
#titles= "//div[@class='story']/h1[@class='sumilla']/text()"
#resolutionNum= "//div[@class='story']/h2[@class='resoluci-n']/text()"
#body= "//div[@class='story']/p/text()"
So, I need some help to figure out what I'm doing wrong in my code, because it runs without errors but doesn't get the data.
Thanks a lot for your time and help!
Solution 1:[1]
I found two mistakes:
First:
Scrapy takes the URL from start_urls and sends the response to parse (as the default callback), but you expect it in parse_click (which submits the form). If I rename the functions, the form is submitted.
Second:
A small typo: in formdata= you use the string "cdhasta:" with a : at the end, and this caused problems.
import scrapy
class SpiderPeruano(scrapy.Spider):
    """Corrected spider for diariooficial.elperuano.pe.

    The start_urls response arrives in `parse` (Scrapy's default callback),
    which submits the date-range search form; `parse_result` follows each
    result link, and `parse_link` scrapes one norm's detail page.
    """

    name = "peruano"
    start_urls = ["https://diariooficial.elperuano.pe/Normas"]
    custom_settings = {
        "FEED_URI": "peruano.json",
        "FEED_FORMAT": "json",
        "FEED_EXPORT_ENCODING": "utf-8",
    }

    def parse(self, response):
        """Submit the search form found on the landing page."""
        print('[parse] url:', response.url)
        search_fields = {
            "cddesde": "01/03/2022",
            "cdhasta": "03/03/2022",
            "btnBuscar": "",
        }
        yield scrapy.FormRequest.from_response(
            response,
            formxpath="//form[@id='space_PortalNormasLegalesN']",
            formdata=search_fields,
            dont_click=True,
            dont_filter=True,
            #headers={'Referer':"https://diariooficial.elperuano.pe/Normas", 'X-Requested-With': 'XMLHttpRequest'},
            callback=self.parse_result,
        )

    def parse_result(self, response):
        """Follow every edition link on the search-result page."""
        print('[parse_result] url:', response.url)
        for href in response.xpath("//div[@class='ediciones_texto']/h5/a/@href").getall():
            yield response.follow(href, callback=self.parse_link)

    def parse_link(self, response):
        """Scrape title, resolution number(s), and body text from one norm page."""
        print('[parse_link] url:', response.url)
        yield {
            "title": response.xpath("//div[@class='story']/h1[@class='sumilla']/text()").get(),
            "num": response.xpath("//div[@class='story']/h2[@class='resoluci-n']/text()").getall(),
            "body": response.xpath("//div[@class='story']/p/text()").getall(),
        }
# --- run without project ---
# Run the spider as a plain script (no Scrapy project / `scrapy crawl`):
# CrawlerProcess applies the settings dict below and c.start() runs the crawl.
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    # Browser-like User-Agent so the site does not reject the requests.
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(SpiderPeruano)
c.start()
EDIT:
Meanwhile, I also tested it with requests, but I didn't try to get the links from the response in order to fetch the details.
import requests

# --- GET ---
# Fetch the landing page first (useful to confirm the site responds;
# headers are left empty here, with an optional User-Agent commented out).
headers = {
    # 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
}
url = 'https://diariooficial.elperuano.pe/Normas'
response = requests.get(url, headers=headers)
print(response)

# --- POST ---
# Query the search endpoint directly with the date range as form data.
url = 'https://diariooficial.elperuano.pe/Normas/Filtro?dateparam=03/08/2022 00:00:00'
params = {
    'cddesde': '01/03/2022',
    'cdhasta': '03/03/2022',
    # 'X-Requested-With': 'XMLHttpRequest',
    # NOTE(review): 'X-Requested-With' is an HTTP header, not a form field --
    # if needed it belongs in `headers` below, not in this dict.
}
headers = {
    # Optional headers that some endpoints require; commented out because
    # the request worked without them in this test.
    # 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    # 'Referer': "https://diariooficial.elperuano.pe/Normas",
    # 'X-Requested-With': 'XMLHttpRequest'
}
response = requests.post(url, data=params, headers=headers)
print(response)
print(response.text[:1000])
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Stack Overflow |
