'How to scrape a webpage which is generating ajax url request using scrapy
I need to scrape a webpage https://mines.rajasthan.gov.in/DMG2/Public/eRawannaStatus/VHJF1015707850 , which after loading is generating a ajax url to fetch the data. I tried selenium also but that is taking hours to scrape few 50-100 urls. So i am trying this in Scrapy, but still it is showing error 'HTTP status code
is not handled or not allowed'. Please help. The code is:
import urllib.parse
import scrapy
from scrapy import FormRequest, Request
from datetime import datetime
class MytpSpider(scrapy.Spider):
name = 'myTP'
# allowed_domains = ['mines.rajasthan.gov.in']
start_urls = ['https://mines.rajasthan.gov.in/DMG2/Public/eRawannaStatus/VHJF1015707850']
def parse(self, response):
yield (response.body)
token = response.xpath('//*[@id="chaDepo"]/input[2]/@value').extract_first()
tpID = response.xpath('//*[@id="tpId"]/@value').extract_first()
dealerID = response.xpath('//*[@id="dealerId"]/@value').extract_first()
print(token,tpID,dealerID,end='\n')
dt = datetime.now()
dt_string = str(dt.strftime("%Y-%m-%d %H:%M:%S"))
form = {'requestData': f"{{'TransitPassID':{tpID},'DealerEnrollId':{dealerID},'stockLocId':'0','fromDate':{dt_string},'toDate':{dt_string},'FetchOption':'14'}}"}
payload = urllib.parse.urlencode(form).replace('+','%20').replace('%27','%22')
headers = {'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
'Content-type': 'application/x-www-form-urlencoded',
'Host': 'mines.rajasthan.gov.in',
'Origin': 'https://mines.rajasthan.gov.in',
'Referer': 'https://mines.rajasthan.gov.in/DMG2/Public/eRawannaStatus/VHJF1015707850',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
'X-CSRF-TOKEN': token}
r = Request(url='https://mines.rajasthan.gov.in/DMG2/DMGService/GetTransitPass',method='POST',headers=headers,body=payload,callback= self.tab1)
# r = FormRequest(url='https://mines.rajasthan.gov.in/DMG2/DMGService/GetTransitPass',method='POST',headers=headers,formdata=form,callback= self.tab1)
yield(r)
def tab1(self,response):
t1 = response.xpath('//*[@id="transistPassPrintFrm"]/div[1]/div/div/div/div/div/div[1]/table/tbody/tr[1]/td/table[1]/tbody/tr[2]/td/div/table/tbody/tr/td/strong[1]').extract_first()
print(t1) #should print 'RJ13GB5139'
# pass
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
