'Restrict resource types only to XHR's with playwright
I want to return only the xhr from scrapy_playwright using the playwright_page_event_handlers. After checking the jsonlines file, I find that it has not succesfully restricted to only the xhrs.
I know I can filter before writing the file, however I want to save the amount of time it takes to grab these resources rather than filtering everything after.
How can I restrict the resource types only to xhr?
Here's what I have tried:
from playwright.async_api import Response as PlaywrightResponse, BrowserContext
from scrapy_playwright.page import PageCoroutine
from scrapy import Spider, Request
import jsonlines
class EventSpider(Spider):
name = "event"
def start_requests(self):
yield Request(
url="http://quotes.toscrape.com/scroll",
cookies={"foo": "bar", "asdf": "qwerty"},
meta=dict(
playwright=True,
playwright_page_coroutines = [
PageCoroutine("wait_for_selector", "div.quote"),
PageCoroutine("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
PageCoroutine("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
],
playwright_page_event_handlers={
"response": "handle_response",
"context": self.configure_context
},
),
)
async def configure_context(name: str, context: BrowserContext) -> None:
def handle_route(route):
if ("quotes" in route.request.post_data):
route.fulfill()
else:
route.continue_()
await context.route("/api/**", handle_route)
async def handle_response(self, response: PlaywrightResponse) -> None:
jl_file = "test.jl"
data = {response.request.resource_type:[response.request.url]}
with jsonlines.open(jl_file, mode='a') as writer:
writer.write(data)
def parse(self, response):
return {"url": response.url}
Produces the following output:
{"document": ["http://quotes.toscrape.com/scroll"]}
{"stylesheet": ["http://quotes.toscrape.com/static/bootstrap.min.css"]}
{"stylesheet": ["http://quotes.toscrape.com/static/main.css"]}
{"script": ["http://quotes.toscrape.com/static/jquery.js"]}
{"stylesheet": ["https://fonts.googleapis.com/css?family=Raleway:400,700"]}
{"font": ["https://fonts.gstatic.com/s/raleway/v26/1Ptug8zYS_SKggPNyC0IT4ttDfA.woff2"]}
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=1"]}
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=2"]}
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=3"]}
Expected output:
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=1"]}
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=2"]}
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=3"]}
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
