'Scrapy doesnt save data [closed]
I am just learning web scraping and using ScrapiApi for proxy, I followed tutorial here, but not able to save any results
Here is the output from my terminal after running the commands
scrapy crawl amazon -O test.json
scrapy crawl amazon -o test.csv
It does create the file but nothing on it.Any ideas on these?
2022-04-22 13:05:10 [urllib3.connectionpool] DEBUG: https://api.scrapeops.io:443
"POST /api/v1/stats/ HTTP/1.1" 200 125
2022-04-22 13:05:10 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 921,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'downloader/response_bytes': 392815,
'downloader/response_count': 3,
'downloader/response_status_count/200': 2,
'downloader/response_status_count/404': 1,
'elapsed_time_seconds': 68.034077,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 4, 22, 5, 5, 3, 370852),
'httpcompression/response_bytes': 3655673,
'httpcompression/response_count': 3,
'log_count/DEBUG': 25,
'log_count/INFO': 11,
'log_count/WARNING': 2,
'memusage/max': 86188032,
'memusage/startup': 61722624,
'offsite/domains': 1,
'offsite/filtered': 173,
'request_depth_max': 1,
'response_received_count': 3,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/404': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2022, 4, 22, 5, 3, 55, 336775)}
2022-04-22 13:05:10 [scrapy.core.engine] INFO: Spider closed (finished)
I am using scrapy version Scrapy 2.6.1
Spider code
import json
import re
from urllib.parse import urlencode, urljoin
import scrapy
import logging
queries = ["tshirt for men", "tshirt for women"]
API_KEY = "2034ea658714949cfe70ac954b5d2***"
def get_url(url):
payload = {"api_key": API_KEY, "url": url, "country_code": "us"}
proxy_url = "http://api.scraperapi.com/?" + urlencode(payload)
return proxy_url
class AmazonSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ["amazon.com"]
start_urls = ["http://amazon.com/"]
# def parse(self, response):
# pass
def start_requests(self):
for query in queries:
url = "https://www.amazon.com/s?" + urlencode({"k": query})
yield scrapy.Request(url=get_url(url), callback=self.parse_keyword_response)
def parse_keyword_response(self, response):
products = response.xpath("//*[@data-asin]")
for product in products:
asin = product.xpath("@data-asin").extract_first()
product_url = f"https://www.amazon.com/dp/{asin}"
yield scrapy.Request(
url=get_url(product_url),
callback=self.parse_product_page,
meta={"asin": asin},
)
next_page = response.xpath('//li[@class="a-last"]/a/@href').extract_first()
if next_page:
url = urljoin("https://www.amazon.com", next_page)
yield scrapy.Request(url=get_url(url), callback=self.parse_keyword_response)
def parse_product_page(self, response):
logging.warning("this is called")
asin = response.meta["asin"]
title = response.xpath('//*[@id="productTitle"]/text()').extract_first()
image = re.search('"large":"(.*?)"', response.text).groups()[0]
rating = response.xpath('//*[@id="acrPopover"]/@title').extract_first()
number_of_reviews = response.xpath(
'//*[@id="acrCustomerReviewText"]/text()'
).extract_first()
price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first()
if not price:
price = (
response.xpath("//*[@data-asin-price]/@data-asin-price").extract_first()
or response.xpath(
'//*[@id="price_inside_buybox"]/text()'
).extract_first()
)
temp = response.xpath('//*[@id="twister"]')
sizes = []
colors = []
if temp:
s = re.search('"variationValues" : ({.*})', response.text).groups()[0]
json_acceptable = s.replace("'", '"')
di = json.loads(json_acceptable)
sizes = di.get("size_name", [])
colors = di.get("color_name", [])
bullet_points = response.xpath(
'//*[@id="feature-bullets"]//li/span/text()'
).extract()
seller_rank = response.xpath(
'//*[text()="Amazon Best Sellers Rank:"]/parent::*//text()[not(parent::style)]'
).extract()
yield {
"asin": asin,
"Title": title,
"MainImage": image,
"Rating": rating,
"NumberOfReviews": number_of_reviews,
"Price": price,
"AvailableSizes": sizes,
"AvailableColors": colors,
"BulletPoints": bullet_points,
"SellerRank": seller_rank,
}
I just noticed from the terminal
2022-04-22 20:50:47 [scrapy.extensions.logstats] INFO: Crawled 2 pages (at 2 pages/min), scraped 0 items (at 0 items/min)
UPDATE, after some testing, I noticed, in this line of code
yield scrapy.Request(
get_url(product_url),
callback=self.parse_product_page,
meta={"asin": asin},
)
Its not firing the callback=self.parse_product_page
def parse_product_page(self, response):
logging.warning("this is called") // not logging in terminal
Update: Adding dont_filter=True seems works but as a new in web scraping, I dont know what would be the consequenses . Thus anyone have idea how to improve this piece of code ?
yield scrapy.Request(
url=get_url(product_url),
callback=self.parse_product_page,
dont_filter=True,
meta={"asin": asin},
)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
