{'productname': None, 'productphoto': None, 'productprice': None}. CSS selector is not returning anything in Scrapy project
I am new to the Scrapy framework. I am trying to scrape data from an online grocery shop, using Scrapy and Selenium together. In start_requests() I use Selenium to collect the href values that lead to everything I need to fetch, then yield them as requests so that the parse() method can do the Scrapy work. In the console log I can see the href values being crawled, but the information I want to fetch from them is not there: it shows item_scraped_count: 40, yet every item is {'productname': None, 'productphoto': None, 'productprice': None}. I don't know what the problem is.
My spider file:
from ast import parse
from gc import callbacks
import imp
from tkinter.messagebox import NO
from turtle import title
from unicodedata import name
from unittest.mock import call
import scrapy
from ..items import BestdealscraperItem
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
import schedule
import time
from flask import request
import json
from selenium.webdriver import Chrome, ChromeOptions
from scrapy.utils.project import get_project_settings
class productspider(scrapy.Spider):
    name = 'products'

    def start_requests(self):
        settings = get_project_settings()
        driver_path = settings.get('CUSTOM_DRIVER_PATH')
        options = ChromeOptions()
        options.headless = True
        driver = Chrome(executable_path=driver_path, options=options)
        driver.get('https://www.meenaclick.com/category/fish')

        selector = '//a[@class="main-link ng-star-inserted"]'
        link_elements = driver.find_elements_by_xpath(selector)
        for link_el in link_elements:
            href = link_el.get_attribute('href')
            yield scrapy.Request(href)
        driver.quit()

    def parse(self, response):
        items = BestdealscraperItem()

        product_name = response.xpath('//h1[@class="title"]').get()
        product_price = response.xpath('//div[@class="reg-price"]').get()
        product_photo = response.xpath('//img[@class="main-img p-md rounded-s"]').get()

        items['productname'] = product_name
        items['productprice'] = product_price
        items['productphoto'] = product_photo
        yield items
        # product_link = response.css('.mtb-title').extract()
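A quick way to see why parse() gets nothing is to fetch one of the product URLs from the log without a browser, exactly as scrapy.Request does, and run the spider's selectors against the raw HTML. A minimal check, assuming the requests library is available (this snippet is not part of the original post):

import requests
from scrapy.selector import Selector

# Plain HTTP download, no JavaScript execution: the same view of the page
# that scrapy.Request gives the spider (URL taken from the console log below).
html = requests.get('https://www.meenaclick.com/product/867/grass-carp-fish').text
sel = Selector(text=html)

# Both selectors come back empty because the elements are added client-side
# by JavaScript after the page loads in a real browser.
print(sel.xpath('//h1[@class="title"]').get())       # None
print(sel.xpath('//div[@class="reg-price"]').get())  # None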
My settings.py file:
# Scrapy settings for bestdealscraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'bestdealscraper'
SPIDER_MODULES = ['bestdealscraper.spiders']
NEWSPIDER_MODULE = 'bestdealscraper.spiders'
# scrapy-playwright (disabled)
# DOWNLOAD_HANDLERS = {
# "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
# "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
# }
# TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
CUSTOM_DRIVER_PATH = r'C:\Users\Technbit\Downloads\chromedriver_win32\chromedriver'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'bestdealscraper (+http://www.yourdomain.com)'
#USER_AGENT = 'https://developers.whatismybrowser.com/useragents/parse/79googlebot'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'bestdealscraper.middlewares.BestdealscraperSpiderMiddleware': 543,
#}
# DOWNLOADER_MIDDLEWARES = {
# 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
# 'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'bestdealscraper.middlewares.BestdealscraperDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'bestdealscraper.pipelines.BestdealscraperPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
My items.py file:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BestdealscraperItem(scrapy.Item):
    # define the fields for your item here like:
    productname = scrapy.Field()
    productprice = scrapy.Field()
    productphoto = scrapy.Field()
    # productlink = scrapy.Field()
    # title = scrapy.Field()
    # author = scrapy.Field()
    # tags = scrapy.Field()
My middlewares.py file:
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class BestdealscraperSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class BestdealscraperDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
My pipelines.py file:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import mysql.connector
class BestdealscraperPipeline(object):
    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='imnumber47',
            database='crawledata',
        )
        self.curr = self.conn.cursor()

    def create_table(self):
        self.curr.execute("""DROP TABLE IF EXISTS bestdealproductdb""")
        self.curr.execute("""
            create table bestdealproductdb(
                productname text,
                productprice int,
                productphoto text
            )""")

    def process_item(self, item, spider):
        self.store_db(item)
        # print('pipeline:' + item['productname'][0])
        return item

    def store_db(self, item):
        self.curr.execute("""insert into bestdealproductdb values(%s,%s,%s)""", (
            item['productname'],
            item['productprice'],
            item['productphoto'],
        ))
        self.conn.commit()
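One mismatch worth flagging in this pipeline, not raised in the original post: the table declares productprice as int, but the scraped value is text such as 'tk1,350.00' (see the result listing further down), so the insert will fail or silently truncate. A small, hypothetical helper one could call inside store_db to normalize the price first:

import re

def parse_price(raw):
    # Hypothetical helper (not in the original pipeline): pull the numeric
    # part out of price strings like 'tk1,350.00'; returns None when absent.
    if not raw:
        return None
    match = re.search(r'[\d,]+(?:\.\d+)?', raw)
    return float(match.group().replace(',', '')) if match else None

print(parse_price('tk1,350.00'))  # 1350.0
print(parse_price(None))          # None

Since the normalized value is fractional, a DECIMAL column would also fit prices better than int.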
My console log:
[... several thousand characters of minified JavaScript truncated: the selenium.webdriver DEBUG logger dumps the atom payload of its execute/sync call for get_attribute('href') ...]
2022-02-25 01:53:11 [urllib3.connectionpool] DEBUG: http://localhost:55679 "POST /session/7e6b8645b8db95a8375f8edb462d2d5a/execute/sync HTTP/1.1" 200 69
2022-02-25 01:53:11 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2022-02-25 01:53:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.meenaclick.com/product/867/grass-carp-fish> (referer: None)
2022-02-25 01:53:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.meenaclick.com/product/910/hilsha-fish-1-1199-kgpc> (referer: None)
2022-02-25 01:53:11 [selenium.webdriver.remote.remote_connection] DEBUG: DELETE http://localhost:55679/session/7e6b8645b8db95a8375f8edb462d2d5a {}
2022-02-25 01:53:11 [urllib3.connectionpool] DEBUG: http://localhost:55679 "DELETE /session/7e6b8645b8db95a8375f8edb462d2d5a HTTP/1.1" 200 14
2022-02-25 01:53:11 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2022-02-25 01:53:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.meenaclick.com/product/1045/deshi-kachki-dry-fish-100-gm> (referer: None)
2022-02-25 01:53:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.meenaclick.com/product/428/chapa-dry-fish>
{'productname': None, 'productphoto': None, 'productprice': None}
2022-02-25 01:53:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.meenaclick.com/product/781/deshi-ghoinna-dry-fish-200-gm>
{'productname': None, 'productphoto': None, 'productprice': None}
2022-02-25 01:53:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.meenaclick.com/product/1046/kachki-fish-small> (referer: None)
2022-02-25 01:53:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.meenaclick.com/product/867/grass-carp-fish>
{'productname': None, 'productphoto': None, 'productprice': None}
2022-02-25 01:53:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.meenaclick.com/product/910/hilsha-fish-1-1199-kgpc>
{'productname': None, 'productphoto': None, 'productprice': None}
2022-02-25 01:53:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.meenaclick.com/product/1045/deshi-kachki-dry-fish-100-gm>
{'productname': None, 'productphoto': None, 'productprice': None}
2022-02-25 01:53:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.meenaclick.com/product/1046/kachki-fish-small>
{'productname': None, 'productphoto': None, 'productprice': None}
2022-02-25 01:53:13 [scrapy.core.engine] INFO: Closing spider (finished)
2022-02-25 01:53:13 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 10389,
'downloader/request_count': 41,
'downloader/request_method_count/GET': 41,
'downloader/response_bytes': 94095,
'downloader/response_count': 41,
'downloader/response_status_count/200': 41,
'elapsed_time_seconds': 14.032411,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 2, 24, 19, 53, 13, 943186),
'httpcompression/response_bytes': 177776,
'httpcompression/response_count': 41,
'item_scraped_count': 40,
'log_count/DEBUG': 226,
'log_count/INFO': 10,
'response_received_count': 41,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 40,
'scheduler/dequeued/memory': 40,
'scheduler/enqueued': 40,
'scheduler/enqueued/memory': 40,
'start_time': datetime.datetime(2022, 2, 24, 19, 52, 59, 910775)}
2022-02-25 01:53:13 [scrapy.core.engine] INFO: Spider closed (finished)
(bestenv) D:\BestDealScraping\bestdealscraper>
Solution 1 [1]
This page uses JavaScript to add the elements on every page, but you use Chrome only to load the first page; the other pages are fetched with a normal Request, which doesn't use Chrome, so the JavaScript never runs and the selectors find nothing.
So you would have to write your own function that uses Chrome instead of Request to fetch the other pages; a sketch of that approach follows.
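A minimal sketch of that first option, assuming you add a hypothetical SeleniumDownloaderMiddleware to bestdealscraper/middlewares.py: a downloader middleware whose process_request returns a Response, which in Scrapy skips the built-in HTTP download, so every page (not just the first one) is rendered by Chrome before parse() sees it.

from scrapy.http import HtmlResponse
from selenium.webdriver import Chrome, ChromeOptions

class SeleniumDownloaderMiddleware:  # hypothetical name, not part of Scrapy
    def __init__(self):
        options = ChromeOptions()
        options.headless = True  # same headless setup as the question's spider
        # assumes chromedriver is on PATH; otherwise pass the driver path
        # as the question does with CUSTOM_DRIVER_PATH
        self.driver = Chrome(options=options)

    def process_request(self, request, spider):
        # Download with Chrome instead of Scrapy's HTTP client, then hand the
        # JavaScript-rendered HTML back as a normal response; returning a
        # Response here makes Scrapy skip its own download of this request.
        self.driver.get(request.url)
        return HtmlResponse(
            url=self.driver.current_url,
            body=self.driver.page_source.encode('utf-8'),
            encoding='utf-8',
            request=request,
        )
    # a real implementation should also quit the driver on spider_closed

You would then enable it in settings.py, e.g. DOWNLOADER_MIDDLEWARES = {'bestdealscraper.middlewares.SeleniumDownloaderMiddleware': 543}.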
Or you could use the module scrapy-selenium (pip install scrapy-selenium), which provides SeleniumRequest.
Full working code with SeleniumRequest.
You can put all the code in one file and run it with python script.py, without creating a project.
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
class ProductSpider(scrapy.Spider):  # PEP8: `CamelCaseNames` for classes
    name = 'products'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.meenaclick.com/category/fish',
            wait_time=15,
            wait_until=EC.element_to_be_clickable((By.XPATH, '//a[@class="main-link ng-star-inserted"]')),
            #screenshot=True,
            callback=self.parse_main_page
        )

    def parse_main_page(self, response):
        print('[parse_main_page] url:', response.url)

        all_items = response.xpath('//a[@class="main-link ng-star-inserted"]')
        print('[parse_main_page] len(all_items):', len(all_items))

        for number, item in enumerate(all_items, 1):
            href = item.attrib.get('href')  # it gives a relative url
            url = response.urljoin(href)    # convert it to an absolute url
            print(number, '-->', url)
            yield SeleniumRequest(  # SeleniumRequest needs an absolute url
                url=url,
                wait_time=15,
                wait_until=EC.presence_of_element_located((By.XPATH, '//h1[@class="title"]')),
                #screenshot=True,
                #callback=self.parse  # `parse` is the default callback
            )

    def parse(self, response):
        print('[parse] url:', response.url)

        product_name = response.xpath('//h1[@class="title"]/text()').get('').strip()        # needs /text()
        product_price = response.xpath('//div[@class="reg-price"]/text()').get('').strip()  # needs /text()
        product_photo = response.xpath('//img[@class="main-img p-md rounded-s"]/@src').get()  # needs /@src

        item = {
            #'url': response.url,
            'productname': product_name,
            'productprice': product_price,
            'productphoto': product_photo,
        }
        print('[parse] item:', item)
        yield item

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in Scrapy 2.1

    #'SELENIUM_DRIVER_NAME': 'chrome',
    #'SELENIUM_DRIVER_EXECUTABLE_PATH': '/home/furas/bin/chromedriver',
    #'SELENIUM_DRIVER_EXECUTABLE_PATH': r'C:\Users\Technbit\Downloads\chromedriver_win32\chromedriver',
    'SELENIUM_DRIVER_NAME': 'firefox',
    'SELENIUM_DRIVER_EXECUTABLE_PATH': '/home/furas/bin/geckodriver',
    'SELENIUM_DRIVER_ARGUMENTS': ['-headless'],
    #'SELENIUM_DRIVER_ARGUMENTS': [],  # you can't skip this option, but it can be an empty list

    'DOWNLOADER_MIDDLEWARES': {'scrapy_selenium.SeleniumMiddleware': 800},
})
c.crawl(ProductSpider)
c.start()
Result (file output.csv)
productname,productprice,productphoto
PANGAS FISH POND (3-4.999) KG/PC,tk220.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/nov/3/10/49eb2904-e0c9-4880-9ad5-9c0e58eea100.lg.jpg
Thangua Icha (Hawor),"tk1,120.00",https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/sep/23/11/8f57392c-c91a-47c2-91ac-668a7e2b3de7.lg.jpg
Salmon Whole (Norway),"tk1,990.00",https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/oct/25/12/ba0881e2-4bf1-4ffb-bd85-860dc345528a.lg.jpg
BATA FISH,tk495.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/nov/1/12/d5e4426b-3d28-4a41-844f-ce03be85a804.lg.jpg
SILVER CARP FISH,tk290.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/nov/1/12/8566394e-2816-417d-8676-1486ca71ed04.lg.jpg
BOAL FISH 5KG+ (HAWOR),,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/nov/2/17/6fc86b21-558e-4170-9f4f-d4d759778d2f.lg.jpg
BELE FISH SMALL (HAWOR),,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/nov/3/10/a6b83377-c348-4972-a3f8-f7df3041bf44.lg.jpg
RITHA FISH SEA,tk395.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/nov/3/10/ba6ab821-b081-45da-b258-bff5734f263d.lg.jpg
BOAL FISH DESHI (1.2-2.999) KG/PC,,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/nov/3/10/cf3959f2-5132-4857-936e-27a45b2b117d.lg.jpg
AYER FISH DESHI (2-2.999) KG/PC,,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/nov/3/10/0c40a885-b55f-4638-acce-412f0efae8ca.lg.jpg
AYER FISH DESHI (1-1.999) KG/PC,,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/nov/3/10/ff90aba2-7c26-4915-ade5-830181ec0f45.lg.jpg
MREGEL FISH (3-4.99) KG/PC,tk399.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/nov/3/10/b843a5f6-c513-4445-901e-9c7c0c5429ab.lg.jpg
BG Suri Dry Fish Small 100 gm,tk90.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/jul/11/0/56e4cb56-6a6d-4d4a-aaa6-6341fcb1e7e6.lg.jpg
Pabda Fish Pond (30-40) Pcs/Kg,tk370.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/jul/23/17/fec2a6de-573c-458f-ab63-a5ef79fa827f.lg.jpg
Chingri Golda (31-45) Pcs/Kg,tk700.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/nov/8/14/cec3bb27-9fed-45e9-aa19-47711b375d16.lg.jpg
Deshi Loitta Dry Fish 100 Gm,tk180.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/aug/31/19/4ffd04a2-d3f7-47e3-8063-bd58ea6edb48.lg.jpg
Mregel Fish (1-2.99) Kg/Pc,tk315.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/sep/1/22/3b2f095a-41fb-4631-9c4d-0cdbb8ea406e.lg.jpg
Shoal Fish (Hawor),tk595.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/jun/3/13/df0e5621-20f2-46c6-b880-2fa601d641d7.lg.jpg
Deshi Chingri Dry Fish Small 100 Gm,tk150.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2021/oct/12/21/07e6e413-42b1-44cb-82ed-50a14b8b9069.lg.jpg
AYER FISH MEDIUM (HAWOR),"tk1,150.00",https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/dec/31/16/63350218-d378-41f2-b15f-208343958c2a.lg.jpg
KALI BAUSH FISH MEDIUM (HAWOR),tk550.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/dec/31/16/97b1bae3-60be-497b-bc9d-9897cfcceca1.lg.jpg
KARFU FISH BIG (HAWOR),tk360.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/dec/31/16/7808cdd8-88ca-4524-a3ce-af8e2fb51493.lg.jpg
AYER FISH BIG (HAWOR),"tk1,320.00",https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/dec/31/17/3d6affcd-92f9-48c9-a4b0-396f7c399a49.lg.jpg
Deshi Kachki Dry Fish 100 Gm,tk180.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/may/7/23/7b30a659-c378-4af0-9b04-5a322e3a9bf8.lg.jpg
Hilsha Fish (1-1.199) Kg/Pc,"tk1,550.00",https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/may/7/16/383ade79-4cae-4022-9287-eb6c9af141cc.lg.jpg
Grass Carp Fish,tk290.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/apr/28/17/f084e673-9d45-493c-bb60-4e5b77eab005.lg.jpg
Deshi Ghoinna Dry Fish 200 Gm,tk380.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/may/7/19/cfa45179-372d-4cfb-a48c-560434e9d873.lg.jpg
Chapa Dry Fish,"tk1,350.00",https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/may/16/17/2ae13004-67d5-49f2-a8a6-d615142e6904.lg.jpg
Boal Fish Medium (Hawor),tk720.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/apr/28/17/333a4fb6-8fb9-4464-8209-18b201c71429.lg.jpg
Boal Fish (1.2-2.999) Kg/Pc,tk540.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/apr/28/17/900bfcb0-52d0-4fb4-a47c-4b6bc53c7255.lg.jpg
Batashi Fish (Hawor),tk870.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/apr/28/17/5299fac3-8806-4fb9-8512-398c1432b6fc.lg.jpg
BG Loitta Dry Fish 125 gm,tk210.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/jul/10/23/29c855d8-2bac-4b02-bb6c-e91516572cff.lg.jpg
BG Suri Dry Fish Big 125 gm,tk450.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/jul/10/23/cb0d06c8-b577-481d-9ff1-18bc533910a4.lg.jpg
BG Suri Dry Fish Medium 125 gm,tk325.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/jul/11/0/0a6f2bbf-1cea-4d6e-99cb-b3b1eb8c167a.lg.jpg
BG Rupchanda Dry Fish Big 125 gm,tk700.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/jul/11/0/cb488d53-f2de-459a-9ff0-9dca170fb2fb.lg.jpg
BG Rupchanda Dry Fish Medium 125 gm,tk600.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/jul/11/0/b495beba-62ba-4d65-a287-d6d7433290e6.lg.jpg
BG Mola Dry Fish 100 gm,tk150.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/jul/11/0/78f66df2-1cf0-483f-b408-533d579ccc5a.lg.jpg
BG Batashi Dry Fish 70 gm,tk160.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/jul/11/0/9e098ca2-aff6-4102-9285-ceb74d3526e6.lg.jpg
BG Surma Dry Fish 100 gm,tk520.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/jul/11/0/5697f660-c38f-4ab7-b5b9-db41309e8f44.lg.jpg
Kachki Fish Small,tk520.00,https://d3o0ecz6febi2h.cloudfront.net/product_variances/images/2020/apr/28/17/85dacce4-1e75-44d6-910e-67a663baed89.lg.jpg
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Stack Overflow |
