Does a Scrapy user-agent rotator change the HTML response depending on the selected browser type?
I am trying to scrape data such as price and some labels from Amazon using Scrapy. I try to find elements by XPath or CSS and it always works fine when I use the Scrapy shell. However, when I run the spider script with the XPath and CSS selectors I previously used in the shell, it sometimes finds them and sometimes not. I am using a user-agent rotator, and I am thinking that, depending on the selected user agent, I get a different HTML response. Is that possible? Also, some elements are found reliably, such as the ASIN number and the description of a product. However, quite often I cannot find 'price' or 'coupon' etc. (in parse_items in the code below). Is it possible that values such as price and labels are loaded dynamically and I cannot find them because of that? Thanks for any help! Cheers
class AmazonSpiderSpider(scrapy.Spider):
    """Scrape Amazon search results for gaming mice.

    Pages are rendered through Splash because some listing elements
    (price, coupon labels) appear to be injected by JavaScript after the
    initial HTML load — which would explain why plain requests sometimes
    miss them while the Scrapy shell (or another render) finds them.
    """

    name = 'amazon_spider'
    domain = 'https://www.amazon.com'
    start_urls = ['https://www.amazon.com/s?k=gaming+mouse&crid=K3DOU2V1SO0X&sprefix=gaming+mouse%2Caps%2C165&ref=nb_sb_noss_2']
    # Session cookies captured from a browser session; presumably needed to
    # get past Amazon's captcha wall — TODO confirm they are still valid.
    cookies = {
        'x-amz-captcha-1': '1638467197733195',
        'x-amz-captcha-2': 'G7CLrZ2egNu35VgwLsWwow==',
        'session-id': '147-6393456-8775946',
        'session-id-time': '2082787201l',
        'i18n-prefs': 'USD',
        'csm-hit': 'adb:adblk_yes&t:1639009238193&tb:SPK7B1BF11S9XZAF920T+s-C1CZBBJEEFFB9J81ZF4C|1639009238193',
        'ubid-main': '135-1646028-7160534',
        'session-token': 'eZgavFFEE1L6SEh1T+ZM2yoC6st+y173l4E3vGve6MhYvdFQtc1kmHmQBl3GZnmPpnJXkzBjtJyIBjc/Hj4pzBDQ8tlKcx+Szh25Qcs1f04SNyg+Zk40rh0BwEBJctr1RKFEOji1r682vGvAnVuKHNkecSHna8extW1wN0hVHUhRWNGt1R/4ZWhTRMuTWgEU4hALpF7cbNUmiGb8bfE7VA==',
        'lc-main': 'en_US',
        'skin': 'noskin',
    }

    def start_requests(self):
        """Issue one Splash-rendered request per start URL for each callback."""
        for url in self.start_urls:
            # Random render wait: gives the page's JavaScript time to run
            # and makes the crawl look less bot-like.
            rand_wait = random.randint(10, 20)
            print('random waiting time:', rand_wait)
            yield SplashRequest(url=url, callback=self.parse,
                                cookies=self.cookies, args={"wait": rand_wait})
            # dont_filter: the same URL is fetched a second time for the
            # other callback, so the duplicate filter must be bypassed.
            yield SplashRequest(url=url, callback=self.parse_items,
                                cookies=self.cookies, args={"wait": rand_wait},
                                dont_filter=True)

    def parse(self, response):
        """Yield title/link/ASIN for every search result and dump them to CSV.

        Fixes over the original: one dict is created *per* product (the
        original reused a single dict, so each loop iteration overwrote the
        previous product and only the last one survived), and the item
        itself is yielded instead of the ``None`` that ``DataFrame.to_csv``
        returns when given a file path.
        """
        if not response:
            return
        links = response.css('.a-text-normal').css('a::attr(href)').extract()
        titles = response.css('span.a-text-normal').css('::text').extract()
        rows = []
        for link, title in zip(links, titles):
            item = {'Title Product': title, 'Link Product': link}
            # The ASIN is the 10-character code following "dp/" in a product URL.
            match = re.search(r"(?<=dp/)[A-Z0-9]{10}", link)
            if match:
                item['ASIN_Product'] = match.group(0)
            else:
                print('No asin match for', link)
            rows.append(item)
            yield item
        # One CSV write for the whole page instead of one per product.
        pd.DataFrame(rows).to_csv(r'amazon_general.txt', index=False,
                                  header=True, sep=',')

    def parse_items(self, response):
        """Yield price/coupon/badge details for the hard-coded ASIN.

        Fixes over the original:
        * one dict per matched product instead of a single shared dict;
        * the Options / Amazons-choice / Best-Seller XPaths are relative
          (``.//``) — the originals began with ``//`` and therefore searched
          the entire document instead of the current product node;
        * items are yielded directly rather than yielding the ``None``
          returned by ``DataFrame.to_csv``.
        """
        products = response.css('div[data-asin="B09QS2GS2H"]')
        print('Product HTML is this:', products)
        if not products:
            return
        rows = []
        for data in products:
            items = {}
            items['Datetime'] = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            items['URL'] = response.url
            items['Description'] = data.xpath(
                './/*[contains(concat( " ", @class, " " ), '
                'concat( " ", "a-color-base", " " )) and contains(concat( " ", @class, " " ), '
                'concat( " ", "a-text-normal", " " ))]/text()').extract()
            # Relative search within this product: does it expose variant options?
            items['Options'] = 1 if data.xpath(
                './/*[contains(concat( " ", @class, " " ), concat( " ", "a-color-secondary", " " ))]'
                '//*[contains(concat( " ", @class, " " ), concat( " ", "a-color-secondary", " " ))]//text()') else 0
            items['Price'] = data.css('span.a-offscreen::text').extract()
            # Struck-through list price, when the product is discounted.
            strike_price = data.xpath(
                './/*[contains(concat( " ", @class, " " ), '
                'concat( " ", "a-text-price", " " ))]//span/text()').extract_first()
            items['non-discounted Price'] = strike_price if strike_price else 'NA'
            coupon = data.xpath(
                './/*[contains(concat( " ", @class, " " ), concat( " ", "s-coupon-highlight-color", " " ))]//text()').extract()
            if coupon:
                items['Save x% (or x$) with coupon'] = 1
                # If stripping "$" (resp. "%") changes nothing, the coupon text
                # contains no dollar (resp. percentage) discount.
                discount_dollar = [text.replace("$", "") for text in coupon]
                discount_percentage = [text.replace("%", "") for text in coupon]
                if discount_dollar == coupon:
                    items['Save x$ with coupon'] = 0
                else:
                    items['Save x$ with coupon'] = [text.replace("Save $", "") for text in coupon]
                if discount_percentage == coupon:
                    items['Save x% with coupon'] = 0
                else:
                    percentage = [text.replace("Save", "") for text in coupon]
                    percentage = [text.replace("%", "") for text in percentage]
                    items['Save x% with coupon'] = percentage
            else:
                items['Save x% (or x$) with coupon'] = 0
                items['Save x$ with coupon'] = 0
                items['Save x% with coupon'] = 0
            items['Amazons choice'] = 1 if data.xpath(
                './/*[@id="B09QS2GS2H-amazons-choice-label"]/span//text()') else 0
            items['Best Seller'] = 1 if data.xpath(
                './/*[@id="B09QS2GS2H-best-seller-label"]/span//text()') else 0
            rows.append(items)
            yield items
        pd.DataFrame(rows).to_csv(r'amazon.txt', index=False, header=True, sep=',')
'''
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|