Does the Scrapy user-agent rotator change the HTML response depending on the selected browser type?

I am trying to scrape data such as price and some labels etc. from Amazon using Scrapy. I try to find elements by XPath or CSS and it always works fine when I use the Scrapy shell. However, when I run the spider script with the XPath and CSS selectors I previously used in the shell, it sometimes finds them and sometimes not. I am using a user-agent rotator, and I am wondering whether I get a different HTML response depending on the selected user agent. Is that possible? Also, some elements are found reliably, such as the ASIN number and the description of a product. However, quite often I cannot find 'price' or 'coupon' etc. (in parse_items in the code below). Is it possible that values such as price and labels are loaded dynamically and I cannot find them because of that? Thanks for any help! Cheers

class AmazonSpiderSpider(scrapy.Spider):
    """Scrape an Amazon search-results page through Splash-rendered requests.

    ``parse`` yields one dict (title / link / ASIN) per search result and
    ``parse_items`` yields price, coupon and badge details for the single
    product identified by ``target_asin``.

    Items are yielded as plain dicts so Scrapy's feed exports can persist
    them (enable via ``custom_settings`` / ``FEEDS`` below).  The previous
    implementation yielded ``df.to_csv(path)``, which returns ``None`` (not
    a valid Scrapy item) and rewrote the entire CSV file on every loop
    iteration.
    """

    name = 'amazon_spider'
    domain = 'https://www.amazon.com'
    start_urls = ['https://www.amazon.com/s?k=gaming+mouse&crid=K3DOU2V1SO0X&sprefix=gaming+mouse%2Caps%2C165&ref=nb_sb_noss_2']

    # Enable Scrapy's feed exports to write the yielded dicts, e.g.:
    # custom_settings = {"FEEDS": {"results.csv": {"format": "csv"}}}

    # ASIN of the product whose listing parse_items inspects; this value
    # used to be hard-coded inside four separate selectors below.
    target_asin = 'B09QS2GS2H'

    # NOTE(review): captcha/session cookies such as these expire quickly —
    # they almost certainly need refreshing before each crawl.
    cookies = {
        'x-amz-captcha-1': '1638467197733195',
        'x-amz-captcha-2': 'G7CLrZ2egNu35VgwLsWwow==',
        'session-id': '147-6393456-8775946',
        'session-id-time': '2082787201l',
        'i18n-prefs': 'USD',
        'csm-hit': 'adb:adblk_yes&t:1639009238193&tb:SPK7B1BF11S9XZAF920T+s-C1CZBBJEEFFB9J81ZF4C|1639009238193',
        'ubid-main': '135-1646028-7160534',
        'session-token': 'eZgavFFEE1L6SEh1T+ZM2yoC6st+y173l4E3vGve6MhYvdFQtc1kmHmQBl3GZnmPpnJXkzBjtJyIBjc/Hj4pzBDQ8tlKcx+Szh25Qcs1f04SNyg+Zk40rh0BwEBJctr1RKFEOji1r682vGvAnVuKHNkecSHna8extW1wN0hVHUhRWNGt1R/4ZWhTRMuTWgEU4hALpF7cbNUmiGb8bfE7VA==',
        'lc-main': 'en_US',
        'skin': 'noskin',
    }

    def start_requests(self):
        """Schedule one Splash request per start URL for each callback.

        Splash renders the page's JavaScript (some fields, e.g. the
        product description, are injected client-side); the randomized
        wait makes the crawl pattern less regular.
        """
        for url in self.start_urls:
            rand_wait = random.randint(10, 20)
            self.logger.info('random waiting time: %s', rand_wait)
            yield SplashRequest(url=url, callback=self.parse,
                                cookies=self.cookies, args={"wait": rand_wait})
            # dont_filter=True: the same URL was already scheduled above
            # and would otherwise be dropped by the dupe filter.
            yield SplashRequest(url=url, callback=self.parse_items,
                                cookies=self.cookies, args={"wait": rand_wait},
                                dont_filter=True)

    def parse(self, response):
        """Yield one dict per search result: title, link and ASIN.

        A fresh dict is created per result — the previous code mutated a
        single shared dict, so a failed ASIN lookup silently carried the
        previous product's ASIN into the next result.
        """
        links = response.css('.a-text-normal').css('a::attr(href)').extract()
        titles = response.css('span.a-text-normal').css('::text').extract()
        for link, title in zip(links, titles):
            item = {'Title Product': title, 'Link Product': link}
            asin_match = re.search(r"(?<=dp/)[A-Z0-9]{10}", link)
            if asin_match:
                item['ASIN_Product'] = asin_match.group(0)
            else:
                item['ASIN_Product'] = None
                self.logger.debug('No asin match for %s', link)
            yield item

    def parse_items(self, response):
        """Yield price/coupon/badge details for ``target_asin``.

        NOTE(review): price and coupon spans are presumably injected
        client-side, which would explain why they only appear
        intermittently without a sufficient Splash wait — TODO confirm
        by diffing the rendered vs. raw response.
        """
        for data in response.css('div[data-asin="{}"]'.format(self.target_asin)):
            item = {
                'Datetime': datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                'URL': response.url,
            }
            item['Description'] = data.xpath(
                './/*[contains(concat( " ", @class, " " ), '
                'concat( " ", "a-color-base", " " )) and contains(concat( " ", @class, " " ), '
                'concat( " ", "a-text-normal", " " ))]/text()').extract()

            # './/' keeps the search inside this product's <div>; the
            # previous '//' prefix searched the whole document.
            has_options = data.xpath(
                './/*[contains(concat( " ", @class, " " ), concat( " ", "a-color-secondary", " " ))]'
                '//*[contains(concat( " ", @class, " " ), concat( " ", "a-color-secondary", " " ))]//text()')
            item['Options'] = 1 if has_options else 0

            item['Price'] = data.css('span.a-offscreen::text').extract()

            strike_price = data.xpath(
                './/*[contains(concat( " ", @class, " " ), '
                'concat( " ", "a-text-price", " " ))]//span/text()')
            item['non-discounted Price'] = strike_price.extract_first() if strike_price else 'NA'

            coupon = data.xpath(
                './/*[contains(concat( " ", @class, " " ), '
                'concat( " ", "s-coupon-highlight-color", " " ))]//text()').extract()
            if coupon:
                item['Save x% (or x$) with coupon'] = 1
                # A '$' (resp. '%') anywhere in the coupon text marks a
                # dollar (resp. percentage) discount.
                if any('$' in text for text in coupon):
                    item['Save x$ with coupon'] = [text.replace("Save $", "") for text in coupon]
                else:
                    item['Save x$ with coupon'] = 0
                if any('%' in text for text in coupon):
                    item['Save x% with coupon'] = [
                        text.replace("Save", "").replace("%", "") for text in coupon]
                else:
                    item['Save x% with coupon'] = 0
            else:
                item['Save x% (or x$) with coupon'] = 0
                item['Save x$ with coupon'] = 0
                item['Save x% with coupon'] = 0

            # Badge labels carry the product ASIN in their element id.
            choice_xpath = './/*[@id="{}-amazons-choice-label"]/span//text()'.format(self.target_asin)
            item['Amazons choice'] = 1 if data.xpath(choice_xpath) else 0
            seller_xpath = './/*[@id="{}-best-seller-label"]/span//text()'.format(self.target_asin)
            item['Best Seller'] = 1 if data.xpath(seller_xpath) else 0

            yield item

'''


Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source