How to optimize my Selenium scraping code

I have created a script to scrape data from an e-commerce website using Selenium. The script only parses the products on each listing page, and I noticed it takes too much time. For example, I tested it on 4 pages and it took around 62 seconds, which a user would definitely notice if I ran it on many more pages.
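To give an idea of where those 62 seconds go, this is roughly how a single page can be timed on its own (the timed_page helper here is only for illustration and is not part of my script below, which only times the whole run):

import time

def timed_page(driver, page_url):
    # Load one page and report how long the navigation alone takes;
    # the scrolling and parsing steps could be timed the same way.
    start = time.perf_counter()
    driver.get(page_url)
    elapsed = time.perf_counter() - start
    print(f"{page_url}: {elapsed:.1f} s to load")
    return elapsed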

I have tried the Selenium optimization tips I could find, but the problem is still the same.
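One commonly suggested tweak, for example, is disabling image loading through the browser options. Roughly, that looks like this (the prefs key is a Chromium setting, so I am assuming it carries over to Edge and that my Selenium version exposes add_experimental_option on the Edge Options class):

from selenium.webdriver.edge.options import Options

options = Options()
options.headless = True
# Chromium preference that blocks image downloads; assumed to behave the
# same way on Edge as it does on Chrome.
options.add_experimental_option(
    "prefs", {"profile.managed_default_content_settings.images": 2}
)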

My question is: are there other tips to optimize Selenium and make my code run faster?

Here is my code:

from selenium.webdriver.edge.options import Options  
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver  
from pymongo import MongoClient
from time import sleep
from lxml import html 
import pandas as pd
import cssselect  # not used directly, but lxml's cssselect() needs it installed
import time


start_time = time.time()
options = Options()
options.headless = True
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",options=options)
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'
baseurl = 'https://www.aliexpress.com'

for page_nb in range(1, 5):
    print('---', page_nb, '---')    
    driver.get(url.format(page_nb))
    sleep(2)
    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # give the page's JavaScript time to add new elements
        new_offset = driver.execute_script("return window.pageYOffset;")
        if new_offset <= current_offset:
            break
        current_offset = new_offset
    sleep(3)
    tree = html.fromstring(driver.page_source)
    results = []
    for product in tree.xpath('//div[@class="JIIxO"]//a'):
        title = product.xpath('.//h1/text()')
        if title:
            title = title[0]
            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]

            currency = price[0]
            price = ''.join(price[1:])
            stars = product.xpath('.//span[@class="eXPaM"]/text()')
            if stars:
                stars = stars[0]
            else:
                stars = 'None'
            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = 'None'
            supl = product.xpath('.//a[@class="ox0KZ"]/text()')
            if supl:
                supl = supl[0]
            else:
                supl = 'None'
            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            if ship_cost:
                ship_cost = ship_cost[0]
            else:
                ship_cost = 'None'
            product_links = product.xpath('./@href')
            if product_links:
                product_links = str(baseurl) + str( product_links[0])
            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)
    # driver.close() is intentionally not called here so the driver stays open for the next page
    df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks"))
    client = MongoClient("mongodb://localhost:27017/")
    collection = client['db2']['aliex2']
    data = df.to_dict(orient='records')
    collection.insert_many(data)
     
print("--- %s seconds ---" % (time.time() - start_time))
driver.quit()

I would be so grateful for any help from you. Thank you.



Source: Stack Overflow, licensed under CC BY-SA 3.0.
