Web Scraping: multiple LinkedIn sites
I am trying to scrape 100 company names, locations, and overviews from LinkedIn using Selenium and BeautifulSoup. The script runs as follows:
Login to Linkedin -> Opens a company's about page -> Loads the entire page -> Extracts information using beautifulSoup -> Switches tab for carrying out the tasks mentioned before for other links. As it is an iterative process, after scraping 20-25 links linkedin enables a security check/reCaptcha. This terminates the process.
Is there any way to login once and simultaneously scrape through all the links? OR any way to scrape through all the links bypassing security check
Script:-
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
''' Extracts the company name, location, and details from the given URL by web scraping LinkedIn '''
class Extractor :
def __init__(self,uid=None,pwd=None,url=None) -> None:
# removes warnings due to bluetooth and usb drivers
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-notifications")
experimentalFlags = ['same-site-by-default-cookies@1','cookies-without-same-site-must-be-secure@1']
chromeLocalStatePrefs = { 'browser.enabled_labs_experiments' : experimentalFlags}
chrome_options.add_experimental_option('localState',chromeLocalStatePrefs)
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
self.driver=webdriver.Chrome(executable_path=chromedrive_path,options=chrome_options)
self.driver.get("https://www.linkedin.com/uas/login")
#WebDriverWait(self.driver,5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,"#session_key")))
username = self.driver.find_element_by_id("username")
username.send_keys(uid) # Enter Your Email Address
pword = self.driver.find_element_by_id("password")
pword.send_keys(pwd) # Enter Your Password "Pagerank290_"
self.driver.find_element_by_xpath("//button[@type='submit']").click()
time.sleep(4)
WebDriverWait(self.driver, 100).until(EC.presence_of_element_located((By.ID, "global-nav")))
print("Login Successful.")
# self.driver.maximize_window()
# self.driver.delete_all_cookies()
# self.driver.get("https://linkedin.com/uas/login")
# time.sleep(5)
# self.driver.find_element_by_xpath("//button[@type='submit']").click()
self.driver.get(url)
self.soup=''
def scroll(self): # scrolls down the page to load entire page
start = time.time()
# will be used in the while loop
initialScroll = 0
finalScroll = 1000
while True:
self.driver.execute_script(f"window.scrollTo({initialScroll},{finalScroll})")
# this command scrolls the window starting from
# the pixel value stored in the initialScroll
# variable to the pixel value stored at the
# finalScroll variable
initialScroll = finalScroll
finalScroll += 1000
# we will stop the script for 3 seconds so that
# the data can load
time.sleep(3)
# You can change it as per your needs and internet speed
end = time.time()
# We will scroll for 20 seconds.
# You can change it as per your needs and internet speed
if round(end - start) > 20:
src = self.driver.page_source
self.soup = BeautifulSoup(src, 'html.parser')
break
def get_intro(self): # exrtracts info from the intro section
intro = self.soup.find('div', {'class': 'block mt2'})
#print(intro)
company_name_loc= intro.find('span',{'dir':'ltr'}) #, {'class': 't-24 t-black t-bold'})
company_name = company_name_loc.get_text().strip()
company_loc_inline_block = intro.find('div', {'class': 'inline-block'})
company_loc_loc = company_loc_inline_block.find('div', {'class': 'org-top-card-summary-info-list__info-item'})
company_loc = company_loc_loc.get_text().strip()
return company_name,company_loc
def get_overview_details(self): # extracts info from overview section
overview= self.soup.find('dl', {'class': 'overflow-hidden'})
overview_details1 = overview.find_all('dd', {'class': 'mb4 text-body-small t-black--light'})
details1,details2=[],[]
overview_details2 = overview.find_all('dd', {'class': 'text-body-small t-black--light mb1'})
for detail in overview_details1:
details1.append(detail.get_text().strip())
for detail in overview_details2:
details2.append(detail.get_text().strip())
return details1,details2
def close_page(self):
self.driver.close()
'''urls is a list containing LinkedIn hyperlinks of different companies'''
def switch_tab(urls):
for posts in range(len(urls)):
#print(posts)
if(posts!=len(urls)):
obj = Extractor(email,password,urls[posts])
obj.scroll()
try:
comp_name,comp_loc=obj.get_intro()
det1,det2=obj.get_overview_details()
#Tab switching
obj.driver.execute_script("window.open('');")
chwd = obj.driver.window_handles
obj.driver.switch_to.window(chwd[-1])
except Exception as e:
print(f' "{e}" exception occurred in {urls[posts]}')
continue
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
