Web Scraping: multiple LinkedIn sites
I am trying to scrape 100 company names, locations, and overviews from LinkedIn using Selenium and BeautifulSoup. The script runs as follows:
Login to Linkedin -> Opens a company's about page -> Loads the entire page -> Extracts information using beautifulSoup -> Switches tab for carrying out the tasks mentioned before for other links. As it is an iterative process, after scraping 20-25 links linkedin enables a security check/reCaptcha. This terminates the process.
Is there any way to login once and simultaneously scrape through all the links? OR any way to scrape through all the links bypassing security check
Script:-
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
''' Extracts the company name, location, and details from the given URL by web scraping LinkedIn '''
class Extractor :
def __init__(self,uid=None,pwd=None,url=None) -> None:
# removes warnings due to bluetooth and usb drivers
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-notifications")
experimentalFlags = ['same-site-by-default-cookies@1','cookies-without-same-site-must-be-secure@1']
chromeLocalStatePrefs = { 'browser.enabled_labs_experiments' : experimentalFlags}
chrome_options.add_experimental_option('localState',chromeLocalStatePrefs)
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
self.driver=webdriver.Chrome(executable_path=chromedrive_path,options=chrome_options)
self.driver.get("https://www.linkedin.com/uas/login")
#WebDriverWait(self.driver,5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,"#session_key")))
username = self.driver.find_element_by_id("username")
username.send_keys(uid) # Enter Your Email Address
pword = self.driver.find_element_by_id("password")
pword.send_keys(pwd) # Enter Your Password "Pagerank290_"
self.driver.find_element_by_xpath("//button[@type='submit']").click()
time.sleep(4)
WebDriverWait(self.driver, 100).until(EC.presence_of_element_located((By.ID, "global-nav")))
print("Login Successful.")
# self.driver.maximize_window()
# self.driver.delete_all_cookies()
# self.driver.get("https://linkedin.com/uas/login")
# time.sleep(5)
# self.driver.find_element_by_xpath("//button[@type='submit']").click()
self.driver.get(url)
self.soup=''
def scroll(self): # scrolls down the page to load entire page
start = time.time()
# will be used in the while loop
initialScroll = 0
finalScroll = 1000
while True:
self.driver.execute_script(f"window.scrollTo({initialScroll},{finalScroll})")
# this command scrolls the window starting from
# the pixel value stored in the initialScroll
# variable to the pixel value stored at the
# finalScroll variable
initialScroll = finalScroll
finalScroll += 1000
# we will stop the script for 3 seconds so that
# the data can load
time.sleep(3)
# You can change it as per your needs and internet speed
end = time.time()
# We will scroll for 20 seconds.
# You can change it as per your needs and internet speed
if round(end - start) > 20:
src = self.driver.page_source
self.soup = BeautifulSoup(src, 'html.parser')
break
def get_intro(self): # exrtracts info from the intro section
intro = self.soup.find('div', {'class': 'block mt2'})
#print(intro)
company_name_loc= intro.find('span',{'dir':'ltr'}) #, {'class': 't-24 t-black t-bold'})
company_name = company_name_loc.get_text().strip()
company_loc_inline_block = intro.find('div', {'class': 'inline-block'})
company_loc_loc = company_loc_inline_block.find('div', {'class': 'org-top-card-summary-info-list__info-item'})
company_loc = company_loc_loc.get_text().strip()
return company_name,company_loc
def get_overview_details(self): # extracts info from overview section
overview= self.soup.find('dl', {'class': 'overflow-hidden'})
overview_details1 = overview.find_all('dd', {'class': 'mb4 text-body-small t-black--light'})
details1,details2=[],[]
overview_details2 = overview.find_all('dd', {'class': 'text-body-small t-black--light mb1'})
for detail in overview_details1:
details1.append(detail.get_text().strip())
for detail in overview_details2:
details2.append(detail.get_text().strip())
return details1,details2
def close_page(self):
self.driver.close()
'''urls is a list containing LinkedIn hyperlinks of different companies'''
def switch_tab(urls):
for posts in range(len(urls)):
#print(posts)
if(posts!=len(urls)):
obj = Extractor(email,password,urls[posts])
obj.scroll()
try:
comp_name,comp_loc=obj.get_intro()
det1,det2=obj.get_overview_details()
#Tab switching
obj.driver.execute_script("window.open('');")
chwd = obj.driver.window_handles
obj.driver.switch_to.window(chwd[-1])
except Exception as e:
print(f' "{e}" exception occurred in {urls[posts]}')
continue
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
