How to scrape Glassdoor salary data with BeautifulSoup and Selenium
I am trying to scrape salary data from the Glassdoor website. The problem is that during the process no data gets collected, and I am not sure where the error is.
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

import Salary  # local module providing the Salary class

email = ""     # your email here
password = ""  # your password here

# Manual options for the city, number of pages to scrape, and URL
pages = 700
cityName = "United-Kingdom"
cityURL = "https://www.glassdoor.co.uk/Salaries/uk-systems-engineer-salary-SRCH_IL.0,2_IN2_KO3,19.htm?clickSource=searchBtn"

def obj_dict(obj):
    return obj.__dict__
#enddef

def json_export(data):
    jsonFile = open(cityName + ".json", "w")
    jsonFile.write(json.dumps(data, indent=4, separators=(',', ': '), default=obj_dict))
    jsonFile.close()
#enddef

def init_driver():
    driver = webdriver.Chrome("C:/Users/mm/Desktop/glassdoor/chromedriver.exe")
    driver.wait = WebDriverWait(driver, 10)
    return driver
#enddef
def login(driver, email, password):
    driver.get("http://www.glassdoor.com/profile/login_input.htm")
    try:
        user_field = driver.wait.until(EC.presence_of_element_located(
            (By.NAME, "email")))
        pw_field = driver.find_element_by_class_name("signin-password")
        login_button = driver.find_element_by_id("signInBtn")
        user_field.send_keys(email)
        user_field.send_keys(Keys.TAB)
        time.sleep(1)
        pw_field.send_keys(password)
        time.sleep(1)
        login_button.click()
    except TimeoutException:
        print("TimeoutException! Email/password field or login button not found on glassdoor.com")
#enddef
def parse_salaries_HTML(salaries, data):
    for salary in salaries:
        jobTitle = "-"
        company = "-"
        meanPay = "-"
        jobTitle = salary.find("a", {"class": "jobTitle"}).getText().strip()
        company = salary.find("div", {"class": "i-emp"}).getText().strip()
        try:
            meanPay = salary.find("div", {"class": "meanPay"}).find("strong").getText().strip()
        except:
            meanPay = 'xxx'
        r = Salary.Salary(jobTitle, company, meanPay)
        data.append(r)
    return data
#enddef
def get_data(driver, URL, startPage, endPage, data, refresh):
    if (startPage > endPage):
        return data
    #endif
    print("\nPage " + str(startPage) + " of " + str(endPage))
    currentURL = URL + "_IP" + str(startPage) + ".htm"
    time.sleep(2)
    if (refresh):
        driver.get(currentURL)
        print("Getting " + currentURL)
    #endif
    time.sleep(2)
    HTML = driver.page_source
    soup = BeautifulSoup(HTML, "html.parser")
    salaries = soup.find("div", {"class": ["salaryChartModule"]})
    if salaries is not None:
        salaries = salaries.find_all("div", {"class": ["salaryRow"]})
    # Process further
    if (salaries):
        data = parse_salaries_HTML(salaries, data)
        print("Page " + str(startPage) + " scraped.")
        if (startPage % 10 == 0):
            print("\nTaking a breather for a few seconds ...")
            time.sleep(10)
        #endif
        get_data(driver, URL, startPage + 1, endPage, data, True)
    else:
        print("Waiting ... page still loading or CAPTCHA input required")
        time.sleep(3)
        get_data(driver, URL, startPage, endPage, data, False)
    #endif
    return data
#enddef
if __name__ == "__main__":
    driver = init_driver()
    time.sleep(3)
    print("Logging into Glassdoor account ...")
    login(driver, email, password)
    time.sleep(10)
    print("\nStarting data scraping ...")
    data = get_data(driver, cityURL[:-4], 1, pages, [], True)
    print("\nExporting data to " + cityName + ".json")
    json_export(data)
    driver.quit()
#endif
After running the code, the browser opens the Glassdoor page and logs into my account. After that I get the following output:
"Logging into Glassdoor account ... TimeoutException! Email/password field or login button not found on glassdoor.com
Starting data scraping ...
Page 1 of 700 Getting https://www.glassdoor.co.uk/Salaries/uk-systems-engineer-salary-SRCH_IL.0,2_IN2_KO3,19.htm?clickSource=searc_IP1.htm Waiting ... page still loading or CAPTCHA input required
Page 1 of 700 Waiting ... page still loading or CAPTCHA input required"
So the problem is that it never gets any data from the website and never moves on to the next page; the last two lines of the output just keep repeating over and over. I would really appreciate it if someone could tell me where the error is and how to fix it.
Solution 1:[1]
Your output shows that it has a problem logging in:

"Logging into Glassdoor account ... TimeoutException!
Email/password field or login button not found on glassdoor.com

You are simply searching for the wrong elements in the login form:

There is no name="email" but name="username" (or id="inlineUserEmail").
There is no class="signin-password" but name="password" (or id="inlineUserPassword").
There is no id="signInBtn" but name="submit".
#user_field = driver.wait.until(EC.presence_of_element_located(
#    (By.NAME, "email")))  # <-- wrong
user_field = driver.wait.until(EC.presence_of_element_located(
    (By.NAME, "username")))

#pw_field = driver.find_element_by_class_name("signin-password")  # <-- wrong
#pw_field = driver.find_element_by_id("inlineUserPassword")
pw_field = driver.find_element_by_name("password")

#login_button = driver.find_element_by_id("signInBtn")  # <-- wrong
login_button = driver.find_element_by_name("submit")
EDIT:

After logging in you may also be using the wrong values to get the data - but I don't have a login/password to access it and check whether your values are correct.

And if you can see the data without logging in, then you could remove the login code and show what the real problem is.

It seems you are using entirely wrong elements. There is no salaryChartModule, and there is no salaryRow. You should delete this part and start from scratch.

It is also a bad idea to call get_data() inside get_data() - the recursion may use more memory, and it makes it hard to get results back to the first call. You would always need return get_data(...) to send the data back up.
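One more detail is visible in the question's output ("Getting ...searc_IP1.htm"): cityURL[:-4] chops the last four characters of the query string, not the .htm suffix, so the paged URL is mangled. A small helper (page_url is a hypothetical name, not from the original code) that drops the query string before inserting the page marker:

```python
def page_url(url, page):
    """Build the per-page URL: drop any ?query part and insert
    the _IP<page> marker before the .htm suffix."""
    base = url.split("?", 1)[0]       # drop e.g. ?clickSource=searchBtn
    if base.endswith(".htm"):
        base = base[:-len(".htm")]    # strip the suffix, re-added below
    return base + "_IP" + str(page) + ".htm"
```

With the question's cityURL this yields ...SRCH_IL.0,2_IN2_KO3,19_IP1.htm instead of the broken ...searc_IP1.htm.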
To get to the next page you can do

driver.find_element_by_xpath('//button[@label="Next"]').click()

but then you would need to run all the scraping code in a while True loop to repeat it for each page, with a try/except to catch the error when the link to the next page can't be clicked, and a break to exit the loop:
while True:
    # ... code for scraping one page ...
    try:
        driver.find_element_by_xpath('//button[@label="Next"]').click()
    except Exception as ex:
        print('Exception:', ex)
        break
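The iterative structure can be sketched independently of Selenium. Here fetch_page, click_next, and parse are hypothetical callables standing in for the driver calls, just to show how the loop replaces the recursion:

```python
def scrape_all(fetch_page, click_next, parse):
    """Collect parsed rows page by page until clicking 'Next' fails."""
    data = []
    while True:
        html = fetch_page()        # e.g. driver.page_source after a wait
        data.extend(parse(html))   # e.g. parse_salaries_HTML(...)
        try:
            click_next()           # e.g. the XPath click shown above
        except Exception as ex:
            print("No next page:", ex)
            break
    return data


# A quick dry run with stubs: three pages, then "Next" stops working.
fake_pages = iter(["p1", "p2", "p3"])
def fetch_page(): return next(fake_pages)
clicks = [None, None]              # two successful clicks, then fail
def click_next():
    if not clicks:
        raise RuntimeError("Next button not clickable")
    clicks.pop()
def parse(html): return [html.upper()]

print(scrape_all(fetch_page, click_next, parse))  # ['P1', 'P2', 'P3']
```

Because the loop returns a single flat list, there is no recursion depth to worry about and the result naturally arrives back at the caller.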
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | |
