'Python web scraping indeed, BeatifulSoup, unpredictable results

Here I have a web scraper using Python and BeautifulSoup that scrapes indeed and gathers job titles, companies, and salary data. The salary data can be all sorts of different entries on indeed ie "5.2k per month" or "100,000 per year" etc. Thus I have a function to change this data, regardless of the format, to an actual salary (yearly) integer like '100000'. Some jobs have no salary data listed in the salary portion and for those jobs I am returning 0.

The problem I am running into is when a job has no salary data listed, sometimes its salary data gets logged as the previous jobs salary data. This does not happen every time and seems to have varrying results with how often it happens and on which jobs, some jobs with no salary data log perfectly fine as 0.

Here is an example of dictionary output with the problem from when I scraped two pages of jobs for Network Architects and Carpenters, here at the bottom is one example of the problem:

{'title': 'new Carpenter', 'company': 'County of Orange', 'salary': 67694, 'href': 'https://ca.indeed.com/viewjob?jk=0d2ce3b674852afa', 'id': 'job_0d2ce3b674852afa'}, {'title': 'new Carpenter', 'company': 'TIC', 'salary': 67694, 'href': 'https://ca.indeed.com/viewjob?jk=73f24cf56510c78a', 'id': 'job_73f24cf56510c78a'},

I have the links pulled from the site to verify if anywhere there is salary data listed aswell.

This is also my first python and webscraping project so if there is a better way to do this entirely that would also solve the issue then that would suffice as well.

Code:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
       
   


def extract(page, job):
    headers = headers
    url = f'https://www.indeed.com/jobs?q={job}&start={page}&vjk=e8bcf3fbe7498a5f'
    r = requests.get(url,headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup



# Turn any type of salary data into one type
def filter_salary(x, knumber, number, first_list):
    if x[-1] == 'K':
        y = list(x)
        y[-1] = ''
        z = ''.join(y)
        z1 = re.sub(r'[$]', '', z)
        z2 = int(float(z1)) * knumber 
        first_list.append(z2)


    else:
        y = list(x)
        z = ''.join(y)
        z1 = re.sub(r'[$]', '', z)
        z2 = int(float(z1)) * number
        first_list.append(z2)

        



def transform(soup):
   
    #get job title company and salary data from inded
    for job in soup.select('.result'):
        title = job.select_one('.jobTitle').get_text(' ')
        company = job.find(class_='companyName').text
        id1 = job.get('id')
        
        try:
            salary = job.find(class_='attribute_snippet').text
        except:
            salary = 0
            
        print(f"title: {title}")
        print(f"company: {company}")
        
                   
       #filter out salary data and return in salary ie $100 an hour ->
        if salary != 0:
            salary2 = re.sub(',', '', salary)
            if 'an hour'in salary2:
                salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
                first_list=[]
                for x in salary3:
                    filter_salary(x, 2080000, 2080, first_list)
                    

            elif 'a day' in salary2:
                salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
                first_list=[]
                for x in salary3:
                    filter_salary(x, 365000, 365, first_list)
                   

            elif 'a week' in salary2:
                salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
                first_list=[]
                for x in salary3:
                    filter_salary(x, 52000, 52, first_list)
                   

            elif 'a month' in salary2:
                salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
                first_list=[]
                for x in salary3:
                    filter_salary(x, 12000, 12, first_list)
                   
            else:
                salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
                first_list=[]
                for x in salary3:
                    filter_salary(x, 1000, 1, first_list)
                   
                  

            # Average the salary range if two are give ie 10-20 becomes one int 15        
            if len(first_list) == 2:
                
                res = int(float((first_list[0] + first_list[1]) / 2))
                
    
            elif len(first_list) == 1:
                
                res = int(float((first_list[0])))

                
            elif len(first_list) == 0 or None:
                
                res = 0
                
                
            else:
                
                res = 0
           
               
           

               
        #sleep to avoid to many url requests per second
        time.sleep(1)
        
        
        jobLinkUrl = f'https://ca.indeed.com/viewjob?jk={job["data-jk"]}'


        job = {
            'title': title,
            'company': company,
            'salary': res,
            'href': jobLinkUrl,
            'id': id1
        }
        joblist.append(job)
       
       
    return

   
       
joblist = []


#search 2 pages of each Network Architect and Carpenter
for i in range(0,20,10):
    print(f'Getting page, {i}')
    jobSearchList = ['Network%20Architect', 'Carpenter']
    for job in jobSearchList:
        c = extract(0, job)
        transform(c)
print(joblist)
print('done')


Solution 1:[1]

      if salary != 0 and != [] and != None:
            salary2 = re.sub(',', '', salary)
            if 'an hour'in salary2:
                salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
                first_list=[]
                for x in salary3:
                    filter_salary(x, 2080000, 2080, first_list)
                    

            elif 'a day' in salary2:
                salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
                first_list=[]
                for x in salary3:
                    filter_salary(x, 365000, 365, first_list)
                   

            elif 'a week' in salary2:
                salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
                first_list=[]
                for x in salary3:
                    filter_salary(x, 52000, 52, first_list)
                   

            elif 'a month' in salary2:
                salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
                first_list=[]
                for x in salary3:
                    filter_salary(x, 12000, 12, first_list)
                   
            else:
                salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
                first_list=[]
                for x in salary3:
                    filter_salary(x, 1000, 1, first_list)
     else:
          res:0

              

The additional else: res = 0 fixed the issue, but also added the and != [] and != None for good measure.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Caleb