'Python web scraping indeed, BeatifulSoup, unpredictable results
Here I have a web scraper using Python and BeautifulSoup that scrapes indeed and gathers job titles, companies, and salary data. The salary data can be all sorts of different entries on indeed ie "5.2k per month" or "100,000 per year" etc. Thus I have a function to change this data, regardless of the format, to an actual salary (yearly) integer like '100000'. Some jobs have no salary data listed in the salary portion and for those jobs I am returning 0.
The problem I am running into is when a job has no salary data listed, sometimes its salary data gets logged as the previous jobs salary data. This does not happen every time and seems to have varrying results with how often it happens and on which jobs, some jobs with no salary data log perfectly fine as 0.
Here is an example of dictionary output with the problem from when I scraped two pages of jobs for Network Architects and Carpenters, here at the bottom is one example of the problem:
{'title': 'new Carpenter', 'company': 'County of Orange', 'salary': 67694, 'href': 'https://ca.indeed.com/viewjob?jk=0d2ce3b674852afa', 'id': 'job_0d2ce3b674852afa'}, {'title': 'new Carpenter', 'company': 'TIC', 'salary': 67694, 'href': 'https://ca.indeed.com/viewjob?jk=73f24cf56510c78a', 'id': 'job_73f24cf56510c78a'},
I have the links pulled from the site to verify if anywhere there is salary data listed aswell.
This is also my first python and webscraping project so if there is a better way to do this entirely that would also solve the issue then that would suffice as well.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
def extract(page, job):
headers = headers
url = f'https://www.indeed.com/jobs?q={job}&start={page}&vjk=e8bcf3fbe7498a5f'
r = requests.get(url,headers)
soup = BeautifulSoup(r.content, 'html.parser')
return soup
# Turn any type of salary data into one type
def filter_salary(x, knumber, number, first_list):
if x[-1] == 'K':
y = list(x)
y[-1] = ''
z = ''.join(y)
z1 = re.sub(r'[$]', '', z)
z2 = int(float(z1)) * knumber
first_list.append(z2)
else:
y = list(x)
z = ''.join(y)
z1 = re.sub(r'[$]', '', z)
z2 = int(float(z1)) * number
first_list.append(z2)
def transform(soup):
#get job title company and salary data from inded
for job in soup.select('.result'):
title = job.select_one('.jobTitle').get_text(' ')
company = job.find(class_='companyName').text
id1 = job.get('id')
try:
salary = job.find(class_='attribute_snippet').text
except:
salary = 0
print(f"title: {title}")
print(f"company: {company}")
#filter out salary data and return in salary ie $100 an hour ->
if salary != 0:
salary2 = re.sub(',', '', salary)
if 'an hour'in salary2:
salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
first_list=[]
for x in salary3:
filter_salary(x, 2080000, 2080, first_list)
elif 'a day' in salary2:
salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
first_list=[]
for x in salary3:
filter_salary(x, 365000, 365, first_list)
elif 'a week' in salary2:
salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
first_list=[]
for x in salary3:
filter_salary(x, 52000, 52, first_list)
elif 'a month' in salary2:
salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
first_list=[]
for x in salary3:
filter_salary(x, 12000, 12, first_list)
else:
salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
first_list=[]
for x in salary3:
filter_salary(x, 1000, 1, first_list)
# Average the salary range if two are give ie 10-20 becomes one int 15
if len(first_list) == 2:
res = int(float((first_list[0] + first_list[1]) / 2))
elif len(first_list) == 1:
res = int(float((first_list[0])))
elif len(first_list) == 0 or None:
res = 0
else:
res = 0
#sleep to avoid to many url requests per second
time.sleep(1)
jobLinkUrl = f'https://ca.indeed.com/viewjob?jk={job["data-jk"]}'
job = {
'title': title,
'company': company,
'salary': res,
'href': jobLinkUrl,
'id': id1
}
joblist.append(job)
return
joblist = []
#search 2 pages of each Network Architect and Carpenter
for i in range(0,20,10):
print(f'Getting page, {i}')
jobSearchList = ['Network%20Architect', 'Carpenter']
for job in jobSearchList:
c = extract(0, job)
transform(c)
print(joblist)
print('done')
Solution 1:[1]
if salary != 0 and != [] and != None:
salary2 = re.sub(',', '', salary)
if 'an hour'in salary2:
salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
first_list=[]
for x in salary3:
filter_salary(x, 2080000, 2080, first_list)
elif 'a day' in salary2:
salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
first_list=[]
for x in salary3:
filter_salary(x, 365000, 365, first_list)
elif 'a week' in salary2:
salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
first_list=[]
for x in salary3:
filter_salary(x, 52000, 52, first_list)
elif 'a month' in salary2:
salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
first_list=[]
for x in salary3:
filter_salary(x, 12000, 12, first_list)
else:
salary3 = re.findall('\$\d+(?:\.\d+)?K?', salary2)
first_list=[]
for x in salary3:
filter_salary(x, 1000, 1, first_list)
else:
res:0
The additional else: res = 0 fixed the issue, but also added the and != [] and != None for good measure.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Caleb |
