Creating a table through a list for Pandas
I am having a heck of a time turning data that I have into a DataFrame with Pandas. I feel like this is far from a difficult task, but I can't seem to figure it out. I have the headers I want for the DataFrame and I have the data, but the data comes from the web. I know I need to turn it into a list and then pass that to the DataFrame constructor, but I am unable to figure out how to get this thing into a list.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd
PATH = "C:\Program Files (x86)\Chrome\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")
number_of_players = 52
round_to_select = 3
for idx, down_arrow in enumerate(driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')):
    if idx < number_of_players:
        down_arrow.click()
        time.sleep(.5)
    else:
        break
if round_to_select < 4:
    for idx, menu in enumerate(driver.find_elements(By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')):
        if idx < number_of_players:
            Select(menu).select_by_visible_text(f'Round {round_to_select}')
            time.sleep(.5)
        else:
            break
R1_page_source = driver.page_source
R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table' , class_ = 'Table Table--align-right Full__Table')
for R1_player in R1_leaderboard.find_all('tbody'):
    R1_rows = R1_player.find_all('tr', class_='Table__TD--PlayerDetail Table__TR Table__even')
    for R1_row in R1_rows:
        R1_Tournament = R1_soup.find('h1', class_='headline headline__h1 Leaderboard__Event__Title').text
        R1_Course = R1_soup.find('div', class_='Leaderboard__Course__Location n8 clr-gray-04').text
        R1_Players = R1_row.find('a').text
        R1_Round = R1_row.find_all("select")[1].text
        R1_H1 = R1_row.find_all('span')[1].text
        R1_H2 = R1_row.find_all('span')[2].text
        R1_H3 = R1_row.find_all('span')[3].text
        R1_H4 = R1_row.find_all('span')[4].text
        R1_H5 = R1_row.find_all('span')[5].text
        R1_H6 = R1_row.find_all('span')[6].text
        R1_H7 = R1_row.find_all('span')[7].text
        R1_H8 = R1_row.find_all('span')[8].text
        R1_H9 = R1_row.find_all('span')[9].text
        R1_H10 = R1_row.find_all('span')[11].text
        R1_H11 = R1_row.find_all('span')[12].text
        R1_H12 = R1_row.find_all('span')[13].text
        R1_H13 = R1_row.find_all('span')[14].text
        R1_H14 = R1_row.find_all('span')[15].text
        R1_H15 = R1_row.find_all('span')[16].text
        R1_H16 = R1_row.find_all('span')[17].text
        R1_H17 = R1_row.find_all('span')[18].text
        R1_H18 = R1_row.find_all('span')[19].text
        print(R1_Players, R1_Tournament, R1_Course, R1_Round, R1_H1, R1_H2, R1_H3, R1_H4, R1_H5, R1_H6, R1_H7, R1_H8, R1_H9, R1_H10, R1_H11, R1_H12, R1_H13, R1_H14, R1_H15, R1_H16, R1_H17, R1_H18)
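For reference, the list-based pattern the question describes (collect one list or dict per row, then hand the whole list to pandas) looks roughly like this. This is a minimal sketch with placeholder data and illustrative column names, not the real ESPN values:

import pandas as pd

# Build one dict per scraped row; pandas infers the columns from the keys.
# The values here are placeholders standing in for the scraping loop above.
rows = []
for player, score in [("Player A", "-10"), ("Player B", "-7")]:
    rows.append({"PLAYER": player, "SCORE": score})

df = pd.DataFrame(rows)
print(df)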
Solution 1:[1]
Here's a modified version of your code. I used pandas.read_html to turn the HTML table into a DataFrame directly.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd
# I'm using Colab to do this, so I'm setting up the driver differently
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)
driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")
number_of_players = 52
round_to_select = 3
# I had to add a try/except here because I kept getting errors on down_arrow.click()
for idx, down_arrow in enumerate(driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')):
    if idx < number_of_players:
        try:
            down_arrow.click()
            time.sleep(.5)
        except:
            pass
    else:
        break
if round_to_select < 4:
    for idx, menu in enumerate(driver.find_elements(By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')):
        if idx < number_of_players:
            try:
                Select(menu).select_by_visible_text(f'Round {round_to_select}')
                time.sleep(.5)
            except:
                pass
        else:
            break
R1_page_source = driver.page_source
R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table' , class_ = 'Table Table--align-right Full__Table')
R1_df = pd.read_html(R1_leaderboard.prettify())[0]
prettify() turns the bs4 object into a plain string that pandas can process. read_html actually returns a list of DataFrames - but in this case there's only one, which is why I added [0] at the end. When I run R1_df I get this:
Unnamed: 0 POS PLAYER SCORE R1 R2 R3 R4 TOT EARNINGS FEDEX PTS
0 NaN 1 Scottie Scheffler -10 69 67 71 71 278 $2,700,000 600
1 NaN 2 Rory McIlroy -7 73 73 71 64 281 $1,620,000 330
2 NaN T3 Shane Lowry -5 73 68 73 69 283 $870,000 180
3 NaN T3 Cameron Smith -5 68 74 68 73 283 $870,000 180
4 NaN 5 Collin Morikawa -4 73 70 74 67 284 $600,000 120
... ... ... ... ... ... ... ... ... ... ... ...
86 NaN - Stewart Hagestad (a) CUT 79 81 -- -- 160 -- 0
87 NaN - José María Olazábal CUT 77 84 -- -- 161 -- 0
88 NaN - Laird Shepherd (a) CUT 81 85 -- -- 166 -- 0
89 NaN - Louis Oosthuizen WD 76 -- -- -- 76 -- 0
90 NaN - Paul Casey WD -- -- -- -- -- -- 0
Hopefully this is what you were looking for! Because pandas can process HTML directly, there wasn't actually any need to build an intermediate list.
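To see the list-of-DataFrames behavior in isolation, here's a self-contained sketch using an inline HTML string instead of the ESPN page. Note that read_html needs a parser such as lxml or html5lib installed, and newer pandas versions prefer the raw string wrapped in StringIO:

from io import StringIO
import pandas as pd

# read_html returns one DataFrame per <table> it finds, so even a
# lone table has to be pulled out of the list with [0].
html = "<table><tr><th>PLAYER</th><th>SCORE</th></tr><tr><td>Scottie Scheffler</td><td>-10</td></tr></table>"
tables = pd.read_html(StringIO(html))
print(len(tables))  # 1
print(tables[0])    # the single DataFrame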
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | eva bacas |
