Creating a table through a list for Pandas

I am having a heck of a time turning data that I have into a DataFrame with pandas. I feel like this is far from a difficult task, but I can't seem to figure it out. I have the headers I want for the DataFrame, and I have the data — but this is data scraped from the web. I know I need to turn it into a list and then pass that to the DataFrame constructor, but I am unable to figure out how to get this data into a list.

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd


# Scrape the ESPN golf leaderboard, expand each player's row, select a round,
# and print per-hole scores for every player.

# NOTE: use a raw string for the Windows path — in a normal string literal
# "\P", "\C", "\c" are invalid escape sequences (SyntaxWarning on Python 3.12+).
PATH = r"C:\Program Files (x86)\Chrome\chromedriver_win32\chromedriver.exe"

# Passing executable_path positionally is deprecated in Selenium 4;
# wrap the driver path in a Service object instead.
from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome(service=Service(PATH))

driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

number_of_players = 52  # how many leaderboard rows to expand
round_to_select = 3     # which round's scorecard to display (1-4)

# Click the expand arrow on the first `number_of_players` rows so the
# per-hole scorecard detail rows are added to the DOM.
for idx, down_arrow in enumerate(driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')):
    if idx >= number_of_players:
        break
    down_arrow.click()
    time.sleep(.5)  # give the page time to render the expanded row

# Round 4 is the default view, so a dropdown selection is only needed for 1-3.
if round_to_select < 4:
    for idx, menu in enumerate(driver.find_elements(By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')):
        if idx >= number_of_players:
            break
        Select(menu).select_by_visible_text(f'Round {round_to_select}')
        time.sleep(.5)

R1_page_source = driver.page_source

R1_soup = BeautifulSoup(R1_page_source, 'html.parser')

R1_leaderboard = R1_soup.find('table', class_='Table Table--align-right Full__Table')

for R1_player in R1_leaderboard.find_all('tbody'):
    R1_rows = R1_player.find_all('tr', class_='Table__TD--PlayerDetail Table__TR Table__even')
    # Tournament and course are page-level values — look them up once per
    # tbody instead of once per row (they never change between rows).
    R1_Tournament = R1_soup.find('h1', class_='headline headline__h1 Leaderboard__Event__Title').text
    R1_Course = R1_soup.find('div', class_='Leaderboard__Course__Location n8 clr-gray-04').text
    for R1_row in R1_rows:
        R1_Players = R1_row.find('a').text
        R1_Round = R1_row.find_all("select")[1].text
        # One DOM query for all span cells instead of 18 separate find_all calls.
        spans = R1_row.find_all('span')
        # Holes 1-9 are spans[1:10]; index 10 is skipped (presumably the OUT
        # subtotal column — TODO confirm), so the back nine are spans[11:20].
        hole_scores = [spans[i].text for i in list(range(1, 10)) + list(range(11, 20))]
        print(R1_Players, R1_Tournament, R1_Course, R1_Round, *hole_scores)


Solution 1:[1]

Here's a modified version of your code. I used `pandas.read_html` to turn the HTML table into a DataFrame.

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd

# I'm using Colab, so the driver is configured headless with the sandbox
# flags Colab requires; no explicit chromedriver path is needed.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

number_of_players = 52  # how many leaderboard rows to expand
round_to_select = 3     # which round's scorecard to display (1-4)

# Expand each player's row. Clicks occasionally fail (element intercepted /
# stale), so treat each click as best-effort and move on.
for idx, down_arrow in enumerate(driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')):
    if idx >= number_of_players:
        break
    try:
        down_arrow.click()
        time.sleep(.5)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C (KeyboardInterrupt) and
        # SystemExit still propagate instead of being silently swallowed.
        pass

# Round 4 is the default view, so a dropdown selection is only needed for 1-3.
if round_to_select < 4:
    for idx, menu in enumerate(driver.find_elements(By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')):
        if idx >= number_of_players:
            break
        try:
            Select(menu).select_by_visible_text(f'Round {round_to_select}')
            time.sleep(.5)
        except Exception:
            pass

R1_page_source = driver.page_source
R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table', class_='Table Table--align-right Full__Table')
# read_html returns a list of DataFrames (one per <table>); the prettified
# fragment contains exactly one table, hence the [0].
R1_df = pd.read_html(R1_leaderboard.prettify())[0]

`prettify` turns the bs4 object into a plain string that pandas can process. `read_html` actually returns a list of DataFrames — but in this case there's only one, which is why I added `[0]` at the end. When I display `R1_df`, I get this:

    Unnamed: 0  POS     PLAYER  SCORE   R1  R2  R3  R4  TOT     EARNINGS    FEDEX PTS
0   NaN     1   Scottie Scheffler   -10     69  67  71  71  278     $2,700,000  600
1   NaN     2   Rory McIlroy    -7  73  73  71  64  281     $1,620,000  330
2   NaN     T3  Shane Lowry     -5  73  68  73  69  283     $870,000    180
3   NaN     T3  Cameron Smith   -5  68  74  68  73  283     $870,000    180
4   NaN     5   Collin Morikawa     -4  73  70  74  67  284     $600,000    120
...     ...     ...     ...     ...     ...     ...     ...     ...     ...     ...     ...
86  NaN     -   Stewart Hagestad (a)    CUT     79  81  --  --  160     --  0
87  NaN     -   José María Olazábal     CUT     77  84  --  --  161     --  0
88  NaN     -   Laird Shepherd (a)  CUT     81  85  --  --  166     --  0
89  NaN     -   Louis Oosthuizen    WD  76  --  --  --  76  --  0
90  NaN     -   Paul Casey  WD  --  --  --  --  --  --  0

Hopefully this is what you were looking for! Because pandas can process HTML directly, there wasn't actually a need to build any intermediate lists.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 eva bacas