Webscraping - HTML issue

I've been trying this code and having some success, but I cannot figure out the next step.

import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh

import pprint
import json



# Step 1: fetch the rankings page, collect header cells from its "basic"
# table with BeautifulSoup, then parse the page's first table with pandas.
url2 = "https://www.procyclingstats.com/rankings.php"



print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
# First <table> on the page whose class is "basic".
table = soup.find('table', {'class':'basic'})
# Only <th> cells that carry the class "cu600" are selected; any <th> without
# that class is filtered out of `headers`.
# NOTE(review): presumably this class filter is why only two headers come
# back -- verify against the page markup.
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)

#why do I only get two headers here (prev, team)?





# The same URL is requested a second time; pandas parses the tables found in
# the returned HTML and [0] keeps the first one.
response = requests.get(url2)
dfs = pd.read_html(response.text)[0]
#print(list(dfs))


#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
 #  print(dfs)
   
# Closing marker for step 1 (same label as the opening marker, printed in red).
print(colored('#Step1','red'))


# Step 2: fetch a single rider page, parse its first table with pandas, and
# print the direct children of the first <h3> element.
print(colored('#Step2','green'))

url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"

response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
# NOTE(review): the dict is passed as find()'s first positional argument (the
# tag-name filter), not as `attrs`, and its value embeds the literal text
# 'class="..."'. Presumably this does not match the intended element --
# verify. `table2` is not used again in this snippet.
table2 = soup.find({'class':'class="mt10 pps"'})
#headers = [heading.text for heading in table1.find_all('th',{"class":"cu600"})]
#print(headers)



# Usually the line below is enough
# But for some reason returning Forbidden
#dfs = pd.read_html(url)[0]

# The page is downloaded with requests and the HTML text is handed to pandas
# instead of passing the URL directly (see the "Forbidden" remark above).
response = requests.get(url3)
dfs2 = pd.read_html(response.text)[0]


#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #print(dfs2)
 

# find('h3') returns only the first <h3> in the document.
child_soup = soup.find('h3')
    
# Print each direct child node of that first <h3>.
for i in child_soup.children:
      print("child :  ", i)
    
# Emit blank lines to separate this output from whatever follows.
print('\n'*3)

I end up with "Rider" as the only child (result text below).

I've been trying this code and having some success, but I cannot figure out the next step.

import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh

import pprint
import json



# Step 1: fetch the rankings page, collect header cells from its "basic"
# table with BeautifulSoup, then parse the page's first table with pandas.
url2 = "https://www.procyclingstats.com/rankings.php"



print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
# First <table> on the page whose class is "basic".
table = soup.find('table', {'class':'basic'})
# Only <th> cells that carry the class "cu600" are selected; any <th> without
# that class is filtered out of `headers`.
# NOTE(review): presumably this class filter is why only two headers come
# back -- verify against the page markup.
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)

#why do I only get two headers here (prev, team)?





# The same URL is requested a second time; pandas parses the tables found in
# the returned HTML and [0] keeps the first one.
response = requests.get(url2)
dfs = pd.read_html(response.text)[0]
#print(list(dfs))


#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
 #  print(dfs)
   
# Closing marker for step 1 (same label as the opening marker, printed in red).
print(colored('#Step1','red'))


# Step 2: fetch a single rider page, parse its first table with pandas, and
# print the direct children of the first <h3> element.
print(colored('#Step2','green'))

url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"

response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
# NOTE(review): the dict is passed as find()'s first positional argument (the
# tag-name filter), not as `attrs`, and its value embeds the literal text
# 'class="..."'. Presumably this does not match the intended element --
# verify. `table2` is not used again in this snippet.
table2 = soup.find({'class':'class="mt10 pps"'})
#headers = [heading.text for heading in table1.find_all('th',{"class":"cu600"})]
#print(headers)



# Usually the line below is enough
# But for some reason returning Forbidden
#dfs = pd.read_html(url)[0]

# The page is downloaded with requests and the HTML text is handed to pandas
# instead of passing the URL directly (see the "Forbidden" remark above).
response = requests.get(url3)
dfs2 = pd.read_html(response.text)[0]


#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #print(dfs2)
 

# find('h3') returns only the first <h3> in the document.
child_soup = soup.find('h3')
    
# Print each direct child node of that first <h3>.
for i in child_soup.children:
      print("child :  ", i)
    
# Emit blank lines to separate this output from whatever follows.
print('\n'*3)

I end up with "Rider" as the only child (result text below).

#Step2
child :   Rider

What I'm trying to capture is the 'points per speciality' section and its values.

I also have a second question: why do I only get two header tags when they all appear to have the same name?

Photo and arrow showing desired result

Solution 1:^[1]

import re

# Part 1: column headers of the rankings table.
# Every <th> inside the table's <thead> is taken, with no CSS-class filter,
# so all header cells are returned.
url2 = "https://www.procyclingstats.com/rankings.php"
response = requests.get(url2)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find("table", {"class": "basic"})
thead = table.find("thead")
headers = [heading.text for heading in thead.find_all("th")]


# Part 2: "points per speciality" on the rider page.
# The values are read from the <li> items of the first <ul class="basic">;
# each item's text is expected to be a number followed by its label
# (see the sample output at the bottom).
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"

response = requests.get(url3)
soup = BeautifulSoup(response.text, "lxml")
ul = soup.find("ul", {"class": "basic"})
li = ul.find_all("li")
# Raw string so that \d is the regex digit class rather than a string escape;
# compiled once because it is reused for every list item.
points_pattern = re.compile(r"(\d+)(.*)")
d = {}
for item in li:
    m = points_pattern.search(item.text)
    if m is None:
        # No digits in this entry, so there is no points value to record.
        continue
    # group(1) is the run of digits (points), group(2) the text after it (label).
    d[m.group(2)] = m.group(1)

print(headers)
print(d)

# ['#', 'Prev.', 'Diff.', 'Rider', 'Team', 'Points']
# {'One day races': '1641', 'GC': '3444', 'Time trial': '1147', 'Sprint': '302', 'Climber': '3816'}

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution	Source
Solution 1	A D

Webscraping - HTML issue

Solution 1:[1]

Sources

Related Questions

Solution 1:^[1]