Webscraping - HTML issue
I've been trying this code and having some success, but I cannot figure out the next step.
import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pprint
import json
# Question script: scrape the rankings table headers (step 1) and then try to
# reach the rider page's "points per speciality" data (step 2).

# --- Step 1: rankings page ---------------------------------------------------
url2 = "https://www.procyclingstats.com/rankings.php"
print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class':'basic'})
# NOTE(review): the {"class": "cu600"} filter keeps only the <th> cells that
# carry that class; presumably that is why just two headers come back.
# Dropping the class filter should return every header cell - confirm.
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)
#why do I only get two headers here (prev, team)?
# The page is downloaded with requests and the HTML text handed to pandas,
# rather than passing the URL to pd.read_html directly.
response = requests.get(url2)
dfs = pd.read_html(response.text)[0]
#print(list(dfs))
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
# print(dfs)
print(colored('#Step1','red'))

# --- Step 2: rider page ------------------------------------------------------
print(colored('#Step2','green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
# NOTE(review): the dict is passed as the first (tag name) argument and its
# value embeds the literal text class="..."; this presumably never matches
# the intended element. The usual form is soup.find(<tag>, {"class": ...}) -
# confirm against the page markup. table2 is not used further below.
table2 = soup.find({'class':'class="mt10 pps"'})
#headers = [heading.text for heading in table1.find_all('th',{"class":"cu600"})]
#print(headers)
# Usually the line below is enough
# But for some reason returning Forbidden
#dfs = pd.read_html(url)[0]
response = requests.get(url3)
dfs2 = pd.read_html(response.text)[0]
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
#print(dfs2)
# soup.find('h3') returns only the FIRST <h3> on the page, so the loop below
# prints the children of that single heading.
child_soup = soup.find('h3')
for i in child_soup.children:
    print("child : ", i)
# NOTE(review): the original indentation was lost; this spacer is assumed to
# sit after the loop rather than inside it - confirm.
print('\n'*3)
I end up with the child being "Rider" (output below).
I've been trying this code and having some success, but I cannot figure out the next step.
import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pprint
import json
# Question script: scrape the rankings table headers (step 1) and then try to
# reach the rider page's "points per speciality" data (step 2).

# --- Step 1: rankings page ---------------------------------------------------
url2 = "https://www.procyclingstats.com/rankings.php"
print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class':'basic'})
# NOTE(review): the {"class": "cu600"} filter keeps only the <th> cells that
# carry that class; presumably that is why just two headers come back.
# Dropping the class filter should return every header cell - confirm.
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)
#why do I only get two headers here (prev, team)?
# The page is downloaded with requests and the HTML text handed to pandas,
# rather than passing the URL to pd.read_html directly.
response = requests.get(url2)
dfs = pd.read_html(response.text)[0]
#print(list(dfs))
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
# print(dfs)
print(colored('#Step1','red'))

# --- Step 2: rider page ------------------------------------------------------
print(colored('#Step2','green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
# NOTE(review): the dict is passed as the first (tag name) argument and its
# value embeds the literal text class="..."; this presumably never matches
# the intended element. The usual form is soup.find(<tag>, {"class": ...}) -
# confirm against the page markup. table2 is not used further below.
table2 = soup.find({'class':'class="mt10 pps"'})
#headers = [heading.text for heading in table1.find_all('th',{"class":"cu600"})]
#print(headers)
# Usually the line below is enough
# But for some reason returning Forbidden
#dfs = pd.read_html(url)[0]
response = requests.get(url3)
dfs2 = pd.read_html(response.text)[0]
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
#print(dfs2)
# soup.find('h3') returns only the FIRST <h3> on the page, so the loop below
# prints the children of that single heading.
child_soup = soup.find('h3')
for i in child_soup.children:
    print("child : ", i)
# NOTE(review): the original indentation was lost; this spacer is assumed to
# sit after the loop rather than inside it - confirm.
print('\n'*3)
I end up with the child being "Rider" (output below).
#Step2
child : Rider
What I'm trying to capture is the 'points per speciality' section and its values.
There is a second question: why do I only get two header tags when they all appear to have the same name?
Solution 1:[1]
import re
# Answer script: collect every column header of the rankings table, then build
# a {speciality: points} mapping from the rider page.

# --- Rankings headers --------------------------------------------------------
url2 = "https://www.procyclingstats.com/rankings.php"
response = requests.get(url2)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find("table", {"class": "basic"})
# Take every <th> inside <thead> with no class filter, so all header cells
# are returned.
thead = table.find("thead")
headers = [heading.text for heading in thead.find_all("th")]

# --- Rider "points per speciality" -------------------------------------------
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, "lxml")
ul = soup.find("ul", {"class": "basic"})
li = ul.find_all("li")

# Each item's text is split at its first run of digits: group(1) is the number
# and group(2) is the text that follows it. A raw string keeps "\d" from being
# treated as an (invalid) string escape; compiling once hoists the pattern out
# of the loop.
points_pattern = re.compile(r"(\d+)(.*)")
d = {}
for item in li:
    m = points_pattern.search(item.text)
    if m is None:
        # An item without any digits carries no points value; skip it instead
        # of failing on m.group().
        continue
    d[m.group(2)] = m.group(1)
print(headers)
print(d)
# ['#', 'Prev.', 'Diff.', 'Rider', 'Team', 'Points']
# {'One day races': '1641', 'GC': '3444', 'Time trial': '1147', 'Sprint': '302', 'Climber': '3816'}
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | A D |
