How to set a function to scrape data from multiple pages?
I was trying to scrape some data from a Court here in Brazil (yes, it is legal), and everything is going fine, except for one thing: I am not able to set a function that clicks
on the button to change the page and then re-runs the scraping code, adding the
data to the df (I have tried assigning the scrape result to a variable and calling .append() on it, without success — am I using it correctly?)
Any advice?
MWE:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
# Scrape the "paragrafoBRS" entries from the STJ thematic-index page and
# write them to an Excel file, one DataFrame batch per repeated header.
url = "https://scon.stj.jus.br/SCON/legaplic/toc.jsp?materia=%27Lei+8.429%2F1992+%28Lei+DE+IMPROBIDADE+ADMINISTRATIVA%29%27.mat.&b=TEMA&p=true&t=&l=1&i=18&ordem=MAT,@NUM"
option = Options()
option.headless = True
# FIX: the configured options were never passed to the driver, so the
# browser did not actually run headless.
driver = webdriver.Firefox(options=option)
driver.get(url)
time.sleep(5)
driver.find_element(by=By.LINK_TEXT, value='§ 6o A ação será instruída com documentos ou justificação que contenham indícios suficientes da existência do ato de improbidade ou com razões fundamentadas da impossibilidade de apresentação de qualquer dessas provas, observada a legislação vigente, inclusive as disposições inscritas nos arts. 16 a 18 do Código de Processo Civil. (Incluído pela Medida Provisória nº 2.225-45, de 2001)').click()
time.sleep(5)
# Switch focus to the newly opened window; otherwise its content cannot be scraped.
driver.switch_to.window(driver.window_handles[-1])
element = driver.find_element(by=By.CLASS_NAME, value="listadocumentos")
html_content = element.get_attribute('outerHTML')
resultados = BeautifulSoup(driver.page_source, 'lxml')
paragrafoBRS = resultados.find_all('div', attrs={'class':'paragrafoBRS'})
# NOTE: clicking alone does not refresh `resultados`/`paragrafoBRS`; the
# page must be re-parsed after the click (see the accepted solution).
driver.find_element(by=By.CLASS_NAME, value='iconeProximaPagina.temHint').click()
driver.quit()
header = []
content = []
for each in paragrafoBRS:
    header.append(each.find('div', {'class':'docTitulo'}).text)
    content.append(each.find(['div','pre'], {'class':'docTexto'}).text)
# Group consecutive (header, content) pairs into one-row dicts; flush a
# batch to a DataFrame whenever a header repeats.
dataDict = {}
frames = []
for i in range(len(header)):
    if header[i] in dataDict:
        # FIX: DataFrame.append was removed in pandas 2.0 — collect the
        # batches and concatenate once at the end instead.
        frames.append(pd.DataFrame(dataDict))
        dataDict = {}
    dataDict[header[i]] = [content[i]]
if dataDict:
    # FIX: the original never flushed the final batch, silently dropping it.
    frames.append(pd.DataFrame(dataDict))
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.to_excel('data.xlsx')
Solution 1:[1]
After click(), Selenium automatically updates driver.page_source, but not resultados and the other variables, so you have to run again
resultados = BeautifulSoup(driver.page_source, 'lxml')
# rest
And this replaces the old content, so you should first run for each in paragrafoBRS: to get the data from the first page, then click(), then load resultados again and run for each in paragrafoBRS: again to get the data from the next page.
Something like this:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
# Solution script: scrape the first page, click "next page", re-parse the
# new page source, scrape again, then write everything to Excel.
url = "https://scon.stj.jus.br/SCON/legaplic/toc.jsp?materia=%27Lei+8.429%2F1992+%28Lei+DE+IMPROBIDADE+ADMINISTRATIVA%29%27.mat.&b=TEMA&p=true&t=&l=1&i=18&ordem=MAT,@NUM"
option = Options()
option.headless = True
# FIX: pass the configured options to the driver; otherwise the headless
# setting above has no effect.
driver = webdriver.Firefox(options=option)
driver.get(url)
time.sleep(5)
driver.find_element(by=By.LINK_TEXT, value='§ 6o A ação será instruída com documentos ou justificação que contenham indícios suficientes da existência do ato de improbidade ou com razões fundamentadas da impossibilidade de apresentação de qualquer dessas provas, observada a legislação vigente, inclusive as disposições inscritas nos arts. 16 a 18 do Código de Processo Civil. (Incluído pela Medida Provisória nº 2.225-45, de 2001)').click()
time.sleep(5)
# Switch focus to the newly opened window; otherwise its content cannot be scraped.
driver.switch_to.window(driver.window_handles[-1])
element = driver.find_element(by=By.CLASS_NAME, value="listadocumentos")
html_content = element.get_attribute('outerHTML')
# --- before all pages --
header = []
content = []
# --- get data from first page --
resultados = BeautifulSoup(driver.page_source, 'lxml')
paragrafoBRS = resultados.find_all('div', attrs={'class':'paragrafoBRS'})
for each in paragrafoBRS:
    header.append(each.find('div', {'class':'docTitulo'}).text)
    content.append(each.find(['div','pre'], {'class':'docTexto'}).text)
print('len(content):', len(content))
# --- load next page ---
print('click')
driver.find_element(by=By.CLASS_NAME, value='iconeProximaPagina.temHint').click()
time.sleep(5)
# --- get data from next page --
resultados = BeautifulSoup(driver.page_source, 'lxml')
paragrafoBRS = resultados.find_all('div', attrs={'class':'paragrafoBRS'})
for each in paragrafoBRS:
    header.append(each.find('div', {'class':'docTitulo'}).text)
    content.append(each.find(['div','pre'], {'class':'docTexto'}).text)
print('len(content):', len(content))
# ---
driver.quit()
# Group consecutive (header, content) pairs; flush a batch whenever a
# header repeats.
dataDict = {}
frames = []
for i in range(len(header)):
    if header[i] in dataDict:
        # FIX: DataFrame.append was removed in pandas 2.0 — collect the
        # batches and concatenate once at the end instead.
        frames.append(pd.DataFrame(dataDict))
        dataDict = {}
    dataDict[header[i]] = [content[i]]
if dataDict:
    # FIX: the original never flushed the final batch, silently dropping it.
    frames.append(pd.DataFrame(dataDict))
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.to_excel('data.xlsx')
EDIT:
I couldn't test it because server refuses connection at this moment.
If you want to run in for-loop to get ie. 5 pages
# ... code ...
# Fixed-count pagination: scrape, then click "next page", five times.
# (Loop-body indentation restored — it was lost in extraction.)
# --- before all pages --
header = []
content = []
# --- loop ---
for n in range(5):
    print('--- page:', n+1, '---')
    # --- get data from page --
    resultados = BeautifulSoup(driver.page_source, 'lxml')
    paragrafoBRS = resultados.find_all('div', attrs={'class':'paragrafoBRS'})
    for each in paragrafoBRS:
        header.append(each.find('div', {'class':'docTitulo'}).text)
        content.append(each.find(['div','pre'], {'class':'docTexto'}).text)
    print('len(content):', len(content))
    # --- load next page ---
    print('click')
    driver.find_element(by=By.CLASS_NAME, value='iconeProximaPagina.temHint').click()
    time.sleep(5)
# --- after loop ---
driver.quit()
dataDict = {}
df = pd.DataFrame()
# ... code ...
If you want to run in while-loop to get all pages.
I use try/except to detect when clicking the link to the next page fails — and then I use break to exit the loop
# ... code ...
# Open-ended pagination: keep scraping until the "next page" click fails.
# (Loop-body indentation restored — it was lost in extraction.)
# --- before all pages --
header = []
content = []
# --- loop ---
page = 0
while True:
    page += 1
    print('--- page:', page, '---')
    # --- get data from page --
    resultados = BeautifulSoup(driver.page_source, 'lxml')
    paragrafoBRS = resultados.find_all('div', attrs={'class':'paragrafoBRS'})
    for each in paragrafoBRS:
        header.append(each.find('div', {'class':'docTitulo'}).text)
        content.append(each.find(['div','pre'], {'class':'docTexto'}).text)
    print('len(content):', len(content))
    # --- load next page ---
    try:
        print('click')
        driver.find_element(by=By.CLASS_NAME, value='iconeProximaPagina.temHint').click()
        time.sleep(5)
    except Exception as ex:
        # exit loop when problem with clicking
        print('Exception:', ex)
        break  # exit loop when the "next page" button can no longer be clicked
# --- after loop ---
driver.quit()
dataDict = {}
df = pd.DataFrame()
# ... code ...
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Stack Overflow |
