How to write a function to scrape data from multiple pages?

I was trying to scrape some data from a court here in Brazil (yes, it is legal), and everything is going fine except for one thing: I am not able to write a function that clicks the button to change the page, re-runs the scraping code, and adds the new data to the df. (I have tried using the variable assigned to the scrape together with `.append()`, without success — am I using it correctly?)

Any advice?

MWE:

import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

# Case-law search page on the Brazilian Superior Court of Justice (STJ) site.
url = "https://scon.stj.jus.br/SCON/legaplic/toc.jsp?materia=%27Lei+8.429%2F1992+%28Lei+DE+IMPROBIDADE+ADMINISTRATIVA%29%27.mat.&b=TEMA&p=true&t=&l=1&i=18&ordem=MAT,@NUM"

option = Options()
option.headless = True
# NOTE(review): `option` is never passed to the driver below, so the browser
# is NOT actually headless — it should be webdriver.Firefox(options=option).
driver = webdriver.Firefox()

driver.get(url)
time.sleep(5)  # crude fixed wait for the page to render

# Follow the link for one specific statutory paragraph (link text must match exactly).
driver.find_element(by=By.LINK_TEXT, value='§ 6o A ação será instruída com documentos ou justificação que contenham indícios suficientes da existência do ato de improbidade ou com razões fundamentadas da impossibilidade de apresentação de qualquer dessas provas, observada a legislação vigente, inclusive as disposições inscritas nos arts. 16 a 18 do Código de Processo Civil. (Incluído pela Medida Provisória nº 2.225-45, de 2001)').click()

time.sleep(5)

driver.switch_to.window(driver.window_handles[-1]) # Change the focus to the new page, otherwise I can not scrap its content

element = driver.find_element(by=By.CLASS_NAME, value="listadocumentos")
html_content = element.get_attribute('outerHTML')  # NOTE(review): never used afterwards

# Parse the full page source with BeautifulSoup (not just `element`).
resultados = BeautifulSoup(driver.page_source, 'lxml')
paragrafoBRS = resultados.find_all('div', attrs={'class':'paragrafoBRS'})

# Click "next page". NOTE(review): this only refreshes driver.page_source —
# `resultados` and `paragrafoBRS` still hold the FIRST page's content and must
# be rebuilt after the click. This is the bug the question is about.
driver.find_element(by=By.CLASS_NAME, value='iconeProximaPagina.temHint').click()

resultados # NOTE(review): bare expression — a no-op outside the REPL (also tried resultados.append())
paragrafoBRS # NOTE(review): same — this does not re-scrape anything (also tried paragrafoBRS.append())

driver.quit()

header = []
content = []

# Extract (title, text) pairs from each result block — first page only,
# because paragrafoBRS was never rebuilt after the click above.
for each in paragrafoBRS:
    header.append(each.find('div', {'class':'docTitulo'}).text)
    content.append(each.find(['div','pre'], {'class':'docTexto'}).text)

dataDict = {}
df = pd.DataFrame()

# Group consecutive columns into rows: whenever a header repeats, flush the
# accumulated dict as one DataFrame row and start a new one.
# NOTE(review): DataFrame.append is deprecated (removed in pandas 2.0), and
# the last accumulated `dataDict` is never flushed into `df`.
for i in range(len(header)):
    if header[i] in dataDict:
        df = df.append(pd.DataFrame(dataDict), ignore_index=True)
        dataDict = {}

    dataDict[header[i]] = [content[i]]

df.to_excel('data.xlsx')


Solution 1:[1]

After `click()`, Selenium automatically updates `driver.page_source`, but not `resultados` and the other variables, so you have to run again

resultados = BeautifulSoup(driver.page_source, 'lxml') 
# rest

This replaces the old content, so you should first run `for each in paragrafoBRS:` to get the data from the first page, then `click()`, then load `resultados` again and run `for each in paragrafoBRS:` again to get the data from the next page.

Something like this:

import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

# Case-law search page on the Brazilian Superior Court of Justice (STJ) site.
url = "https://scon.stj.jus.br/SCON/legaplic/toc.jsp?materia=%27Lei+8.429%2F1992+%28Lei+DE+IMPROBIDADE+ADMINISTRATIVA%29%27.mat.&b=TEMA&p=true&t=&l=1&i=18&ordem=MAT,@NUM"


def scrape_page(page_source, header, content):
    """Parse one results page and append each document's title/text in place.

    Re-parses the given page source with BeautifulSoup — after a click()
    only driver.page_source is refreshed, never previously built soups.
    """
    resultados = BeautifulSoup(page_source, 'lxml')
    for each in resultados.find_all('div', attrs={'class': 'paragrafoBRS'}):
        header.append(each.find('div', {'class': 'docTitulo'}).text)
        content.append(each.find(['div', 'pre'], {'class': 'docTexto'}).text)


option = Options()
option.headless = True
# Fix: the options object must actually be handed to the driver — the
# original created it but never used it, so the browser was not headless.
driver = webdriver.Firefox(options=option)

driver.get(url)
time.sleep(5)  # crude fixed wait for the page to render

# Follow the link for one specific statutory paragraph (link text must match exactly).
driver.find_element(by=By.LINK_TEXT, value='§ 6o A ação será instruída com documentos ou justificação que contenham indícios suficientes da existência do ato de improbidade ou com razões fundamentadas da impossibilidade de apresentação de qualquer dessas provas, observada a legislação vigente, inclusive as disposições inscritas nos arts. 16 a 18 do Código de Processo Civil. (Incluído pela Medida Provisória nº 2.225-45, de 2001)').click()

time.sleep(5)

driver.switch_to.window(driver.window_handles[-1]) # Change the focus to the new page, otherwise I can not scrap its content

element = driver.find_element(by=By.CLASS_NAME, value="listadocumentos")
html_content = element.get_attribute('outerHTML')

# --- before all pages --

header = []
content = []

# --- get data from first page --

scrape_page(driver.page_source, header, content)
print('len(content):', len(content))

# --- load next page ---

print('click')
driver.find_element(by=By.CLASS_NAME, value='iconeProximaPagina.temHint').click()

time.sleep(5)

# --- get data from next page --

scrape_page(driver.page_source, header, content)
print('len(content):', len(content))

# ---

driver.quit()

# Group consecutive (header, content) pairs into rows: whenever a header
# repeats, the accumulated dict is flushed as one DataFrame row.
dataDict = {}
rows = []

for i in range(len(header)):
    if header[i] in dataDict:
        rows.append(pd.DataFrame(dataDict))
        dataDict = {}

    dataDict[header[i]] = [content[i]]

# Fix: flush the last accumulated row — the original silently dropped it.
if dataDict:
    rows.append(pd.DataFrame(dataDict))

# Fix: DataFrame.append was deprecated and removed in pandas 2.0; build the
# frame once with pd.concat instead of appending inside the loop.
df = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()

df.to_excel('data.xlsx')

EDIT:

I couldn't test it because server refuses connection at this moment.

If you want to run a for-loop to get, e.g., 5 pages:

# ... code ...

# --- before all pages --

# Accumulators shared across all pages; each iteration appends to them.
header = []
content = []

# --- loop ---

# Fetch a fixed number of pages (5 here): scrape the current page, then
# click "next" and wait for it to load.
for n in range(5):
    print('--- page:', n+1, '---')

    # --- get data from page --

    # Re-parse driver.page_source on EVERY iteration — after a click() only
    # page_source is refreshed, never previously built soup objects.
    resultados = BeautifulSoup(driver.page_source, 'lxml')
    paragrafoBRS = resultados.find_all('div', attrs={'class':'paragrafoBRS'})

    for each in paragrafoBRS:
        header.append(each.find('div', {'class':'docTitulo'}).text)
        content.append(each.find(['div','pre'], {'class':'docTexto'}).text)

    print('len(content):', len(content))

    # --- load next page ---

    print('click')
    driver.find_element(by=By.CLASS_NAME, value='iconeProximaPagina.temHint').click()

    time.sleep(5)  # crude fixed wait for the next page to load

# --- after loop ---

driver.quit()

dataDict = {}
df = pd.DataFrame()

# ... code ...

If you want to run a while-loop to get all pages:

I use try/except to detect when there is a problem clicking the link to the next page — and then I use `break` to exit the loop.

# ... code ...

# --- before all pages --

# Accumulators shared across all pages; each iteration appends to them.
header = []
content = []

# --- loop ---

page = 0  # page counter, used for progress logging only

# Keep fetching pages until the "next page" button can no longer be clicked.
while True:

    page += 1

    print('--- page:', page, '---')

    # --- get data from page --

    # Re-parse driver.page_source on EVERY pass — after a click() only
    # page_source is refreshed, never previously built soup objects.
    resultados = BeautifulSoup(driver.page_source, 'lxml')
    paragrafoBRS = resultados.find_all('div', attrs={'class':'paragrafoBRS'})

    for each in paragrafoBRS:
        header.append(each.find('div', {'class':'docTitulo'}).text)
        content.append(each.find(['div','pre'], {'class':'docTexto'}).text)

    print('len(content):', len(content))

    # --- load next page ---

    try:
        print('click')
        driver.find_element(by=By.CLASS_NAME, value='iconeProximaPagina.temHint').click()

        time.sleep(5)
    except Exception as ex:
        # Exit the loop when clicking fails — presumably the last page,
        # where the "next" button is missing or not clickable.
        print('Exception:', ex)
        break  # exit loop on click failure

# --- after loop ---

driver.quit()

dataDict = {}
df = pd.DataFrame()

# ... code ...

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1