Web scraping in Python returning repeated data

I'm using the script below to retrieve property data for a college project. It runs without errors, but the dataframe contains repeated values: if I tell it to fetch data from 5 pages, it repeats the same data from page 1 five times. Please help!

import requests, re, time, os, csv
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

# Initialize the lists that will hold the scraped information

link_imovel=[] # listing URL
address=[]     # full address
neighbor=[]    # neighbourhood
anunciante=[]  # advertiser
area=[]        # area
tipo=[]        # property type
room=[]        # number of bedrooms
bath=[]        # number of bathrooms
park=[]        # number of parking spaces
price=[]       # property price

# Ask how many pages you want to scrape
pages_number=int(input('How many pages? '))
# Start the execution timer
tic = time.time()

# Configure chromedriver
# To run this, download chromedriver and place it in the same folder as the script, or change the path
chromedriver = "./chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
time.sleep(15)

# Loop over the site's result pages
for page in range(1,pages_number+1):

    link = 'https://www.vivareal.com.br/venda/minas-gerais/pocos-de-caldas/casa_residencial/?pagina=' + str(page)
    driver.get(link)
    # Sleep briefly so we don't overload the site
    time.sleep(2)
    # Grab all of the page's HTML and parse it into a readable format
    data = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup_complete_source = BeautifulSoup(data.encode('utf-8'), "lxml")
    
    # Locate the container that holds all of the property cards
    soup = soup_complete_source.find(class_='results-list js-results-list')    
    

    # Web scraping
    # For each card in the result set, collect:
    for line in soup.findAll(class_="js-card-selector"):
        # Collect the full address and the neighbourhood
        try:
            full_address=line.find(class_="property-card__address").text.strip()
            address.append(full_address.replace('\n', '')) # Store the full address
            if full_address.startswith(('Rua', 'Avenida', 'Travessa', 'Alameda')):
                neighbor_first=full_address.strip().find('-')
                neighbor_second=full_address.strip().find(',', neighbor_first)
                if neighbor_second!=-1:
                    neighbor_text=full_address.strip()[neighbor_first+2:neighbor_second]
                    neighbor.append(neighbor_text) # Store the neighbourhood
                else: # Neighbourhood not found
                    neighbor_text='-'
                    neighbor.append(neighbor_text) # Fallback when the neighbourhood is not found
            else:
                get_comma=full_address.find(',')
                if get_comma!=-1:
                    neighbor_text=full_address[:get_comma]
                    neighbor.append(neighbor_text) # Store neighbourhoods whose formatting issues come from the website itself
                else:
                    get_hif=full_address.find('-')
                    neighbor_text=full_address[:get_hif]
                    neighbor.append(neighbor_text)
                    
            # Collect the listing link
            full_link=line.find(class_='property-card__main-info').a.get('href')
            link_imovel.append(full_link)
                    
            # Collect the advertiser
            full_anunciante=line.find(class_='property-card__account-link js-property-card-account-link').img.get('alt').title()
            anunciante.append(full_anunciante)
                    
            # Collect the area
            full_area=line.find(class_="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area").text.strip()
            area.append(full_area)
            
            # Collect the property type
            full_tipo = line.find(class_='property-card__title js-cardLink js-card-title').text.split()[0]
            full_tipo=full_tipo.replace(' ','')
            full_tipo=full_tipo.replace('\n','')
            tipo.append(full_tipo)

            # Collect the number of bedrooms
            full_room=line.find(class_="property-card__detail-item property-card__detail-room js-property-detail-rooms").text.strip()
            full_room=full_room.replace(' ','')
            full_room=full_room.replace('\n','')
            full_room=full_room.replace('Quartos','')
            full_room=full_room.replace('Quarto','')
            room.append(full_room) # Store the number of bedrooms

            # Collect the number of bathrooms
            full_bath=line.find(class_="property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom").text.strip()        
            full_bath=full_bath.replace(' ','')
            full_bath=full_bath.replace('\n','')
            full_bath=full_bath.replace('Banheiros','')
            full_bath=full_bath.replace('Banheiro','')
            bath.append(full_bath) # Store the number of bathrooms

            # Collect the number of parking spaces
            full_park=line.find(class_="property-card__detail-item property-card__detail-garage js-property-detail-garages").text.strip()        
            full_park=full_park.replace(' ','')
            full_park=full_park.replace('\n','')
            full_park=full_park.replace('Vagas','')
            full_park=full_park.replace('Vaga','')
            park.append(full_park) # Store the number of parking spaces

            # Collect the price
            full_price=re.sub('[^0-9]','',line.find(class_="property-card__price js-property-card-prices js-property-card__price-small").text.strip())
            price.append(full_price) # Store the price

        except:
            continue
    
              
# Close chromedriver
driver.quit()

# Build a pandas dataframe row by row and append it to a CSV file
for i in range(0,len(neighbor)):
    combinacao=[link_imovel[i],address[i],neighbor[i],anunciante[i],area[i],tipo[i],room[i],bath[i],park[i],price[i]]
    df=pd.DataFrame(combinacao)
    with open('VivaRealData.csv', 'a', encoding='utf-16', newline='') as f:
        df.transpose().to_csv(f, header=False)

# Execution time
toc = time.time()
get_time=round(toc-tic,3)
print('Finished in ' + str(get_time) + ' seconds')
print(str(len(price))+' results!')

It seems to me that the "for line in soup.findAll" loop never moves on to new data; I've tried everything, but I always get the data from the first page.



Solution 1

Indeed, the URL does return the same results regardless of the page number requested. It also returns the same information when requests is used, which avoids the huge overhead of Selenium.
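
As a quick way to confirm this, here is a minimal sketch (the URL and class names come from the script above; the User-Agent value is an assumption) that fetches two different pages with plain requests and compares the first card of each:

import requests
from bs4 import BeautifulSoup

# Fetch two different "pages" and compare the first property card of each
base = 'https://www.vivareal.com.br/venda/minas-gerais/pocos-de-caldas/casa_residencial/?pagina={}'
headers = {'User-Agent': 'Mozilla/5.0'}  # assumed minimal header set

for page in (1, 2):
    html = requests.get(base.format(page), headers=headers).text
    card = BeautifulSoup(html, 'lxml').find(class_='js-card-selector')
    print(page, card.find(class_='property-card__address').text.strip() if card else 'no card found')

# If both pages print the same address, the pagina parameter is being ignored server-side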

A better (and much faster) approach is to access all of the data directly from the site's JSON API.

The following shows a possible starting point. All of the data is inside data; you just need to find the information you want and access it. I suggest you print(data) and use a tool to format it so it is easier to read.
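
For example, a minimal sketch of inspecting the response (assuming the data dictionary produced by the script below), using only the standard library:

import json

# Pretty-print the JSON so the nesting is easier to explore
print(json.dumps(data, indent=2, ensure_ascii=False))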

import requests, re, time, os, csv

# Ask how many pages you want to scrape
#pages_number = int(input('How many pages? '))
pages_number = 5
# Start the execution timer
tic = time.time()

sess = requests.Session()

params = {
    'addressCity' : 'Poços de Caldas',
    'addressLocationId' : 'BR>Minas Gerais>NULL>Pocos de Caldas',
    'addressNeighborhood' : '',
    'addressState' : 'Minas Gerais',
    'addressCountry' : 'Brasil',
    'addressStreet' : '',
    'addressZone' : '',
    'addressPointLat' : '-21.7854',
    'addressPointLon' : '-46.561934',
    'business' : 'SALE',
    'facets' : 'amenities',
    'unitTypes' : 'HOME',
    'unitSubTypes' : 'UnitSubType_NONE,SINGLE_STOREY_HOUSE,VILLAGE_HOUSE,KITNET',
    'unitTypesV3' : 'HOME',
    'usageTypes' : 'RESIDENTIAL',
    'listingType' : 'USED',
    'parentId' : 'null',
    'categoryPage' : 'RESULT',
    'includeFields' : 'search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount),page,seasonalCampaigns,fullUriFragments,nearby(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),expansion(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones,phones),developments(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),owners(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount))',
    'size' : '100',
    'from' : '144',
    'q' : '',
    'developmentsSize' : '5',
    '__vt' : '',
    'levels' : 'CITY,UNIT_TYPE',
    'ref' : '/venda/minas-gerais/pocos-de-caldas/casa_residencial/',
    'pointRadius' : '',
    'isPOIQuery' : '',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    'x-domain': 'www.vivareal.com.br',
}

results = 0

with open('VivaRealData.csv', 'w', newline='', encoding='utf-16') as f_output:
    csv_output = csv.writer(f_output)

    # Loop over the site's result pages
    for page in range(pages_number+1):
        print(f"Page {page+1}")
        link = 'https://glue-api.vivareal.com/v2/listings'
        params['from'] = f"{page * 100}"        
        req = sess.get(link, headers=headers, params=params)
        data = req.json()

        for listing in data['search']['result']['listings']:
            href = listing['link']['href']
            street = listing['listing']['address'].get('street', '').strip()
            bedrooms = listing['listing']['bedrooms'][0]
            bathrooms = listing['listing']['bathrooms'][0]
            price = listing['listing']['pricingInfos'][0]['price']
            row = [href, street, bedrooms, bathrooms, price]
            csv_output.writerow(row)
            results += 1

# Execution time
toc = time.time()
get_time=round(toc-tic,3)

print(f'Finished in {get_time} seconds')
print(f'{results} results!')

For this example, it is hard coded to 5 pages and returns 593 results in about 6 seconds.

Using Pandas might be a bit overkill here as the data can be written a row at a time directly to your output CSV file.
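
If you do want a dataframe for later analysis, one option is to collect the rows in a list and build the dataframe once at the end. A minimal sketch (the column names are assumptions, matching the row layout above):

import pandas as pd

rows = []  # inside the scraping loop: rows.append([href, street, bedrooms, bathrooms, price])

# Build the dataframe in one go instead of writing row by row
df = pd.DataFrame(rows, columns=['href', 'street', 'bedrooms', 'bathrooms', 'price'])
df.to_csv('VivaRealData.csv', index=False, encoding='utf-16')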

How was this solved?

Your best friend here is your browser's network dev tools. With these you can watch the requests the page makes to obtain its information. The normal flow is: the initial HTML page is downloaded, its JavaScript runs, and further requests are made to fetch the data that fills the page.

The trick is to first locate the response that contains the data you want (often returned as JSON), then work out the headers and parameters you need to recreate that request.

Approaches using Selenium let the JavaScript run, but most of the time this is not needed, as the JavaScript is simply making requests and formatting the data for display.
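
When recreating such a request, a quick sanity check is to rebuild it with a trimmed-down set of parameters and compare the result against what the network tab shows. A minimal sketch (the URL and x-domain header come from the script above; the parameter subset is an assumption and may not be sufficient on its own):

import requests

resp = requests.get(
    'https://glue-api.vivareal.com/v2/listings',
    headers={'x-domain': 'www.vivareal.com.br'},
    params={'business': 'SALE', 'size': '1', 'from': '0'},
)
print(resp.status_code)  # expect 200 once the required headers/params are in place
print(resp.url)          # compare against the request URL shown in dev tools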

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution 1: Martin Evans