Create a web scraper with a loop to search links
I'm trying to create a web scraper that will return restaurant names and addresses from the website. In the current version, it returns only names (as a test), but they are saved in the form of a string ([{'name': 'Copernicus Restaurant | Copernicus Hotel'}] [{'name': 'Copernicus Restaurant | Copernicus Hotel'}, {'name': 'Farina Restaurant'}] [{'name': 'Copernicus Restaurant | Copernicus Hotel'}, {'name': 'Farina Restaurant'}, {'name': 'Cyrano de Bergerac'}]).
Could someone help me to correct this code so that it would take links to each restaurant and then extract data about the name of the restaurant, address from those links?
I will be grateful for any help.
My code:
# Scrape restaurant names from restaurantguru.com: collect the detail-page
# link of every listing card, visit each link, and extract the <h1> name.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver_service = Service(executable_path="C:/webdrivers/chromedriver.exe")
driver = webdriver.Chrome(service=driver_service)

baseurl = 'https://restaurantguru.com/'
driver.get('https://restaurantguru.com/restaurant-Poland-t1')
soup = BeautifulSoup(driver.page_source, 'lxml')

# Each listing card is a <div class="wrapper_info">; grab every href inside.
productlist = soup.find_all('div', class_='wrapper_info')
productlinks = []
for item in productlist:
    for link in item.find_all('a', href=True):
        productlinks.append(link['href'])

restlist = []
for link in productlinks:
    # driver.get() returns None, so there is nothing useful to assign.
    driver.get(link)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    name = soup.find('h1', class_='notranslate').text.strip()
    restlist.append({'name': name})

# Print ONCE, after the loop. Printing inside the loop is what produced the
# repeated, ever-growing output described in the question.
print(restlist)
driver.quit()
Edited code with wrong result:
# Edited scraper: for every restaurant link, extract name, address and
# website, collect the records, and print/save them once at the end.
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome()

baseurl = 'https://restaurantguru.com/'
driver.get('https://restaurantguru.com/restaurant-Gostynin-t1')
soup = BeautifulSoup(driver.page_source, 'lxml')

# Gather the detail-page links from every listing card.
productlinks = []
for item in soup.find_all('div', class_='wrapper_info'):
    for link in item.find_all('a', href=True):
        productlinks.append(link['href'])

restlist = []
for link in productlinks:
    driver.get(link)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    # One extraction path is enough. The original fetched every field twice
    # (BeautifulSoup AND a Selenium XPath) inside one broad try/except, so an
    # exception in the redundant second lookup could overwrite a website
    # value the first lookup had already found with ''.
    name = soup.find('h1', class_='notranslate').text.strip()

    # Guard against pages that lack an address or website block instead of
    # crashing with AttributeError on .find() returning None.
    address_div = soup.find('div', class_='address')
    if address_div and address_div.find('div', class_=False):
        address = address_div.find('div', class_=False).text.strip()
    else:
        address = ''

    website_div = soup.find('div', class_='website')
    if website_div and website_div.find('a'):
        website = website_div.find('a').text  # use .get('href') for the URL
    else:
        website = ''

    restlist.append({
        'name': name,
        'website': website,
        'address': address,
    })

# Report (and optionally persist) only once, after all links were visited.
print(restlist)
#df = pd.DataFrame(restlist)
#df.to_csv('C:/webdrivers/restauracje.csv')
#print(df.head(10))
driver.quit()
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
