Weird behaviour when scraping Goodreads (Python)
I'm trying to scrape Goodreads, and more specifically Goodreads editions, by giving some ISBNs as input. However, I get an error, and it does not even occur at the same step on every run:
Traceback (most recent call last):
File "C:xxx.py", line 47, in <module>
ed_details = get_editions_details(isbn)
File "C:xxx.py", line 30, in get_editions_details
ed_item = soup.find("div", class_="otherEditionsLink").find("a")
AttributeError: 'NoneType' object has no attribute 'find'
Everything should be correct: the div class is the right one, and it seems to be there for all books. I checked with every browser and the page looks the same to me. At this point I don't know whether it's because of a deprecated library or something else.
import requests
from bs4 import BeautifulSoup as bs
def get_isbn():
    """Return the ISBNs whose Goodreads editions should be looked up.

    The list is hard-coded; a new list object is built on every call, so
    callers may mutate the result freely.

    Returns:
        A list of ISBN strings.
    """
    return [
        '9780544176560',
        '9781796898279',
        '9788845278518',
        '9780374165277',
        '9781408839973',
        '9788838919916',
        '9780349121994',
        '9781933372006',
        '9781501167638',
        '9781427299062',
        '9788842050285',
        '9788807018985',
        '9780340491263',
        '9789463008594',
        '9780739349083',
        '9780156011594',
        '9780374106140',
        '9788845251436',
        '9781609455910',
    ]
def get_page(base_url, data, timeout=10):
    """Fetch ``base_url`` with ``data`` sent as query-string parameters.

    Args:
        base_url: URL to request.
        data: Query parameters, passed to ``requests.get`` as ``params``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        could not be completed (the error is printed).
    """
    try:
        # Without a timeout, requests can wait on a stalled connection forever.
        return requests.get(base_url, params=data, timeout=timeout)
    except requests.RequestException as e:
        # No response was received here, so report a request failure rather
        # than something the server "responded".
        print(f"Request failed: {e}")
        return None
def get_editions_details(isbn):
    """Look up the "other editions" link and edition count for ``isbn``.

    Searches Goodreads for the ISBN, then reads the anchor inside the
    ``div.otherEditionsLink`` element of the returned page.

    Args:
        isbn: ISBN used as the ``q`` search parameter.

    Returns:
        A tuple ``(editions_url, editions_count, isbn)``.

    Raises:
        ValueError: If no response was received, the page does not contain
            the editions link, or the edition count cannot be parsed.
    """
    # Create the search URL with the ISBN of the book
    data = {'q': isbn}
    book_page = get_page("https://www.goodreads.com/search", data)
    if book_page is None:
        # get_page returns None when the request itself failed.
        raise ValueError(f"No response received for ISBN {isbn}")

    # Parse the markup with Beautiful Soup
    soup = bs(book_page.text, 'lxml')

    # Retrieve from the book's page the link for other editions.
    # find() returns None when nothing matches, so each step is checked
    # instead of chaining calls on a possibly-missing element.
    ed_container = soup.find("div", class_="otherEditionsLink")
    ed_item = ed_container.find("a") if ed_container is not None else None
    if ed_item is None or not ed_item.get("href"):
        # Include status and final URL so the served page can be identified.
        raise ValueError(
            f"No 'otherEditionsLink' anchor found for ISBN {isbn} "
            f"(HTTP {book_page.status_code}, final URL {book_page.url})"
        )

    ed_link = f"https://www.goodreads.com{ed_item['href']}"

    # The total number of editions is the last word of the link text,
    # wrapped in parentheses.
    ed_num = ed_item.text.strip().split(' ')[-1].strip('()')
    try:
        ed_count = int(ed_num)
    except ValueError as err:
        raise ValueError(
            f"Could not parse edition count {ed_num!r} for ISBN {isbn}"
        ) from err

    # Return a tuple with all the information
    return (ed_link, ed_count, isbn)
if __name__ == "__main__":
    # Get the ISBNs to look up
    isbns = get_isbn()
    # Check all the ISBNs
    for isbn in isbns:
        try:
            ed_details = get_editions_details(isbn)
        except (AttributeError, ValueError) as e:
            # Missing markup or an unparsable edition count for one book
            # should not abort the lookups for the remaining ISBNs.
            print(f"Skipping ISBN {isbn}: {e}")
            continue
        # Show the (link, count, isbn) tuple instead of discarding it.
        print(ed_details)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
