'scape pages on wiki category using python

Need to scrape wiki pages in certain categories. Found this answer Python scraping of Wikipedia category page. Tried the code for https://en.wikipedia.org/wiki/Category:Allergology the code just jump to the first subcategory. for Allergologists‎ but I need scape all the pages on this page.

Made minor changes to the above code to scape english text.

import requests
from bs4 import BeautifulSoup

def get_categories(data):
    print("Getting categories...")
    categories = {}
    soup = BeautifulSoup(data, "lxml")
    group_divs = soup.find_all("div", {"class": "mw-category-group"})
    for div in group_divs:
        links = div.find_all("a")
        for link in links:
            title = link.get("title")
            href = link.get("href")
            categories[title] = "https://en.wikipedia.org" + href
    print(f"Found Categories: {len(categories)}")
    return categories


def get_first_paragraph(data):
    soup = BeautifulSoup(data, "lxml")
    parser_output = soup.find("div", {"class": "mw-parser-output"})
    first_paragraph = parser_output.find("p", {"class": None}, recursive=False)
    return first_paragraph.text


def process_categories(categories):
    result = {}
    for title, link in categories.items():
        print(f"Processing Piece: {title}, on link: {link}")
        data = requests.get(link).content
        first_paragraph = get_first_paragraph(data)
        result[title] = first_paragraph.strip()
    return result


def clean_categories(categories):
    return {k: v for k, v in categories.items() if "Catégorie" not in k}

categories_url = "https://en.m.wikipedia.org/wiki/Category:Allergology"
data_categories = requests.get(categories_url).content
categories = get_categories(data_categories)
categories = clean_categories(categories)
result = process_categories(categories)
print(result) # create dataframe etc...

But only got

Getting categories...
Found Categories: 96
Processing Piece: Category:Allergologists, on link: https://en.wikipedia.org/wiki/Category:Allergologists

also return error as

first_paragraph.text  AttributeError: 'NoneType' object has no attribute 'text'

Another solution might be manually created a list of related page links and loop through it but it's time consuming. Any suggestion?



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source