How can I open many links with chromedriver that are stored in one variable?

So I wrote this code to web-scrape CNN and get articles about a specific topic:

from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import sys
import json
import os

serch = input('What News are you looking for today? ')
serch_term = str(serch)
real_serch = f'{serch_term}'

url = f'https://edition.cnn.com/search?q={real_serch}'

options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-error")
options.add_argument("--ignore-ssl-errors")
service = Service(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)

driver.get(url)
time.sleep(4)

soup = BeautifulSoup(driver.page_source, 'html.parser')
#driver.close()

SAVED_DATA = "data.json"

def save_data(filepath, data):
    with open(filepath, "w") as f:
        json.dump(data, f)
    

def load_data(filepath):
    try:
        with open(filepath, "r") as f:
            data = json.load(f)
            return data
    except (FileNotFoundError, json.JSONDecodeError):
        return {}

def only_get_title():
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title=h3.text
        return(title)

def get_href():
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        url_ = h3.get('href')
        abs_url = 'https:'+ url_
        return(abs_url)

def store():
    data = load_data(SAVED_DATA)
    key = only_get_title()
    data[key] = get_href()
    save_data(SAVED_DATA, data)
    print("News saved!")


if __name__ == '__main__':
    store()

My question: abs_url should hold the many links to the different articles that were found on that subject on CNN. I want to go to every one of these links and save the data, but the code only opens the first link stored in abs_url and not the others. How can I open every link and save each one in my JSON file, as you can see in the code?



Solution 1:[1]

You run return inside the for-loop, so you exit the function at the first link.

You should append all links to a list and return that list after the for-loop:

def get_href():
    all_results = []

    # --- loop ---

    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        url_ = h3.get('href')
        abs_url = 'https:'+ url_

        all_results.append(abs_url)

    # --- after loop ---

    return all_results

You have the same problem with the titles:

def only_get_title():

    all_results = []
 
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        all_results.append(title)

    # --- after loop ---

    return all_results

Later you will need a for-loop with zip() to create (title, url) pairs:

def store():
    data = load_data(SAVED_DATA)
    
    all_titles = only_get_title()
    all_urls = get_href()
    
    for title, url in zip(all_titles, all_urls):
        data[title] = url
        
    save_data(SAVED_DATA, data)
    print("News saved!")

But it may be simpler and more readable to get the title and URL in one function and create the pairs as you append them to the list:

def get_articles():
    all_results = []

    # --- loop ---

    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        
        title = h3.text
        url = h3.get('href')
        abs_url = 'https:'+ url

        pair = (title, abs_url)

        all_results.append(pair)

    # --- after loop ---

    return all_results

def store():
    data = load_data(SAVED_DATA)
    
    all_articles = get_articles()
    
    for title, url in all_articles:
        data[title] = url
        
    save_data(SAVED_DATA, data)
    print("News saved!")

Getting the title and URL in one function is also safer when you want to get more details from each article, because if an article is missing some detail you can store None or a default value. With separate functions, one of them may skip the empty elements, and zip() will then create wrong pairs (tuples).
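For example, here is a sketch of get_articles() extended with one extra detail; the cnn-search__result and cnn-search__result-body classes are guesses, used only to illustrate the default-value pattern:

def get_articles():
    all_results = []

    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        abs_url = 'https:' + h3.get('href')

        # Hypothetical extra detail: a teaser that some results may lack.
        result = h3.find_parent('div', class_='cnn-search__result')
        teaser = result.select_one('div.cnn-search__result-body') if result else None
        teaser_text = teaser.text.strip() if teaser else None  # default value

        # Every result gets an entry even without a teaser,
        # so the items can never get misaligned.
        all_results.append((title, abs_url, teaser_text))

    return all_results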

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1