Python - download all images from a webpage into a folder

I would like to download all images from this webpage, but my code is not working. How can I revise this script?

import requests,os
from bs4 import BeautifulSoup
from urllib.request import urlopen

html=requests.get('https://www.dreamstime.com/free-results.php?securitycheck=afbb79db0e7e374867295876228b135a&firstvalue=&lastsearchvalue=&srh_field=doges&searchby=doges&s_free=y&s_cc0=y',headers={"User-Agent": "XY"})
html.encoding='utf-8'

sp=BeautifulSoup(html.text,'html.parser')
images_dir="images/"
if not os.path.exists(images_dir):
    os.mkdir(images_dir)

all_links=sp.find_all(['a','img'])
for link in all_links:
    src=link.get("src")
    href=link.get("href")
    attrs=[src,href]
    for attr in attrs:
        if attr != None and ('.jpg' in attr or '.png' in attr):
            full_path=attr
            filename=full_path.split('/')[-1]
            print(full_path)
            
            try:
                image=urlopen(full_path)
                f=open(os.path.join(images_dir,filename),'wb')
                f.write(image.read())
                f.close()
            except:
                print("{} fail".format(filename))


Solution 1:[1]

import os
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = requests.get('https://www.google.de/?gws_rd=ssl', headers={"User-Agent": "XY"})
my_domain = "https://www.google.de"
# Your problem is that the links are relative and do not include the domain name.
# You should check whether "http" is present in the link;
# if not, check whether there is a base element such as <base href="https://example.com">.
# If there is, prepend its href to the URI; if not, prepend "my_domain".
# I cheated here and just prepended my_domain directly, which will cause issues if the link is already absolute.
html.encoding = 'utf-8'
sp = BeautifulSoup(html.text, 'html.parser')
images_dir = "images/"

# print(f"My images path {os.path.exists(images_dir)}")

if not os.path.exists(images_dir):
    print(f"was not found {os.path(images_dir)}")
    os.mkdir(images_dir)

all_links = sp.find_all(['a', 'img'])

# print(all_links)

for link in all_links:
    src = link.get("src")
    # print(f"src= {src}")
    href = link.get("href")
    # print(f"href= {href}")
    attrs = [src, href]
    # print(f"attrs= {attrs}")
    for attr in attrs:
        if attr and ('.jpg' in attr or '.png' in attr):
            my_path = attr
            full_path = my_domain + my_path
            # print(full_path)
            filename = my_path.split('/')[-1]
            print(f"My image ref -> {my_path}")
            print(f"My full image ref -> {full_path}")
            print(f"My image filename-> {filename}")

            try:
                image = urlopen(full_path)
                with open(os.path.join(images_dir, filename), 'wb') as f:
                    f.write(image.read())
            except Exception:
                print("{} fail".format(filename))

Solution 2:[2]

I have made a new version, which works a bit better with site captchas. First I tried a User-Agent header, but sites often ignore that. If you want, you can read out the cookie of the site first and then add that too. Here are samples that can be added as needed, filled in with the data you collect from the site: # "cookie": "__gads=ID=b2281f3501a53....." # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win6....."
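
For illustration only, a request carrying such headers might look like the sketch below; the cookie and User-Agent strings are placeholders, not real values, and would be copied from your own browser session for the site in question:

import requests

# Placeholder values only: copy the real strings from your own browser's
# developer tools (Network tab) for the site you are fetching.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "cookie": "__gads=ID=<value-copied-from-browser>",
}

html = requests.get("https://www.example.com/", headers=headers)
print(html.status_code)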

I must stress that this was an intellectual exercise; I would never take content I do not own or am only allowed to use in a specific context. I see this code as a simple way to analyse issues with specific tags, or to take an inventory of which tags I am using for what.

import os
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

my_domain_short = "www.google.de"
my_domain = f"https://{my_domain_short}"

# These headers tell the site I am a human, not a robot, to avoid captchas
html = requests.get(my_domain,
                    headers={
                        "Accept": "application/signed-exchange;v=b3;q=0.7,*/*;q=0.8",
                        "Accept-Encoding": "gzip, deflate, br",
                        "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,de;q=0.7,nl;q=0.6",
                        "Host": my_domain_short,
                        "Purpose": "prefetch",
                        "Referer": my_domain,
                    })

html.encoding = 'utf-8'
sp = BeautifulSoup(html.text, 'html.parser')
tag_base = sp.base
print(tag_base)

# This is where I want to put my images locally on my machine.
images_dir = "images/"
print(f"My images path {os.path.exists(images_dir)}")

# if the images path does not exist make it.
if not os.path.exists(images_dir):
    print(f"was not found {os.path(images_dir)}")
    os.mkdir(images_dir)

# get all <a img=src tags>
all_links = sp.find_all(['a', 'img'])

# is there a <base href="..."> tag on the page indicating relative URLs?
base_tag = sp.find('base')
base_path = base_tag.get('href') if base_tag else None
print(f"My base Path: {base_path}")


# print(all_links) # Print all found results, by removing comment "#"

# loop through results and get the pictures
for link in all_links:
    src = link.get("src")
    # print(f"src= {src}") # tests code
    href = link.get("href")
    # print(f"href= {href}") # tests code
    attrs = [src, href]
    # print(f"attrs= {attrs}") # tests code
    for attr in attrs:
        if attr and ('.jpg' in attr or '.png' in attr):
            my_path = attr
            print(f"My image ref -> {my_path}")
            # If the URI is absolute it is fine as it is; otherwise prepend the domain.
            if my_path.startswith("http"):
                full_path = my_path
            else:
                full_path = my_domain + my_path
            print(full_path)
            filename = my_path.split('/')[-1]
            print(f"My full image ref -> {full_path}")
            print(f"My image filename-> {filename}")
            # Try to write the file to my local computer
            try:
                image = urlopen(full_path)
                with open(os.path.join(images_dir, filename), 'wb') as f:
                    f.write(image.read())
            except Exception:
                print("{} fail".format(filename))

This is another, simpler version which can be run as an import, from the console, or as a script; in this case it just lists the contents of a robots.txt file as an example.

import sys
from urllib.request import urlopen


def get_content(url):
    story = urlopen(url)
    story_words = []
    for line in story:
        line_words = line.decode("utf8").split()
        for word in line_words:
            story_words.append(word)
    story.close()
    return story_words  # if called through import words -> get_content() returns values


# how to stop a function from running when importing, and how to list imported functions
def print_content(story_words):
    """
    prints an input to the console, can be used in import
    """
    for word in story_words:
        print(word)


def main(url):
    """calls a function and calls a print function to print values
    Args: url: The URL of a UTF-8 text document.
    Returns: A list of strings from website.
    """
    # url = sys.argv[1] is read in the __main__ block instead, so it does not run when the module is imported
    words = get_content(url)  # calls fetch words, which returns values
    print_content(words)  # passes words to print function and prints returned values


if __name__ == '__main__':  # only runs when executed as a script; ignored on import
    # if called through "import <module>", get_content() can still be used directly
    # usage: .\script.py https://www.google.com/robots.txt

    try:
        main(sys.argv[1])
    # this takes the URL from the command-line argument; I could also have read
    # sys.argv[1] inside main() directly.
    except IndexError:
        print("list index out of range")
        print(r'Module not run from command line; passing ".\script.py https://www.google.com/robots.txt"')
        main("https://www.google.com/robots.txt")

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution 1: Douglas Bryant
Solution 2: Douglas Bryant