'How to download dynamically loaded images using python and seleniumwire?

First of all I should inform you that I have very little experience in programming. And I have some trouble with the logic and flow of a general webscraper implemented in python. I assume that I should use callbacks and similar methods in order to properly control the process of saving pages from a javascript e-book reader. My script does work, but not consistently. If someone could advice me on improvements that should be made to this script, that would be great. Thank you.

from seleniumwire.utils import decode as sdecode
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options       # [!]
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import os.path
opts = Options()                                            # [!]
opts.add_experimental_option('w3c', True)                   # [!]
capabilities = DesiredCapabilities.CHROME.copy()
driver = webdriver.Chrome(chrome_options=opts, desired_capabilities=capabilities)

url = ' here comes url'
driver.get(url)

def get_requests():
    l = []
    for rx in driver.requests:
        #endmark = '&scale=2&rotate=0' lenght must be 17
        if rx.url[-17:]==endmark:
            l.append(rx.url)
    return list(set(l))
    
def savepages(diff):
    newpages = 0
    for urlitem in diff:
        for request in driver.requests:
            if request.url==urlitem:
                #print(request.url)
                ind = urlitem.find('.jp2&id') # ex. 0012.jp2&id
                file_path = directory_path + '\\' + file_name + urlitem[ind-4:ind] + '.jpg'
                tik = 0
                while tik<10:  #waiting for the response body data
                    try:
                        tik += 1
                        data = sdecode(request.response.body, request.response.headers.get('Content-Encoding', 'identity'))
                    except AttributeError: # no data error
                        time.sleep(2) # wait for 2 sec for the data
                        continue                    
                #data = data.decode("utf-8",'ignore')
# sometimes I get this error 'UnboundLocalError: local variable 'data' referenced before assignment'
# I assumed that the following condition will help but it doesn't seem to work consistently
                if data: 
                    with open(file_path, 'wb') as outfile:
                        outfile.write(data) # sometimes I get UnboundLocalError
                else: print('no data')
                # was the file saved or not
                if os.path.exists(file_path):
                    newpages += 1 # smth is wrong with the counting logic, since pages+newpages should be equal to the lenght of li=get_requests(), I get more 
                else: 
                    time.sleep(.5)
    return newpages

count = 0 # a counter, should terminate the main delay loop
pages = 0 # counting all saved pages; book pages or images are equivalent, one turn should open 2 new pages/images/requests
oldli = [] #compare to the new list after each delay cycle
turns = 0 #count how many turns have been made or how many times we clicked on the button Next Page
li = get_requests() # get all unique requests of the images/pages, some requests might be still loading, but we manually opened the first page and visually confirmed that there are at least 1 or 3 images/requests
if li: # the program STARTS HERE, first try, there are some requests because we manually opened the first page
    # THE MAIN CYCLE should stop when the delay is too long and we turned all the pages of the book
    while 2*turns+1<len(li) or count<15: # should terminate the whole program when there is no more images coming
        count = 0 #reset counter
        success = False #reset success; new pages downloaded successfully
        # the main delay counter
        # what happens if diff is [] and no success
        while True:
                count += 1
                if count > 14:
                    print('Time out after more than 10 seconds.')
                    break
                    
                li = get_requests() # in addition, I assume that all requests counting from page 1 will be kept
                # it is possible that li will not have some of the old requests and oldli will be longer
                # well, I need to keep all old requests in a separate list and then append to it
                diff = list(set(li)-set(oldli)) # find new requests after the delay
                
                if diff: # there are some new
                    npages = savepages(diff) # saves new images and returns the number of them
                    print('newpages ',npages, ' len diff ', len(diff)) # should be equal
                    if npages >= len(diff)-1: # we allow one request without a body with data ??
                        pages += npages # smth is not ok here, the number of pages sometimes exceeds the length of li
                        success = True # we call it a success
                    else:
                        print('Could not save pages. Newpages ', npages, ' len diff ', len(diff))
                        for pg in diff:
                            print(pg) # for debuging purposes
                        break # in this case you break from the delay cycle
                else: time.sleep(2) # if no new requests add 2 sec to the waiting time
                
                if success: # we turn pages in case of successful download, this is bad if we need to catch up
                    while 2*turns+1 < len(li): # if some of old requests are deleted then the program will stop earlier
                        # it won't wait for the bodies of requests, there is a problem
                        driver.find_elements(By.CLASS_NAME, "BRicon.book_right.book_flip_next")[0].click()
                        turns += 1
                        time.sleep(3) # I got the impression that this doesn't happen
                    oldli = li
                    print('pages ',pages,' length of list ',len(li))
                    break # we break from the delay cycle since success
                    
                time.sleep(2) # the main delay timer;; plus no diff timer = total time

else: print('no requests in the list to process') ```

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution	Source

'How to download dynamically loaded images using python and seleniumwire?

Sources

Related Questions