'How to download dynamically loaded images using python and seleniumwire?
First of all I should inform you that I have very little experience in programming. And I have some trouble with the logic and flow of a general webscraper implemented in python. I assume that I should use callbacks and similar methods in order to properly control the process of saving pages from a javascript e-book reader. My script does work, but not consistently. If someone could advice me on improvements that should be made to this script, that would be great. Thank you.
from seleniumwire.utils import decode as sdecode
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options # [!]
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import os.path
opts = Options() # [!]
opts.add_experimental_option('w3c', True) # [!]
capabilities = DesiredCapabilities.CHROME.copy()
driver = webdriver.Chrome(chrome_options=opts, desired_capabilities=capabilities)
url = ' here comes url'
driver.get(url)
def get_requests():
l = []
for rx in driver.requests:
#endmark = '&scale=2&rotate=0' lenght must be 17
if rx.url[-17:]==endmark:
l.append(rx.url)
return list(set(l))
def savepages(diff):
newpages = 0
for urlitem in diff:
for request in driver.requests:
if request.url==urlitem:
#print(request.url)
ind = urlitem.find('.jp2&id') # ex. 0012.jp2&id
file_path = directory_path + '\\' + file_name + urlitem[ind-4:ind] + '.jpg'
tik = 0
while tik<10: #waiting for the response body data
try:
tik += 1
data = sdecode(request.response.body, request.response.headers.get('Content-Encoding', 'identity'))
except AttributeError: # no data error
time.sleep(2) # wait for 2 sec for the data
continue
#data = data.decode("utf-8",'ignore')
# sometimes I get this error 'UnboundLocalError: local variable 'data' referenced before assignment'
# I assumed that the following condition will help but it doesn't seem to work consistently
if data:
with open(file_path, 'wb') as outfile:
outfile.write(data) # sometimes I get UnboundLocalError
else: print('no data')
# was the file saved or not
if os.path.exists(file_path):
newpages += 1 # smth is wrong with the counting logic, since pages+newpages should be equal to the lenght of li=get_requests(), I get more
else:
time.sleep(.5)
return newpages
count = 0 # a counter, should terminate the main delay loop
pages = 0 # counting all saved pages; book pages or images are equivalent, one turn should open 2 new pages/images/requests
oldli = [] #compare to the new list after each delay cycle
turns = 0 #count how many turns have been made or how many times we clicked on the button Next Page
li = get_requests() # get all unique requests of the images/pages, some requests might be still loading, but we manually opened the first page and visually confirmed that there are at least 1 or 3 images/requests
if li: # the program STARTS HERE, first try, there are some requests because we manually opened the first page
# THE MAIN CYCLE should stop when the delay is too long and we turned all the pages of the book
while 2*turns+1<len(li) or count<15: # should terminate the whole program when there is no more images coming
count = 0 #reset counter
success = False #reset success; new pages downloaded successfully
# the main delay counter
# what happens if diff is [] and no success
while True:
count += 1
if count > 14:
print('Time out after more than 10 seconds.')
break
li = get_requests() # in addition, I assume that all requests counting from page 1 will be kept
# it is possible that li will not have some of the old requests and oldli will be longer
# well, I need to keep all old requests in a separate list and then append to it
diff = list(set(li)-set(oldli)) # find new requests after the delay
if diff: # there are some new
npages = savepages(diff) # saves new images and returns the number of them
print('newpages ',npages, ' len diff ', len(diff)) # should be equal
if npages >= len(diff)-1: # we allow one request without a body with data ??
pages += npages # smth is not ok here, the number of pages sometimes exceeds the length of li
success = True # we call it a success
else:
print('Could not save pages. Newpages ', npages, ' len diff ', len(diff))
for pg in diff:
print(pg) # for debuging purposes
break # in this case you break from the delay cycle
else: time.sleep(2) # if no new requests add 2 sec to the waiting time
if success: # we turn pages in case of successful download, this is bad if we need to catch up
while 2*turns+1 < len(li): # if some of old requests are deleted then the program will stop earlier
# it won't wait for the bodies of requests, there is a problem
driver.find_elements(By.CLASS_NAME, "BRicon.book_right.book_flip_next")[0].click()
turns += 1
time.sleep(3) # I got the impression that this doesn't happen
oldli = li
print('pages ',pages,' length of list ',len(li))
break # we break from the delay cycle since success
time.sleep(2) # the main delay timer;; plus no diff timer = total time
else: print('no requests in the list to process') ```
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
