How to get all download links from Unsplash using Selenium?
I am trying to download a collection of images from Unsplash.
When I check len(links) I only get 29, while it should be 63.
Not sure what the issue is:
from selenium import webdriver

def driver_download(location_for_download):
    # options = Options()
    # options.headless = True
    chrome_options = webdriver.ChromeOptions()
    prefs = {'download.default_directory': location_for_download}
    chrome_options.add_experimental_option('prefs', prefs)
    # driver = webdriver.Chrome(chrome_options=chrome_options)
    driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver", chrome_options=chrome_options)
    return driver

url = 'https://unsplash.com/collections/10927848/thestockmarketinvestor'
driver = driver_download('/home/xxx/Documents/xxxxx/pictures_from_unsplash/')

# I have clicked Load more images, all images are showing on the page.
driver.get(url)
x = driver.find_elements_by_tag_name('a')
count = 0
for i in x:
    if i.get_attribute('title') == 'Download photo':
        count += 1
I have tried scrolling to the bottom of the page and to the middle. Still the same number of results.
Solution 1:[1]
This website uses the GET method to fetch JSON data for every 10 pictures. I'm not familiar with Python, but here is an R script you can translate to Python. You don't need Selenium for this site at all.
library(rvest)
library(stringr)
library(rjson)

all_links <- character()
for (i in 1:7) {
  url = str_c("https://unsplash.com/napi/collections/10927848/photos?page=", i, "&per_page=10&order_by=latest")
  pg <- fromJSON(file = url)
  links <- character()
  for (j in 1:length(pg)) links[j] <- pg[[j]]$links$download[1]
  all_links <- c(all_links, links)
}
Basically, the idea is that you fetch the JSON for each page, and the download link is at the $links$download node of each item.
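For anyone who wants to stay in Python, here is a rough, untested translation of the R script above; it assumes the same napi endpoint and the links/download field shown in the R code, and uses the requests library instead of Selenium.

import requests

all_links = []
for page in range(1, 8):  # the collection spans 7 pages of 10 photos each, as in the R loop
    url = ('https://unsplash.com/napi/collections/10927848/photos'
           '?page={}&per_page=10&order_by=latest'.format(page))
    photos = requests.get(url).json()  # each page is a JSON list of photo objects
    for photo in photos:
        all_links.append(photo['links']['download'])

print('Found {} download links'.format(len(all_links)))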
Solution 2:[2]
So I worked on it a bit more, and below is the working script.
It is not the best way to go about it.
There is one step which still requires a click from the user. Can this be automated?
import os
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

def driver_download(location_for_download):
    # options = Options()
    # options.headless = True
    chrome_options = webdriver.ChromeOptions()
    prefs = {'download.default_directory': location_for_download}
    chrome_options.add_experimental_option('prefs', prefs)
    # driver = webdriver.Chrome(chrome_options=chrome_options)
    driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver", chrome_options=chrome_options)
    return driver

def get_picture_links(url, location):
    # Check if location exists, if not create the location
    if os.path.isdir(location) == False:
        os.mkdir(location)
    driver = driver_download(location)
    driver.maximize_window()
    driver.get(url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2)")
    count = 0
    links = []
    for _ in range(7):
        if count == 0:
            # First pass: collect the links already on the page
            time.sleep(4)
            x = driver.find_elements_by_tag_name('a')
            for i in x:
                if i.get_attribute('title') == 'Download photo':
                    links.append(i.get_attribute('href'))
            count += 1
        else:
            if count == 1:
                # Click the button
                time.sleep(4)
                input('Please click Load More Photos')
            body = driver.find_element_by_css_selector('body')
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(5)
            x = driver.find_elements_by_tag_name('a')
            for i in x:
                if i.get_attribute('title') == 'Download photo':
                    links.append(i.get_attribute('href'))
            count += 1
    links = list(set(links))  # drop duplicates collected across passes
    print('Found: %s Pictures to Download.' % str(len(links)))
    driver.quit()
    return links
def get_pictures(location):
    # Note: uses the module-level links list built by get_picture_links()
    print('Downloading....{} files, it should take around {} seconds'.format(len(links), len(links) * 4))
    driver = driver_download(location)
    for link in links:
        time.sleep(4)
        driver.get(link)
    time.sleep(20)  # give the last download time to finish
    driver.quit()
    print('Pictures have been downloaded..Renaming now')
def rename_pictures(location):
    # Rename the files
    os.chdir(location)
    files = os.listdir()
    files = [i for i in files if i.endswith(('.jpg', '.jpeg'))]
    count = 1
    for i in files:
        os.rename(i, str(count) + '.jpg')
        count += 1
    print('Everything done! Open the folder to see the files')
location = 'Blah'
url = 'https://unsplash.com/xxxx/xxxx' # Change to the required url
links = get_picture_links(url=url, location=location)
# Download the files
get_pictures(location=location)
# Rename the files
rename_pictures(location=location)
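As a possible shortcut not used in the script above, the collected links could also be fetched directly with requests instead of opening each one in Chrome; this sketch assumes each download URL serves the image bytes (or redirects to them, which requests follows by default), and the helper name is hypothetical.

import os
import requests

def download_with_requests(links, location):
    # Hypothetical alternative to get_pictures(); saves each link as 1.jpg, 2.jpg, ...
    for n, link in enumerate(links, start=1):
        resp = requests.get(link)
        with open(os.path.join(location, str(n) + '.jpg'), 'wb') as f:
            f.write(resp.content)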
Solution 3:[3]
You can replace the else branch where the user click is required with the snippet below. I use "from selenium.webdriver.common.by import By", but you can rewrite it in the older style as driver.find_element_by_xpath('/html/body/div/div/div[2]/div[5]/div[3]/div[1]/button').click()
else:
    if count == 1:
        driver.find_element(By.XPATH, '/html/body/div/div/div[2]/div[5]/div[3]/div[1]/button').click()
        time.sleep(4)
    body = driver.find_element_by_css_selector('body')
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(5)
    x = driver.find_elements_by_tag_name('a')
    for i in x:
        if i.get_attribute('title') == 'Download photo':
            links.append(i.get_attribute('href'))
    count += 1
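If that absolute XPath stops matching (it is tied to the exact page layout at the time of writing), one alternative is to locate the button by its visible label instead; the text 'Load more photos' is an assumption about the current Unsplash markup.

# Assumes the button's visible label is "Load more photos"; adjust if Unsplash changes it
load_more = driver.find_element(By.XPATH, "//button[contains(., 'Load more photos')]")
load_more.click()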
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | |
| Solution 2 | halfer |
| Solution 3 | user18771498 |
