How to give a custom name to images when downloading through Scrapy
This is my program to download images through the image pipeline. It works well and downloads the images, but the problem is that it renames each image to its SHA1 hash, after which I am unable to identify them. Is there a solution that lets me use the **model_name** as the file name of the downloaded images?
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import webdriver
from urlparse import urljoin
import time
class CompItem(scrapy.Item):
    """Container for one scraped product and the images attached to it."""
    model_name = scrapy.Field()   # product/model title shown on the page
    images = scrapy.Field()       # populated by the images pipeline after download
    image_urls = scrapy.Field()   # source URLs handed to the images pipeline
    image_name = scrapy.Field()   # desired file name for the stored images
class criticspider(CrawlSpider):
    """Render each product page with Selenium (the gallery is built by
    JavaScript) and yield one CompItem per page carrying the product name
    and every gallery image URL."""
    name = "buysmaart_images"
    # allowed_domains must contain bare domain names, not URLs; with a full
    # URL here the offsite middleware filters out every request.
    allowed_domains = ["buysmaart.com"]
    start_urls = ["http://buysmaart.com/productdetails/550/Samsung-Galaxy-Note-4", "http://buysmaart.com/productdetails/115/HTC-One-M8-Eye", "http://buysmaart.com/productdetails/506/OPPO-N1", "http://buysmaart.com/productdetails/342/LG-G2-D802T"]

    def __init__(self, *args, **kwargs):
        super(criticspider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(2)

    def parse_start_url(self, response):
        """Load the URL in Firefox, wait for the JS gallery, then collect the
        product name and all image URLs into a single item."""
        self.browser.get(response.url)
        time.sleep(8)  # crude wait for the AngularJS gallery to populate
        sel = Selector(text=self.browser.page_source)

        item = CompItem()
        photos = sel.xpath('//ul[contains(@id,"productImageUl")]/li')
        print(len(photos))

        # The product name is a page-level value; extract it once instead of
        # re-running the same XPath on every loop iteration as before.
        item['image_name'] = sel.xpath(
            './/h3[contains(@class,"ng-binding")]/text()'
        ).extract()[0].encode('ascii', 'ignore')

        item['image_urls'] = [
            photo.xpath('.//img/@src').extract()[0] for photo in photos
        ]
        yield item
pipelines.py
import re

from scrapy.contrib.pipeline.images import DownloadImagesPipeline, ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class DownloadImagesPipeline(ImagesPipeline):
    """Images pipeline that stores each file under the spider-supplied
    item['image_name'] instead of the default SHA1-hash file name.

    Fixes relative to the question's code:
    - the dangling, body-less ``process_item`` that swallowed the other
      methods is removed (ImagesPipeline provides item processing);
    - the class extends ImagesPipeline — extending ``object`` made the
      ``super().get_images`` call impossible;
    - the request meta key is ``'image_name'`` throughout; the original
      wrote ``'image_names'`` but read ``'image_name'`` (KeyError);
    - the hash regex no longer allows commas or any-character dots.
    """

    def get_media_requests(self, item, info):
        # Attach the desired file name to every download request so that
        # get_images()/change_filename() can read it from response.meta.
        return [Request(x, meta={'image_name': item["image_name"]})
                for x in item.get('image_urls', [])]

    def get_images(self, response, request, info):
        # Default keys are lowercase hex + '.jpg'; rename only those so an
        # already-customised key is left untouched.
        for key, image, buf in super(DownloadImagesPipeline, self).get_images(response, request, info):
            if re.match(r'^[0-9a-f]+\.jpg$', key):
                key = self.change_filename(key, response)
            yield key, image, buf

    def change_filename(self, key, response):
        """Return '<image_name>.jpg' using the name carried in request meta."""
        return "%s.jpg" % response.meta['image_name'][0]

    def item_completed(self, results, item, info):
        """Drop items whose downloads all failed; record stored paths.

        NOTE(review): 'image_paths' is not declared on CompItem — add a
        Field for it (or reuse 'images') before running this.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
settings.py
BOT_NAME = 'download_images'

SPIDER_MODULES = ['download_images.spiders']
NEWSPIDER_MODULE = 'download_images.spiders'

# ITEM_PIPELINES must be a dict mapping pipeline path -> order, and it must
# point at the CUSTOM pipeline: the original listed the stock
# scrapy.contrib.pipeline.images.ImagesPipeline, so the renaming pipeline in
# pipelines.py was never invoked and files kept their SHA1 names.
ITEM_PIPELINES = {'download_images.pipelines.DownloadImagesPipeline': 1}
IMAGES_STORE = '/home/john/Desktop/download_images/31_jul'
Solution 1:[1]
Scrapy 1.3.3 solution(override image_downloaded methods):
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that persists each file under the spider-provided
    item['image_names'] instead of the default SHA1-hash path."""

    def get_media_requests(self, item, info):
        # Carry the desired file name in request.meta so image_downloaded()
        # can read it back when the download finishes.
        requests = []
        for image_url in item['image_urls']:
            requests.append(
                scrapy.Request(image_url,
                               meta={'image_names': item["image_names"]}))
        return requests

    def image_downloaded(self, response, request, info):
        """Persist every generated image variant under the custom name and
        return the MD5 checksum of the original buffer."""
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            # Swap the hash-based path for the spider-supplied name.
            path = 'full/%s' % response.meta['image_names'][0]
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum
Solution 2:[2]
The solution is to override the image_key method of your DownloadImagesPipeline class.
def image_key(self, url):
    """Deprecated ImagesPipeline hook: return the storage key for *url*.

    A fixed name is returned purely to demonstrate the override point;
    derive the key from *url* (e.g. url.split('/')[-1]) in real code.
    """
    fixed_name = 'image_name.here'
    return fixed_name
For example if you want the image name of the URL you can use
url.split('/')[-1]
as the name of the image. Note that this method is deprecated and may be removed in a future release.
Alternatively you can set the image_name for your image in your Spider:
item['image_name'] = ['whatever_you_want']
In this case you have to extend your pipeline a bit more to utilize the name of the image you provided:
def get_media_requests(self, item, info):
    """Schedule one download request per image URL, carrying the desired
    file name in request.meta so change_filename() can use it later.

    NOTE: the meta key must match what change_filename() reads
    (response.meta['image_name']); the original used 'image_names' here,
    which made the rename step raise KeyError.
    """
    return [Request(x, meta={'image_name': item["image_name"]})
            for x in item.get('image_urls', [])]
def get_images(self, response, request, info):
    """Yield (key, image, buffer) triples from the parent pipeline,
    renaming any default SHA1-hash key to the spider-supplied name."""
    import re  # local import: the surrounding snippet never imports re
    # Default keys are pure lowercase hex + '.jpg'.  The original pattern
    # '^[0-9,a-f]+.jpg$' wrongly allowed commas and let '.' match any char.
    hash_key = re.compile(r'^[0-9a-f]+\.jpg$')
    for key, image, buf in super(DownloadImagesPipeline, self).get_images(response, request, info):
        if hash_key.match(key):
            key = self.change_filename(key, response)
        yield key, image, buf
def change_filename(self, key, response):
    """Return '<image_name>.jpg', taking the name from the request meta
    that get_media_requests() attached; *key* (the hash name) is unused."""
    custom_name = response.meta['image_name'][0]
    return "%s.jpg" % custom_name
And of course your pipeline should extend ImagesPipeline.
Solution 3:[3]
This answer shows how to use custom image names, as well as how to save the images into custom-named folders.
#spider.py
import scrapy
from ..items import DusharaItem
class DusharaSpider(scrapy.Spider):
    """Scrape gallery photos and emit one item per image, carrying the
    folder names and file name that DusharaPipeline uses for storage."""
    name = 'dushara'
    start_urls = ['https://www.indiaglitz.com/dushara-photos-tamil-actress-3129970-8771']

    def parse(self, response):
        gallery = response.xpath('//div[@class="gallmain gallerycontainer-8771"]/div[@class="gallery_detail gal-8771"]')
        for entry in gallery:
            # Single extraction reused for both the URL and the file name.
            src = entry.xpath('./img/@src').extract_first()
            item = DusharaItem()
            item['image_urls'] = src
            # item['image_urls'] = [src]  # for default scraping process
            item['folder_names_1'] = 'Actress'
            item['folder_names_2'] = 'Tamil'
            # Keep the original file name, extension included (e.g. .jpg).
            item['image_names'] = src.split('/')[-1]
            yield item
#items.py
import scrapy
class DusharaItem(scrapy.Item):
    """Item holding one image URL plus the folder/file names used to store it."""
    image_urls = scrapy.Field()      # single URL string (custom pipeline contract)
    images = scrapy.Field()          # filled in by the images pipeline
    folder_names_1 = scrapy.Field()  # first-level folder, e.g. 'Actress'
    folder_names_2 = scrapy.Field()  # second-level folder, e.g. 'Tamil'
    image_names = scrapy.Field()     # file name including extension
#pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class DusharaPipeline(ImagesPipeline):
    """ImagesPipeline variant that stores each image under
    /<folder_names_1>/<folder_names_2>/<image_names> inside IMAGES_STORE."""

    def get_media_requests(self, item, info):
        # Forward the folder/file names through request.meta so file_path()
        # can see them when the download completes.
        meta = {
            'folder_names_1': item['folder_names_1'],
            'folder_names_2': item['folder_names_2'],
            'image_names': item['image_names'],
        }
        yield scrapy.Request(url=item['image_urls'], meta=meta)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Build the storage path from the meta attached at request time."""
        parts = [request.meta['folder_names_1'],
                 request.meta['folder_names_2'],
                 request.meta['image_names']]
        return '/' + '/'.join(parts)
#settings.py
# Route items through the custom renaming pipeline (priority 300).
ITEM_PIPELINES = {'dushara.pipelines.DusharaPipeline': 300}
# Swap to the stock pipeline to get default SHA1-named files instead:
#ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1} # for default scraping process
# Root directory under which file_path() results are stored.
IMAGES_STORE = r'D:\Scraped'
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Devin |
| Solution 2 | GHajba |
| Solution 3 | moorthypnt |
