'Track download progress of S3 file using boto3 and callbacks
I am trying to download a text file from S3 using boto3.
Here is what I have written.
class ProgressPercentage(object):
def __init__(self, filename):
self._filename = filename
self._size = float(os.path.getsize(filename))
self._seen_so_far = 0
self._lock = threading.Lock()
def __call__(self, bytes_amount):
# To simplify we'll assume this is hooked up
# to a single filename.
with self._lock:
self._seen_so_far += bytes_amount
percentage = round((self._seen_so_far / self._size) * 100,2)
LoggingFile('{} is the file name. {} out of {} done. The percentage completed is {} %'.format(str(self._filename), str(self._seen_so_far), str(self._size),str(percentage)))
sys.stdout.flush()
and I am calling it using
transfer.download_file(BUCKET_NAME,FILE_NAME,'{}{}'.format(LOCAL_PATH_TEMP , FILE_NAME),callback = ProgressPercentage(LOCAL_PATH_TEMP + FILE_NAME))
this is giving me a error that file is not present in the folder. Apparently when I already have a file with this name in the same folder it works but when I am downloading a fresh file , it errors out.
What is correction I need to make?
Solution 1:[1]
Install progressbar with pip3 install progressbar
import boto3, os
import progressbar
bucket_name = "<your-s3-bucket-name>"
folder_name = "<your-directory-name-locally>"
file_name = "<your-filename-locally>"
path = folder_name + "/" + file_name
s3 = boto3.client('s3', aws_access_key_id="<your_aws_access_key_id>", aws_secret_access_key="<your_aws_secret_access_key>")
statinfo = os.stat(file_name)
up_progress = progressbar.progressbar.ProgressBar(maxval=statinfo.st_size)
up_progress.start()
def upload_progress(chunk):
up_progress.update(up_progress.currval + chunk)
s3.upload_file(file_name, bucket_name, path, Callback=upload_progress)
up_progress.finish()
Solution 2:[2]
This is my implementation. No other dependencies, hack up the progress callback function to display whatever you want.
import sys
import boto3
s3_client = boto3.client('s3')
def download(local_file_name, s3_bucket, s3_object_key):
meta_data = s3_client.head_object(Bucket=s3_bucket, Key=s3_object_key)
total_length = int(meta_data.get('ContentLength', 0))
downloaded = 0
def progress(chunk):
nonlocal downloaded
downloaded += chunk
done = int(50 * downloaded / total_length)
sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) )
sys.stdout.flush()
print(f'Downloading {s3_object_key}')
with open(local_file_name, 'wb') as f:
s3_client.download_fileobj(s3_bucket, s3_object_key, f, Callback=progress)
e.g.
local_file_name = 'test.csv'
s3_bucket = 'my-bucket'
s3_object_key = 'industry/test.csv'
download(local_file_name, s3_bucket, s3_object_key)
Demo:
Tested with boto3>=1.14.19, python>=3.7
Solution 3:[3]
Following the official document, it is not quite difficult to apply progress tracking (download_file and upload_file functions are similar). Here is the full code with some modifications to see the data size in preferred manner.
import logging
import boto3
from botocore.exceptions import ClientError
import os
import sys
import threading
import math
ACCESS_KEY = 'xxx'
SECRET_KEY = 'xxx'
REGION_NAME= 'ap-southeast-1'
class ProgressPercentage(object):
def __init__(self, filename, filesize):
self._filename = filename
self._size = filesize
self._seen_so_far = 0
self._lock = threading.Lock()
def __call__(self, bytes_amount):
def convertSize(size):
if (size == 0):
return '0B'
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i = int(math.floor(math.log(size,1024)))
p = math.pow(1024,i)
s = round(size/p,2)
return '%.2f %s' % (s,size_name[i])
# To simplify, assume this is hooked up to a single filename
with self._lock:
self._seen_so_far += bytes_amount
percentage = (self._seen_so_far / self._size) * 100
sys.stdout.write(
"\r%s %s / %s (%.2f%%) " % (
self._filename, convertSize(self._seen_so_far), convertSize(self._size),
percentage))
sys.stdout.flush()
def download_file(file_name, object_name, bucket_name):
# If S3 object_name was not specified, use file_name
if object_name is None:
object_name = file_name
# Initialize s3 client
s3_client = boto3.client(service_name="s3",
aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY,
region_name=REGION_NAME)
try:
response = s3_client.download_file(
Bucket=bucket_name,
Key=object_name,
Filename=file_name,
Callback=ProgressPercentage(file_name, (s3_client.head_object(Bucket=bucket_name, Key=object_name))["ContentLength"])
)
except ClientError as e:
logging.error(e)
return False
return True
file_name = "./output.csv.gz"
bucket_name = "mybucket"
object_name = "result/output.csv.gz"
download_file(file_name, object_name, bucket_name )
Solution 4:[4]
The object client.head_object(Bucket=bucket, Key=filename) is a dict. The file size can be accessed using ['ContentLength'].
Hence the code:self._size = client.head_object(Bucket=bucket, Key=filename).ContentLength
should become:self._size = float(client.head_object(Bucket=bucket, Key=filename)['ContentLength'])
Then it works. Thanks!
Solution 5:[5]
Someone may stumble upon this answer when trying to do this (As per the question title). The easiest way I know to show s3 upload progress:
import a progress bar library into your project. This is what I used: https://github.com/anler/progressbar
Then:
import progressbar
from hurry.filesize import size
import boto3
bucket = "my-bucket-name"
s3_client = boto3.resource('s3')
...
...
# you get the filesize from wherever you have the file on. your system maybe?
filesize = size(file)
up_progress = progressbar.AnimatedProgressBar(end=filesize, width=50)
def upload_progress(chunk):
up_progress + chunk # Notice! No len()
up_progress.show_progress()
s3_client.meta.client.upload_file(file, bucket, s3_file_name, Callback=upload_progress)
The important thing to notice here is the use of the Callback parameter(capital C). It basically returns the number of bytes uploaded to s3. So if you know the original filesize, some simple math gets you a progress bar. You can then use any progress bar library.
Solution 6:[6]
Info
- Credits to
@Kshitij Marwah,@yummiesandnicolas.f.gposts - Using boto3
1.9.96(dl viapip) - Removed
threading - Changed display format (rewrite line above until dl completed)
- Posting because difference b/w online doc and downloaded package
code
class ProgressPercentage(object):
def __init__(self, o_s3bucket, key_name):
self._key_name = key_name
boto_client = o_s3bucket.meta.client
# ContentLength is an int
self._size = boto_client.head_object(Bucket=o_s3bucket.name, Key=key_name)['ContentLength']
self._seen_so_far = 0
sys.stdout.write('\n')
def __call__(self, bytes_amount):
self._seen_so_far += bytes_amount
percentage = (float(self._seen_so_far) / float(self._size)) * 100
TERM_UP_ONE_LINE = '\033[A'
TERM_CLEAR_LINE = '\033[2K'
sys.stdout.write('\r' + TERM_UP_ONE_LINE + TERM_CLEAR_LINE)
sys.stdout.write('{} {}/{} ({}%)\n'.format(self._key_name, str(self._seen_so_far), str(self._size), str(percentage)))
sys.stdout.flush()
Then called it like that
Note the capital C on Callback (that differs from online doc)
progress = ProgressPercentage(o_s3bucket, key_name)
o_s3bucket.download_file(key_name, full_local_path, Callback=progress)
where o_s3bucket is :
bucket_name = 'my_bucket_name'
aws_profile = 'default' # this is used to catch creds from .aws/credentials ini file
boto_session = boto3.session.Session(profile_name=aws_profile)
o_s3bucket = boto_session.resource('s3').Bucket(bucket_name)
hth
Solution 7:[7]
Here is an option I've found useful for with the use of click (just run pip install click before applying code below) library:
import click
import boto3
import os
file_path = os.path.join('tmp', 'file_path')
s3_client = boto3.resource('s3')
with click.progressbar(length=os.path.getsize(file_path)) as progress_bar:
with open(file_path, mode='rb') as upload_file:
s3_client.upload_fileobj(
upload_file,
'bucket_name',
'foo_bar',
Callback=progress_bar.update
)
Solution 8:[8]
Here's another simple custom implementation using tqdm:
from tqdm import tqdm
import boto3
def s3_download(s3_bucket, s3_object_key, local_file_name, s3_client=boto3.client('s3')):
meta_data = s3_client.head_object(Bucket=s3_bucket, Key=s3_object_key)
total_length = int(meta_data.get('ContentLength', 0))
with tqdm(total=total_length, desc=f'source: s3://{s3_bucket}/{s3_object_key}', bar_format="{percentage:.1f}%|{bar:25} | {rate_fmt} | {desc}", unit='B', unit_scale=True, unit_divisor=1024) as pbar:
with open(local_file_name, 'wb') as f:
s3_client.download_fileobj(s3_bucket, s3_object_key, f, Callback=pbar.update)
usage:
s3_download(bucket, key, local_file_name)
output:
100.0%|????????????????????????? | 12.9MB/s | source: s3://my-bucket/my-key
Solution 9:[9]
Here is code
try:
import logging
import boto3
from botocore.exceptions import ClientError
import os
import sys
import threading
import math
import re
from boto3.s3.transfer import TransferConfig
except Exception as e:
pass
ACCESS_KEY = 'XXXXXXXXXXXXXXXXX'
SECRET_KEY = 'XXXXXXXXXXXXXXXX'
REGION_NAME= 'us-east-1'
BucketName = "XXXXXXXXXXXXXXXX"
KEY = "XXXXXXXXXXXXXXXX"
class Size:
@staticmethod
def convert_size(size_bytes):
if size_bytes == 0:
return "0B"
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i = int(math.floor(math.log(size_bytes, 1024)))
p = math.pow(1024, i)
s = round(size_bytes / p, 2)
return "%s %s" % (s, size_name[i])
class ProgressPercentage(object):
def __init__(self, filename, filesize):
self._filename = filename
self._size = filesize
self._seen_so_far = 0
self._lock = threading.Lock()
def __call__(self, bytes_amount):
def convertSize(size):
if (size == 0):
return '0B'
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i = int(math.floor(math.log(size,1024)))
p = math.pow(1024,i)
s = round(size/p,2)
return '%.2f %s' % (s,size_name[i])
# To simplify, assume this is hooked up to a single filename
with self._lock:
self._seen_so_far += bytes_amount
percentage = (self._seen_so_far / self._size) * 100
sys.stdout.write(
"\r%s %s / %s (%.2f%%) " % (
self._filename, convertSize(self._seen_so_far), convertSize(self._size),
percentage))
sys.stdout.flush()
class AWSS3(object):
"""Helper class to which add functionality on top of boto3 """
def __init__(self, bucket, aws_access_key_id, aws_secret_access_key, region_name):
self.BucketName = bucket
self.client = boto3.client(
"s3",
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
region_name=region_name,
)
def get_size_of_files(self, Key):
response = self.client.head_object(Bucket=self.BucketName, Key=Key)
size = response["ContentLength"]
return {"bytes": size, "size": Size.convert_size(size)}
def put_files(self, Response=None, Key=None):
"""
Put the File on S3
:return: Bool
"""
try:
response = self.client.put_object(
ACL="private", Body=Response, Bucket=self.BucketName, Key=Key
)
return "ok"
except Exception as e:
print("Error : {} ".format(e))
return "error"
def item_exists(self, Key):
"""Given key check if the items exists on AWS S3 """
try:
response_new = self.client.get_object(Bucket=self.BucketName, Key=str(Key))
return True
except Exception as e:
return False
def get_item(self, Key):
"""Gets the Bytes Data from AWS S3 """
try:
response_new = self.client.get_object(Bucket=self.BucketName, Key=str(Key))
return response_new["Body"].read()
except Exception as e:
print("Error :{}".format(e))
return False
def find_one_update(self, data=None, key=None):
"""
This checks if Key is on S3 if it is return the data from s3
else store on s3 and return it
"""
flag = self.item_exists(Key=key)
if flag:
data = self.get_item(Key=key)
return data
else:
self.put_files(Key=key, Response=data)
return data
def delete_object(self, Key):
response = self.client.delete_object(Bucket=self.BucketName, Key=Key,)
return response
def get_all_keys(self, Prefix=""):
"""
:param Prefix: Prefix string
:return: Keys List
"""
try:
paginator = self.client.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=self.BucketName, Prefix=Prefix)
tmp = []
for page in pages:
for obj in page["Contents"]:
tmp.append(obj["Key"])
return tmp
except Exception as e:
return []
def print_tree(self):
keys = self.get_all_keys()
for key in keys:
print(key)
return None
def find_one_similar_key(self, searchTerm=""):
keys = self.get_all_keys()
return [key for key in keys if re.search(searchTerm, key)]
def __repr__(self):
return "AWS S3 Helper class "
def download_file(self,file_name, object_name):
try:
response = self.client.download_file(
Bucket=self.BucketName,
Key=object_name,
Filename=file_name,
Config=TransferConfig(
max_concurrency=10,
use_threads=True
),
Callback=ProgressPercentage(file_name,
(self.client.head_object(Bucket=self.BucketName,
Key=object_name))["ContentLength"])
)
except ClientError as e:
logging.error(e)
return False
return True
helper = AWSS3(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY, bucket=BucketName, region_name='us-east-1')
helper.download_file(file_name='test.zip', object_name=KEY)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | |
| Solution 2 | |
| Solution 3 | |
| Solution 4 | nicolas.f.g |
| Solution 5 | Emmanuel N K |
| Solution 6 | Boop |
| Solution 7 | Andriy Ivaneyko |
| Solution 8 | shredPilot |
| Solution 9 | Soumil Nitin Shah |

