Stream download S3 files, zip them, and stream the zip file back to S3 - Python

People upload files to an S3 bucket, and I need to be able to programmatically zip certain files.

I am doing this in Fargate, and often the files that need to be zipped total over 300 GB in aggregate. It is therefore important that the files are streamed from S3 and the zip file is streamed back to S3, since there is not enough disk space or memory to hold everything at once.
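The download half I believe I understand: get_object returns a StreamingBody that can be consumed in chunks, so a source file never has to fit in memory. A minimal illustration of the reading side, with placeholder bucket and key names:

import boto3

s3 = boto3.client('s3')

# StreamingBody.read(n) returns at most n bytes, so the object is
# consumed incrementally rather than loaded whole
body = s3.get_object(Bucket='x', Key='file1.jpg')['Body']
for chunk in iter(lambda: body.read(16384), b''):
    ...  # hand each 16 KB chunk onward

It is wiring this into zipfile and the upload side that I cannot get right.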

I have found two answers here on Stack Overflow, but neither has worked, and I have not been able to figure out why after trying to troubleshoot.

The first is:

from io import RawIOBase
from zipfile import ZipFile
from zipfile import ZipInfo
from zipfile import ZIP_DEFLATED

import boto3

session = boto3.Session(aws_access_key_id='x', aws_secret_access_key='x', region_name='us-east-2')

s3 = session.client('s3')  # use the session's credentials, not the default chain
bucket_name = 'x'

class UnseekableStream(RawIOBase):
    # Write-only buffer: ZipFile writes into it and get() drains whatever
    # has accumulated since the last call.
    def __init__(self):
        self._buffer = b''

    def writable(self):
        return True

    def write(self, b):
        if self.closed:
            raise ValueError('The stream was closed!')
        self._buffer += b
        return len(b)

    def get(self):
        chunk = self._buffer
        self._buffer = b''
        return chunk

def zipfile_generator(path, stream):
    # Writes one local file into the archive, yielding whatever bytes
    # ZipFile has produced so far so they can be forwarded immediately.
    with ZipFile(stream, mode='w') as zip_archive:
        z_info = ZipInfo.from_file(path)
        z_info.compress_type = ZIP_DEFLATED
        with open(path, 'rb') as entry, zip_archive.open(z_info, mode='w') as dest:
            for chunk in iter(lambda: entry.read(16384), b''):
                dest.write(chunk)
                yield stream.get()
    # the final yield must sit outside the with-block so the central
    # directory that ZipFile writes on close is also drained
    yield stream.get()

items_to_zip = ['file1.jpg', 'file2.jpg', 'file3.jpg']

stream = UnseekableStream()
with open("test.zip", "wb") as f:
    for file in items_to_zip:
        obj = s3.get_object(Bucket=bucket_name, Key=file)
        for i in zipfile_generator(obj.get(obj), stream):
            f.write(i)
            f.flush()
stream.close()

This one fails on the line for i in zipfile_generator(obj.get(obj), stream): with TypeError: unhashable type: 'dict'
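As far as I can tell, get_object returns a plain dict, so obj.get(obj) calls dict.get with the dict itself as the key, and a dict is unhashable, hence the error. Beyond that, zipfile_generator expects a local path (it calls ZipInfo.from_file and open(path, 'rb')), so nothing from the S3 response can be passed to it directly. The rework I have been experimenting with reads from the StreamingBody instead; this is an untested sketch using the same imports as above, and zipfile_generator_s3 is my own variant rather than anything from the original answer:

def zipfile_generator_s3(s3_client, bucket, keys, stream):
    # A single ZipFile has to wrap *all* entries; opening a new archive
    # per file and concatenating the output is not one valid zip.
    with ZipFile(stream, mode='w') as zip_archive:
        for key in keys:
            # ZipInfo.from_file needs a local file, so build the header by hand
            z_info = ZipInfo(key)
            z_info.compress_type = ZIP_DEFLATED
            body = s3_client.get_object(Bucket=bucket, Key=key)['Body']
            # force_zip64 because individual entries may exceed 4 GiB
            with zip_archive.open(z_info, mode='w', force_zip64=True) as dest:
                for chunk in iter(lambda: body.read(16384), b''):
                    dest.write(chunk)
                    yield stream.get()
    # drain the central directory written when ZipFile closes
    yield stream.get()

The caller then becomes for i in zipfile_generator_s3(s3, bucket_name, items_to_zip, stream): with the per-file get_object call removed, but I have not been able to verify this end to end.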

The second is:

import boto3
import smart_open
from smart_open import s3

session = boto3.Session()
source_bucket_name = "x"
bucket = session.resource('s3').Bucket(source_bucket_name)
prefix = "xx" # s3 prefix for the files under a "folder"

output_path = "s3://xx/streamedzip.zip"

with smart_open.open(output_path, 'wb') as fout:
    for key, content in s3.iter_bucket(source_bucket_name, prefix=prefix):
        fout.write(content)

This one does upload a file back to S3, but the result appears to be a corrupted zip file.
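My suspicion is that iter_bucket yields the raw bytes of each object, so fout.write(content) just concatenates the files with no zip headers or central directory around them, which would explain the corruption. What I plan to try next combines the two approaches: the UnseekableStream and zipfile_generator_s3 from the first attempt produce the zip bytes incrementally, and smart_open turns them into a multipart upload. Again an untested sketch, with placeholder keys:

s3_client = session.client('s3')

# the keys could equally come from a list_objects_v2 paginator; these
# are placeholders
keys = ["xx/file1.jpg", "xx/file2.jpg"]

stream = UnseekableStream()
with smart_open.open(output_path, 'wb') as fout:
    # each piece is whatever ZipFile has emitted since the last yield;
    # smart_open buffers the writes into multipart parts, so neither the
    # inputs nor the archive are ever held in full locally
    for piece in zipfile_generator_s3(s3_client, source_bucket_name, keys, stream):
        fout.write(piece)
stream.close()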

I am lost as to where to go from here.

Many thanks


