diff --git a/.gitignore b/.gitignore
index 76f26c1..49a26e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,7 @@ params.yaml
 **/__pycache__/
 
 # Ignore pytest debug log file
-pytestdebug.log
\ No newline at end of file
+pytestdebug.log
+
+# Generated files
+/tmp_files/*
diff --git a/docs/big_objects_test.py b/docs/big_objects_test.py
new file mode 100644
index 0000000..6c0fce2
--- /dev/null
+++ b/docs/big_objects_test.py
@@ -0,0 +1,86 @@
+import pytest
+import logging
+from utils.utils import create_big_file
+from utils.crud import fixture_bucket_with_name, fixture_upload_multipart_file
+from boto3.s3.transfer import TransferConfig
+import uuid
+from tqdm import tqdm
+import os
+
+
+size_list = [
+    {'size': 10, 'unit': 'mb'},
+    {'size': 100, 'unit': 'mb'},
+    {'size': 1, 'unit': 'gb'},
+    {'size': 5, 'unit': 'gb'},
+    {'size': 10, 'unit': 'gb'},
+]
+
+ids_list = [f"{s['size']}{s['unit']}" for s in size_list]
+
+upload_params = [
+    {
+        'file_path': f"./tmp_files/big_file_download{size['size']}{size['unit']}",
+        'file_size': size,
+        'object_key': "big-object-" + uuid.uuid4().hex[:6],
+    }
+    for size in size_list
+]
+
+
+# Tests multipart download; implicitly tests the upload and deletion of big objects
+@pytest.mark.parametrize(
+    'params, fixture_upload_multipart_file',
+    [(p, p) for p in upload_params],
+    ids=ids_list,
+    indirect=['fixture_upload_multipart_file']
+)
+@pytest.mark.slow
+@pytest.mark.big_objects
+def test_multipart_download(s3_client, fixture_bucket_with_name, fixture_upload_multipart_file, params):
+    """
+    Test downloading a big object from an S3 bucket using multipart download
+    :param s3_client: fixture of boto3 s3 client
+    :param fixture_bucket_with_name: fixture to create a bucket with a unique name
+    :param fixture_upload_multipart_file: fixture that uploads the file and returns its size in bytes
+    :param params: dict: 'file_path': str, 'file_size': dict, 'object_key': str
+    :return: None
+    """
+
+    # Unpacking params
+    file_path = params.get('file_path')
+    download_path = file_path + "_downloaded"
+    object_key = params.get('object_key')
+
+    bucket_name = fixture_bucket_with_name
+    total_size = create_big_file(file_path, params.get('file_size'))
+
+    # Config for multithreaded multipart upload/download in boto3
+    config = TransferConfig(
+        multipart_threshold=40 * 1024 * 1024,
+        max_concurrency=10,
+        multipart_chunksize=8 * 1024 * 1024,
+        use_threads=True
+    )
+
+    # The upload already ran inside the fixture, which returns the uploaded size
+    uploaded_file_size = fixture_upload_multipart_file
+
+    # Test downloading the file from the s3 bucket
+    try:
+        # Graphing the download progress
+        with tqdm(total=total_size,
+                  desc=bucket_name,
+                  bar_format="Download| {percentage:.1f}%|{bar:25} | {rate_fmt} | Time: {elapsed} | {desc}",
+                  unit='B',
+                  unit_scale=True, unit_divisor=1024) as pbar:
+            s3_client.download_file(Bucket=bucket_name, Key=object_key, Filename=download_path, Config=config, Callback=pbar.update)
+
+        # Retrieving sizes
+        downloaded_file_size = os.path.getsize(download_path)
+
+        # The test passes only if the downloaded size matches the uploaded size
+        assert downloaded_file_size == uploaded_file_size, f"Downloaded size {downloaded_file_size} does not match uploaded size {uploaded_file_size}"
+    except Exception as e:
+        logging.error(f"Error downloading object {object_key}: {e}")
+        raise
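The `Callback=pbar.update` wiring in the test above works because boto3 invokes the transfer callback with the number of bytes moved since the previous call, which is exactly the delta `tqdm.update` expects. A minimal standalone sketch of the same pattern, with hypothetical bucket, key, and file names:

```python
import os
import boto3
from tqdm import tqdm
from boto3.s3.transfer import TransferConfig

# Hypothetical names, for illustration only
bucket, key, src = "example-bucket", "example-key", "./tmp_files/example.bin"

s3 = boto3.client("s3")
config = TransferConfig(
    multipart_threshold=8 * 1024 * 1024,   # files above 8 MiB use multipart
    max_concurrency=10,
    multipart_chunksize=8 * 1024 * 1024,
    use_threads=True,
)

with tqdm(total=os.path.getsize(src), unit="B",
          unit_scale=True, unit_divisor=1024) as pbar:
    # boto3 calls Callback with the byte count of each transferred chunk
    s3.upload_file(src, bucket, key, Config=config, Callback=pbar.update)
```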
diff --git a/docs/utils/crud.py b/docs/utils/crud.py
index bc75d6b..87e5fe4 100644
--- a/docs/utils/crud.py
+++ b/docs/utils/crud.py
@@ -1,8 +1,10 @@
 import logging
 import pytest
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from utils.utils import generate_valid_bucket_name
+from utils.utils import generate_valid_bucket_name, convert_unit
+from boto3.s3.transfer import TransferConfig
 import os
+from tqdm import tqdm
 
 
 ### Functions
@@ -222,6 +224,7 @@ def fixture_bucket_with_name(s3_client, request):
     # This fixtures automatically creates a bucket based on the name of the test that called it and then returns its name
     # Lastly, teardown the bucket by deleting it and its objects
 
+    # request.node.name is the name of the test currently running
     bucket_name = generate_valid_bucket_name(request.node.name.replace("_", "-"))
     create_bucket(s3_client, bucket_name)
 
@@ -249,3 +252,38 @@ def fixture_upload_multiple_objects(s3_client, fixture_bucket_with_name, request
     objects_names = [{"key": f"multiple-object'-{i}", "path": path} for i in range(qnt)]
 
     return upload_objects_multithreaded(s3_client, fixture_bucket_with_name, objects_names)
+
+@pytest.fixture
+def fixture_upload_multipart_file(s3_client, fixture_bucket_with_name, request) -> int:
+    """
+    Uploads a big file to an s3 bucket in multiple chunks
+    :param s3_client: boto3 s3 client
+    :param fixture_bucket_with_name: pytest fixture which sets up and tears down the bucket
+    :param request: dict: contains file_path, file_size and object_key
+    :return int: size in bytes of the object
+    """
+    bucket_name = fixture_bucket_with_name
+    file_path = request.param.get('file_path')
+    file_size = convert_unit(request.param.get('file_size'))
+    object_key = request.param.get('object_key')
+
+    # Config for multithreaded multipart upload/download in boto3
+    config = TransferConfig(
+        multipart_threshold=8 * 1024 * 1024,  # Minimum size to start multipart upload
+        max_concurrency=10,
+        multipart_chunksize=8 * 1024 * 1024,
+        use_threads=True
+    )
+
+    # Upload progress bar with timestamp
+    with tqdm(total=file_size,
+              desc=bucket_name,
+              bar_format="Upload| {percentage:.1f}%|{bar:25}| {rate_fmt} | Time: {elapsed} | {desc}",
+              unit='B',
+              unit_scale=True, unit_divisor=1024) as pbar:
+        s3_client.upload_file(file_path, bucket_name, object_key, Config=config, Callback=pbar.update)
+
+    # Checking that the object was uploaded; head_object avoids fetching the body
+    object_size = s3_client.head_object(Bucket=bucket_name, Key=object_key).get('ContentLength', 0)
+
+    return object_size  # int: size in bytes
\ No newline at end of file
diff --git a/docs/utils/utils.py b/docs/utils/utils.py
index 783a51a..e4889b1 100644
--- a/docs/utils/utils.py
+++ b/docs/utils/utils.py
@@ -1,4 +1,6 @@
 import uuid
+import os
+import logging
 
 
 # Function is responsible to check and format bucket names into valid ones
@@ -25,4 +27,56 @@ def generate_valid_bucket_name(base_name="my-unique-bucket"):
 
             new_name.append(char)
 
-    return "".join(new_name)
\ No newline at end of file
+    return "".join(new_name)
+
+
+def convert_unit(size=None) -> int:
+    """
+    Converts a dict containing an int size and a str unit into an int size in bytes
+    :param size: dict: {'size': int, 'unit': ('kb', 'mb', 'gb')}
+    :return: int: value of size in bytes
+    """
+    if size is None:
+        size = {'size': 100, 'unit': 'mb'}
+
+    units_dict = {
+        'kb': 1024,
+        'mb': 1024 * 1024,
+        'gb': 1024 * 1024 * 1024,
+    }
+
+    unit = size['unit'].lower()
+
+    # Check that the unit is one we can convert
+    if unit not in units_dict:
+        raise ValueError(f"Invalid unit: {size['unit']}")
+
+    return size['size'] * units_dict[unit]
+
+
+def create_big_file(file_path: str, size=None) -> int:
+    """
+    Create a big file of the specified size under ./tmp_files
+    :param file_path: str: path of the file to create
+    :param size: dict: {'size': int, 'unit': ('kb', 'mb', 'gb')}
+    :return: int: size of the created file in bytes
+    """
+    if size is None:
+        size = {'size': 100, 'unit': 'mb'}
+
+    total_size = convert_unit(size)
+
+    if not os.path.exists('./tmp_files'):
+        os.mkdir('./tmp_files')
+
+    if not os.path.exists(file_path):
+        # Write in chunks so multi-GB files do not need to fit in memory
+        chunk_size = 64 * 1024 * 1024
+        remaining = total_size
+        with open(file_path, 'wb') as f:
+            while remaining > 0:
+                written = min(chunk_size, remaining)
+                f.write(os.urandom(written))
+                remaining -= written
+
+    return total_size
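The `fixture_upload_multipart_file` fixture above relies on pytest's indirect parametrization: because the fixture is named in `indirect`, each parametrized value arrives at the fixture as `request.param` instead of being passed straight to the test. A minimal sketch of that mechanism, with hypothetical fixture and test names:

```python
import pytest

@pytest.fixture
def sized_payload(request):
    # With indirect parametrization, the parametrize value arrives
    # here as request.param instead of going directly to the test.
    return b"x" * request.param

@pytest.mark.parametrize("sized_payload", [1024, 2048], indirect=["sized_payload"])
def test_payload_size(sized_payload):
    assert len(sized_payload) in (1024, 2048)
```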
diff --git a/pyproject.toml b/pyproject.toml
index 1595151..5acf31b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "pytest-rerunfailures<16.0,>=15.0",
     "pytest-xdist<4.0.0,>=3.6.1",
     "pytest-repeat<1.0.0,>=0.9.3",
+    "tqdm>=4.67.1",
 ]
 name = "s3-specs"
 version = "0.1.0"
@@ -50,6 +51,7 @@ markers = [
     "bucket_versioning: Bucket Versioning",
     "cli: Tests using CLI",
     "multiple_objects: Tests involving operations with multiple objects on the same bucket",
+    "big_objects: Tests with objects larger than 5 MB",
     "rapid: quick expected execution magnitude",
     "regular: regular time expected execution magnitude",
     "slow: slow expected execution magnitude",
diff --git a/uv.lock b/uv.lock
index 644b876..26d908b 100644
--- a/uv.lock
+++ b/uv.lock
@@ -267,7 +267,7 @@ name = "ipykernel"
 version = "6.29.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "appnope", marker = "platform_system == 'Darwin'" },
+    { name = "appnope", marker = "sys_platform == 'darwin'" },
     { name = "comm" },
     { name = "debugpy" },
     { name = "ipython" },
@@ -958,6 +958,7 @@ dependencies = [
     { name = "pytest-rerunfailures" },
     { name = "pytest-xdist" },
     { name = "requests" },
+    { name = "tqdm" },
     { name = "uuid" },
 ]
 
@@ -978,6 +979,7 @@ requires-dist = [
     { name = "pytest-rerunfailures", specifier = ">=15.0,<16.0" },
     { name = "pytest-xdist", specifier = ">=3.6.1,<4.0.0" },
     { name = "requests", specifier = ">=2.32.3,<3.0.0" },
+    { name = "tqdm", specifier = ">=4.67.1" },
     { name = "uuid", specifier = ">=1.30,<2.0" },
 ]
 
@@ -1062,6 +1064,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/61/cc/58b1adeb1bb46228442081e746fcdbc4540905c87e8add7c277540934edb/tornado-6.4.2-cp38-abi3-win_amd64.whl", hash = "sha256:908b71bf3ff37d81073356a5fadcc660eb10c1476ee6e2725588626ce7e5ca38", size = 438907 },
 ]
 
+[[package]]
+name = "tqdm"
+version = "4.67.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540 },
+]
+
 [[package]]
 name = "traitlets"
 version = "5.14.3"
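With the `big_objects` marker registered in `pyproject.toml`, the new tests can be selected by a marker expression. A quick way to do this from Python, assuming the `docs/` path matches where `big_objects_test.py` lives in this diff:

```python
import pytest

# Equivalent to running `pytest -m "big_objects and slow" docs/` from a shell;
# both markers are applied to test_multipart_download above.
exit_code = pytest.main(["-m", "big_objects and slow", "docs/"])
```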