
Big objects #51

Open · wants to merge 12 commits into main
5 changes: 4 additions & 1 deletion .gitignore
@@ -7,4 +7,7 @@ params.yaml
**/__pycache__/

# Ignore pytest debug log file
pytestdebug.log

# Generated files
Collaborator:

What do you think about using Linux's own /tmp path, instead of a local folder in the project?

/tmp_files/*
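
A minimal sketch of the reviewer's suggestion (hypothetical; uses the stdlib tempfile module, so the base path resolves to /tmp on Linux, and the "s3_specs_big_objects" subfolder name is illustrative):

import os
import tempfile

# Hypothetical: keep scratch files under the OS temp dir instead of the repo
TMP_DIR = os.path.join(tempfile.gettempdir(), "s3_specs_big_objects")
os.makedirs(TMP_DIR, exist_ok=True)

file_path = os.path.join(TMP_DIR, "big_file_download10mb")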
88 changes: 88 additions & 0 deletions docs/big_objects_test.py
@@ -0,0 +1,88 @@
import pytest
import logging
from utils.utils import create_big_file, convert_unit
Collaborator:

I think convert_unit is not used in this file.
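
If that suggestion is accepted, the import would simply shrink to:

from utils.utils import create_big_file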

from utils.crud import fixture_bucket_with_name, fixture_upload_multipart_file
from boto3.s3.transfer import TransferConfig
import uuid
from tqdm import tqdm
import os


size_list = [
{'size': 10, 'unit': 'mb'},
{'size': 100, 'unit': 'mb'},
{'size': 1, 'unit': 'gb'},
{'size': 5, 'unit': 'gb'},
{'size': 10, 'unit': 'gb'},
]

ids_list = [f"{s['size']}{s['unit']}" for s in size_list]

upload_params = [
{
'file_path': f"./tmp_files/big_file_download{size['size']}{size['unit']}",
'file_size': size,
'object_key': "big-object-" + uuid.uuid4().hex[:6],
}
for size in size_list
]

@pytest.mark.parametrize(
'params, fixture_upload_multipart_file',
[(p, p) for p in upload_params],
Collaborator:

You pass the same parameters both to the fixture (indirect) and to the test that uses the fixture. Couldn't you use just the fixture? Make it return all 3 things (path, size, key) instead of only the size. (A sketch of this appears after the fixture in docs/utils/crud.py below.)

ids=ids_list,
indirect=['fixture_upload_multipart_file']
)

# Tests multipart download; implicitly exercises the upload and deletion of big objects

@pytest.mark.slow
@pytest.mark.big_objects
def test_multipart_download(s3_client, fixture_bucket_with_name, fixture_upload_multipart_file, params):
Collaborator:

Shall we make this test executable via Jupyter notebook? I think you can follow the example of how the other tests are set up, but I can also show you in the meeting.

"""
Test downloading a big object from an S3 bucket using multipart download
:param s3_client: fixture of boto3 s3 client
:param fixture_bucket_with_name: fixture to create a bucket with a unique name
:param params: dict: 'file_path': str, 'file_size': dict, 'object_key': str
Collaborator:

Don't forget to adjust this here if you accept the earlier suggestion.

:return: None
"""

# Unpacking params
file_path = params.get('file_path')
download_path = file_path + "_downloaded"
object_key = params.get('object_key')

bucket_name = fixture_bucket_with_name
total_size = create_big_file(file_path, params.get('file_size'))


# Config for boto3 multithreading of multipart upload/download
config = TransferConfig(
multipart_threshold=40 * 1024 * 1024,
max_concurrency=10,
multipart_chunksize=8 * 1024 * 1024,
use_threads=True
)

# Uploading the big file
uploaded_file_size = fixture_upload_multipart_file


# Test download file from s3 bucket
try:
# Graphing the download progress
with tqdm(total=total_size,
desc=bucket_name,
bar_format="Download| {percentage:.1f}%|{bar:25} | {rate_fmt} | Time: {elapsed} | {desc}",
unit='B',
unit_scale=True, unit_divisor=1024) as pbar:

s3_client.download_file(Bucket=bucket_name, Key=object_key, Filename=download_path, Config=config, Callback=pbar.update)

# Retrieving sizes
downloaded_file_size = os.path.getsize(download_path)

# The test succeeds only if the downloaded size equals the uploaded size
assert downloaded_file_size == uploaded_file_size, f"Downloaded size {downloaded_file_size} doesn't match uploaded size {uploaded_file_size}"
except Exception as e:
logging.error(f"Error uploading object {object_key}: {e}")
raise  # Re-raise so the test fails instead of passing silently
Collaborator:

I think this message should say downloading, right?
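
The one-word fix the reviewer is pointing at would be:

logging.error(f"Error downloading object {object_key}: {e}")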

41 changes: 40 additions & 1 deletion docs/utils/crud.py
@@ -1,8 +1,10 @@
import logging
import pytest
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.utils import generate_valid_bucket_name
from utils.utils import generate_valid_bucket_name, convert_unit
from boto3.s3.transfer import TransferConfig
import os
from tqdm import tqdm

### Functions

@@ -222,6 +224,7 @@ def fixture_bucket_with_name(s3_client, request):
# This fixture automatically creates a bucket named after the test that requested it, then returns its name
# Lastly, it tears down the bucket by deleting it and its objects

# request.node holds the name of the test currently running
bucket_name = generate_valid_bucket_name(request.node.name.replace("_", "-"))
create_bucket(s3_client, bucket_name)

@@ -249,3 +252,39 @@ def fixture_upload_multiple_objects(s3_client, fixture_bucket_with_name, request
objects_names = [{"key": f"multiple-object'-{i}", "path": path} for i in range(qnt)]
return upload_objects_multithreaded(s3_client, fixture_bucket_with_name, objects_names)

@pytest.fixture
def fixture_upload_multipart_file(s3_client, fixture_bucket_with_name, request) -> int:
"""
Uploads a big file into multiple chunks to s3 bucket
:param s3_client: boto3 s3 client
:param fixture_bucket_with_name: pytest.fixture which setup and tears down bucket
:param request: dict: contains file_path, file_size and object_key
:return int: size in bytes of the obejct
"""
bucket_name = fixture_bucket_with_name
file_path = request.param.get('file_path')
file_size = convert_unit(request.param.get('file_size'))
object_key = request.param.get('object_key')

# Config for boto3 multithreading of multipart upload/download
config = TransferConfig(
multipart_threshold=8 * 1024 * 1024, # Minimum size to start multipart upload
max_concurrency=10,
multipart_chunksize=8 * 1024 * 1024,
use_threads=True
)

# Upload Progress Bar with time stamp
with tqdm(total=file_size,
desc=bucket_name,
bar_format="Upload| {percentage:.1f}%|{bar:25}| {rate_fmt} | Time: {elapsed} | {desc}",
unit='B',
unit_scale=True, unit_divisor=1024) as pbar:

s3_client.upload_file(file_path, bucket_name, object_key, Config=config, Callback=pbar.update)

# Checking if the object was uploaded
object_size = s3_client.get_object(Bucket=bucket_name, Key=object_key).get('ContentLength', 0)

return object_size  # int size in bytes
Collaborator:

Here is what I mentioned about returning the 3 values.
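
A minimal sketch of that suggestion (hypothetical; the fixture returns a tuple, the test parametrizes only the fixture, and the duplicated params argument disappears):

@pytest.fixture
def fixture_upload_multipart_file(s3_client, fixture_bucket_with_name, request) -> tuple:
    # ... same upload logic as above ...
    return file_path, object_size, object_key  # path, size in bytes, key

@pytest.mark.parametrize('fixture_upload_multipart_file', upload_params, ids=ids_list, indirect=True)
def test_multipart_download(s3_client, fixture_bucket_with_name, fixture_upload_multipart_file):
    file_path, uploaded_file_size, object_key = fixture_upload_multipart_file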

53 changes: 52 additions & 1 deletion docs/utils/utils.py
@@ -1,4 +1,6 @@
import uuid
import os
import logging

# Function responsible for checking and formatting bucket names into valid ones

@@ -25,4 +27,53 @@ def generate_valid_bucket_name(base_name="my-unique-bucket"):
new_name.append(char)


return "".join(new_name)
return "".join(new_name)



def convert_unit(size = {'size': 100, 'unit': 'mb'}) -> int:
"""
Converts a dict containing an int and a str into an int representing the size in bytes
:param size: dict: {'size': int, 'unit': ('kb', 'mb', 'gb')}
:return: int: value in bytes of size
"""

units_dict = {
'kb': 1024,
'mb': 1024 * 1024,
'gb': 1024 * 1024 * 1024,
}

unit = size['unit'].lower()

# Check if it is a valid unit to be converted
if unit not in units_dict:
raise ValueError(f"Invalid unit: {size['unit']}")

return size['size'] * units_dict.get(unit)
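
As a quick sanity check, the multipliers above imply, for example:

assert convert_unit({'size': 10, 'unit': 'mb'}) == 10 * 1024 ** 2
assert convert_unit({'size': 5, 'unit': 'gb'}) == 5 * 1024 ** 3  # 5368709120 bytes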



def create_big_file(file_path: str, size={'size': 100, 'unit': 'mb'}) -> int:
"""
Create a big file of the specified size, if it does not already exist.

:param file_path: str: path where the file will be created
:param size: dict: containing an int 'size' and a str 'unit'
:return: int: size of the created file in bytes
"""

total_size = convert_unit(size)

if not os.path.exists('./tmp_files'):
os.mkdir('./tmp_files')


if not os.path.exists(file_path):
    # Write in chunks so multi-GB files do not have to fit in memory at once
    chunk_size = 64 * 1024 * 1024
    with open(file_path, 'wb') as f:
        remaining = total_size
        while remaining > 0:
            n = min(chunk_size, remaining)
            f.write(os.urandom(n))
            remaining -= n

return total_size


2 changes: 2 additions & 0 deletions pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
"pytest-rerunfailures<16.0,>=15.0",
"pytest-xdist<4.0.0,>=3.6.1",
"pytest-repeat<1.0.0,>=0.9.3",
"tqdm>=4.67.1",
]
name = "s3-specs"
version = "0.1.0"
@@ -50,6 +51,7 @@ markers = [
"bucket_versioning: Bucket Versioning",
"cli: Tests using CLI",
"multiple_objects: Tests involving operations with multiple objects on the same bucket",
"big_objects: Tests with files bigger than 5 mb",
"rapid: quick expected execution magnitude",
"regular: regular time expected execution magnitude",
"slow: slow expected execution magnitude",
16 changes: 15 additions & 1 deletion uv.lock

(Generated file; diff not rendered.)