
Commit ae3f5cc

increase file limit to 50gb (#182)
1 parent 6dff8a3 commit ae3f5cc

5 files changed, +136 -36 lines changed

src/together/lib/constants.py

Lines changed: 2 additions & 2 deletions
@@ -20,7 +20,7 @@
 
 # Multipart upload constants
 MIN_PART_SIZE_MB = 5  # Minimum part size (S3 requirement)
-TARGET_PART_SIZE_MB = 100  # Target part size for optimal performance
+TARGET_PART_SIZE_MB = 250  # Target part size for optimal performance
 MAX_MULTIPART_PARTS = 250  # Maximum parts per upload (S3 limit)
 MULTIPART_UPLOAD_TIMEOUT = 300  # Timeout in seconds for uploading each part
 MULTIPART_THRESHOLD_GB = 5.0  # threshold for switching to multipart upload
@@ -32,7 +32,7 @@
 NUM_BYTES_IN_GB = 2**30
 
 # maximum number of GB sized files we support finetuning for
-MAX_FILE_SIZE_GB = 25.0
+MAX_FILE_SIZE_GB = 50.1
 
 # expected columns for Parquet files
 PARQUET_EXPECTED_COLUMNS = ["input_ids", "attention_mask", "labels"]
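For a rough sense of why the target part size grows along with the file cap, here is illustrative arithmetic only, assuming the number of parts is simply the file size divided by TARGET_PART_SIZE_MB (the SDK's actual sizing lives in _calculate_parts and may adjust the part size):

# Sanity-check arithmetic for the new limits; not the SDK's sizing code.
import math

NUM_BYTES_IN_GB = 2**30
TARGET_PART_SIZE_MB = 250
MAX_MULTIPART_PARTS = 250
MAX_FILE_SIZE_GB = 50.1

file_size = int(MAX_FILE_SIZE_GB * NUM_BYTES_IN_GB)
part_size = TARGET_PART_SIZE_MB * 1024 * 1024
num_parts = math.ceil(file_size / part_size)

print(num_parts)                         # ~206 parts at the new 250 MB target
print(num_parts <= MAX_MULTIPART_PARTS)  # True: stays under the 250-part S3 cap
# At the old 100 MB target, a 50.1 GB file would need ~514 parts and exceed the cap.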

src/together/lib/resources/files.py

Lines changed: 73 additions & 29 deletions
@@ -11,7 +11,7 @@
 from typing import IO, Any, Dict, List, Tuple, cast
 from pathlib import Path
 from functools import partial
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import Future, ThreadPoolExecutor, as_completed
 
 import httpx
 from tqdm import tqdm
@@ -463,6 +463,22 @@ def _initiate_upload(
             body=response.text,
         )
 
+    def _submit_part(
+        self,
+        executor: ThreadPoolExecutor,
+        file_handle: IO[bytes],
+        part_info: Dict[str, Any],
+        part_size: int,
+    ) -> Tuple[Future[str], int]:
+        """Submit a single part for upload and return its future and part number."""
+
+        part_number = part_info.get("PartNumber", part_info.get("part_number", 1))
+        file_handle.seek((part_number - 1) * part_size)
+        part_data = file_handle.read(part_size)
+
+        future = executor.submit(self._upload_single_part, part_info, part_data)
+        return future, part_number
+
     def _upload_parts_concurrent(self, file: Path, upload_info: Dict[str, Any], part_size: int) -> List[Dict[str, Any]]:
         """Upload file parts concurrently with progress tracking"""
@@ -471,25 +487,32 @@ def _upload_parts_concurrent(self, file: Path, upload_info: Dict[str, Any], part_size: int) -> List[Dict[str, Any]]:
 
         with ThreadPoolExecutor(max_workers=self.max_concurrent_parts) as executor:
             with tqdm(total=len(parts), desc="Uploading parts", unit="part", disable=bool(DISABLE_TQDM)) as pbar:
-                future_to_part: Dict[Any, int] = {}
-
                 with open(file, "rb") as f:
-                    for part_info in parts:
-                        part_number = part_info.get("PartNumber", part_info.get("part_number", 1))
-                        f.seek((part_number - 1) * part_size)
-                        part_data = f.read(part_size)
+                    future_to_part: Dict[Future[str], int] = {}
+                    part_index = 0
 
-                        future = executor.submit(self._upload_single_part, part_info, part_data)
+                    while part_index < len(parts) and len(future_to_part) < self.max_concurrent_parts:
+                        part_info = parts[part_index]
+                        future, part_number = self._submit_part(executor, f, part_info, part_size)
                         future_to_part[future] = part_number
+                        part_index += 1
+
+                    while future_to_part:
+                        done_future = next(as_completed(future_to_part))
+                        part_number = future_to_part.pop(done_future)
 
-                    for future in as_completed(future_to_part):
-                        part_number = future_to_part[future]
-                        try:
-                            etag = future.result()
-                            completed_parts.append({"part_number": part_number, "etag": etag})
-                            pbar.update(1)
-                        except Exception as e:
-                            raise Exception(f"Failed to upload part {part_number}: {e}") from e
+                        try:
+                            etag = done_future.result()
+                            completed_parts.append({"part_number": part_number, "etag": etag})
+                            pbar.update(1)
+                        except Exception as e:
+                            raise Exception(f"Failed to upload part {part_number}: {e}") from e
+
+                        if part_index < len(parts):
+                            part_info = parts[part_index]
+                            future, next_part_number = self._submit_part(executor, f, part_info, part_size)
+                            future_to_part[future] = next_part_number
+                            part_index += 1
 
         completed_parts.sort(key=lambda x: x["part_number"])
        return completed_parts
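The refactored sync loop keeps at most max_concurrent_parts futures in flight and only reads a part from disk when it is submitted, so memory stays bounded for multi-gigabyte files instead of buffering every part up front. A minimal standalone sketch of the same bounded-window idea, where upload_part and MAX_IN_FLIGHT are hypothetical stand-ins rather than the SDK's API:

# Sketch: submit a window of tasks, then refill the window as each one finishes.
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
from typing import Dict, List

MAX_IN_FLIGHT = 4  # hypothetical cap, analogous to max_concurrent_parts

def upload_part(part: int) -> str:
    """Stand-in for the real per-part upload; returns a fake etag."""
    return f"etag-{part}"

def upload_all(parts: List[int]) -> List[str]:
    etags: List[str] = []
    with ThreadPoolExecutor(max_workers=MAX_IN_FLIGHT) as executor:
        in_flight: Dict[Future[str], int] = {}
        index = 0

        # Prime the window.
        while index < len(parts) and len(in_flight) < MAX_IN_FLIGHT:
            in_flight[executor.submit(upload_part, parts[index])] = parts[index]
            index += 1

        # Drain one finished future at a time, refilling the freed slot.
        while in_flight:
            done = next(as_completed(in_flight))
            in_flight.pop(done)
            etags.append(done.result())
            if index < len(parts):
                in_flight[executor.submit(upload_part, parts[index])] = parts[index]
                index += 1
    return etags

print(upload_all(list(range(10))))  # ten "etags", never more than 4 uploads at once

Each completed future frees a slot that is refilled immediately, so throughput stays close to the window size while no more than MAX_IN_FLIGHT parts are ever held in memory.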
@@ -834,25 +857,46 @@ async def _upload_parts_concurrent(
 
         with ThreadPoolExecutor(max_workers=self.max_concurrent_parts) as executor:
             with tqdm(total=len(parts), desc="Uploading parts", unit="part", disable=bool(DISABLE_TQDM)) as pbar:
-                # Submit all upload tasks
-                futures: List[Tuple[Any, int]] = []
                 with open(file, "rb") as f:
-                    for part_info in parts:
+                    future_to_part: Dict[asyncio.Future[str], int] = {}
+                    part_index = 0
+
+                    while part_index < len(parts) and len(future_to_part) < self.max_concurrent_parts:
+                        part_info = parts[part_index]
                         part_number = part_info.get("PartNumber", part_info.get("part_number", 1))
                         f.seek((part_number - 1) * part_size)
                         part_data = f.read(part_size)
 
                         future = loop.run_in_executor(executor, self._upload_single_part_sync, part_info, part_data)
-                        futures.append((future, part_number))
-
-                # Collect results
-                for future, part_number in futures:
-                    try:
-                        etag = await future
-                        completed_parts.append({"part_number": part_number, "etag": etag})
-                        pbar.update(1)
-                    except Exception as e:
-                        raise Exception(f"Failed to upload part {part_number}: {e}") from e
+                        future_to_part[future] = part_number
+                        part_index += 1
+
+                    while future_to_part:
+                        done, _ = await asyncio.wait(
+                            tuple(future_to_part.keys()),
+                            return_when=asyncio.FIRST_COMPLETED,
+                        )
+
+                        for done_future in done:
+                            part_number = future_to_part.pop(done_future)
+
+                            try:
+                                etag = await done_future
+                                completed_parts.append({"part_number": part_number, "etag": etag})
+                                pbar.update(1)
+                            except Exception as e:
+                                raise Exception(f"Failed to upload part {part_number}: {e}") from e
+
+                            if part_index < len(parts):
+                                part_info = parts[part_index]
+                                next_part_number = part_info.get("PartNumber", part_info.get("part_number", 1))
+                                f.seek((next_part_number - 1) * part_size)
+                                part_data = f.read(part_size)
+                                future = loop.run_in_executor(
+                                    executor, self._upload_single_part_sync, part_info, part_data
+                                )
+                                future_to_part[future] = next_part_number
+                                part_index += 1
 
         completed_parts.sort(key=lambda x: x["part_number"])
         return completed_parts
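The async variant gets the same windowing behavior from asyncio.wait(..., return_when=asyncio.FIRST_COMPLETED) over executor-backed futures: await whichever upload finishes first, then top the window back up. A small self-contained sketch of that pattern, where work and MAX_IN_FLIGHT are hypothetical stand-ins rather than the SDK's uploader:

# Sketch: asyncio.wait-based window over run_in_executor futures, refilled as tasks finish.
import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List

MAX_IN_FLIGHT = 4  # hypothetical cap

def work(item: int) -> str:
    """Stand-in for the blocking per-part upload run in the executor."""
    return f"etag-{item}"

async def run_all(items: List[int]) -> List[str]:
    loop = asyncio.get_running_loop()
    results: List[str] = []
    with ThreadPoolExecutor(max_workers=MAX_IN_FLIGHT) as executor:
        in_flight: Dict[asyncio.Future[str], int] = {}
        index = 0

        # Prime the window with at most MAX_IN_FLIGHT tasks.
        while index < len(items) and len(in_flight) < MAX_IN_FLIGHT:
            in_flight[loop.run_in_executor(executor, work, items[index])] = items[index]
            index += 1

        # Wait for any task to finish, collect it, and submit the next item.
        while in_flight:
            done, _ = await asyncio.wait(tuple(in_flight), return_when=asyncio.FIRST_COMPLETED)
            for fut in done:
                in_flight.pop(fut)
                results.append(await fut)
                if index < len(items):
                    in_flight[loop.run_in_executor(executor, work, items[index])] = items[index]
                    index += 1
    return results

print(asyncio.run(run_all(list(range(10)))))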

src/together/lib/utils/files.py

Lines changed: 14 additions & 4 deletions
@@ -7,9 +7,12 @@
 from pathlib import Path
 from traceback import format_exc
 
+from tqdm import tqdm
+
 from together.types import FilePurpose
 from together.lib.constants import (
     MIN_SAMPLES,
+    DISABLE_TQDM,
     NUM_BYTES_IN_GB,
     MAX_FILE_SIZE_GB,
     PARQUET_EXPECTED_COLUMNS,
@@ -356,8 +359,10 @@ def _check_utf8(file: Path) -> Dict[str, Any]:
     """
     report_dict: Dict[str, Any] = {}
     try:
+        # Dry-run UTF-8 decode by iterating through the file to avoid loading it entirely into memory
         with file.open(encoding="utf-8") as f:
-            f.read()
+            for _ in f:
+                pass
         report_dict["utf8"] = True
     except UnicodeDecodeError as e:
         report_dict["utf8"] = False
@@ -453,7 +458,12 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
     with file.open() as f:
         idx = -1
         try:
-            for idx, line in enumerate(f):
+            for idx, line in tqdm(
+                enumerate(f),
+                desc="Validating file",
+                unit=" lines",
+                disable=bool(DISABLE_TQDM),
+            ):
                 json_line = json.loads(line)
 
                 if not isinstance(json_line, dict):
@@ -473,7 +483,7 @@
                 if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]):
                     if current_format is None:
                         current_format = possible_format
-                    elif current_format != possible_format:  # type: ignore[unreachable]
+                    elif current_format != possible_format:  # type: ignore[unreachable]
                         raise InvalidFileFormatError(
                             message="Found multiple dataset formats in the input file. "
                             f"Got {current_format} and {possible_format} on line {idx + 1}.",
@@ -522,7 +532,7 @@
 
         if dataset_format is None:
             dataset_format = current_format
-        elif current_format != dataset_format:  # type: ignore[unreachable]
+        elif current_format != dataset_format:  # type: ignore[unreachable]
             raise InvalidFileFormatError(
                 message="All samples in the dataset must have the same dataset format. "
                 f"Got {dataset_format} for the first line and {current_format} "
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+import math
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from together.lib.constants import NUM_BYTES_IN_GB, MAX_FILE_SIZE_GB, TARGET_PART_SIZE_MB
+from together.lib.types.error import FileTypeError
+from together.lib.resources.files import MultipartUploadManager, _calculate_parts
+
+
+def test_calculate_parts_medium_file():
+    """Ensure 500MB files are split into two 250MB parts."""
+
+    file_size = 500 * 1024 * 1024  # 500MB
+    part_size, num_parts = _calculate_parts(file_size)
+
+    expected_part_size = TARGET_PART_SIZE_MB * 1024 * 1024
+
+    assert num_parts == 2
+    assert part_size == expected_part_size
+
+
+def test_calculate_parts_large_file():
+    """Ensure 50GB files respect the 205-part cap with ~250MB chunks."""
+
+    file_size = 50 * 1024 * 1024 * 1024  # 50GB
+    part_size, num_parts = _calculate_parts(file_size)
+
+    expected_parts = math.ceil(file_size / (TARGET_PART_SIZE_MB * 1024 * 1024))  # 50GB / 250MB ~= 205
+
+    assert num_parts == expected_parts
+    assert part_size >= TARGET_PART_SIZE_MB * 1024 * 1024 - (1 * 1024 * 1024)
+
+
+@patch("together.lib.resources.files.os.stat")
+def test_file_size_exceeds_limit_raises_error(mock_stat: MagicMock):
+    """Uploading a file above 50.1GB should raise FileTypeError."""
+
+    mock_stat.return_value.st_size = int((MAX_FILE_SIZE_GB + 1) * NUM_BYTES_IN_GB)
+    manager = MultipartUploadManager(MagicMock())
+
+    with pytest.raises(FileTypeError) as exc_info:
+        manager.upload("/files", Path("too_large.jsonl"), "fine-tune")
+
+    assert "exceeds maximum supported size" in str(exc_info.value)

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.
