From f2905dc5abaca3b7c7296965b569cdea8b49a61e Mon Sep 17 00:00:00 2001 From: makseq Date: Sat, 23 Aug 2025 01:07:38 +0300 Subject: [PATCH 01/26] feat: BROS-353: Databricks storage integration --- .cursor/rules/storage-provider.mdc | 247 ++++++++++++------------- label_studio/io_storages/README.md | 110 ++++++++--- label_studio/io_storages/gcs/README.md | 21 +++ 3 files changed, 220 insertions(+), 158 deletions(-) create mode 100644 label_studio/io_storages/gcs/README.md diff --git a/.cursor/rules/storage-provider.mdc b/.cursor/rules/storage-provider.mdc index c49f8df39da2..5b569569b04a 100644 --- a/.cursor/rules/storage-provider.mdc +++ b/.cursor/rules/storage-provider.mdc @@ -2,87 +2,134 @@ description: How to add a new storage or data connector for Label Studio alwaysApply: false --- -# Cursor Rule: Implementing New Storage Providers in Label Studio +# Implementing New Storage Providers in Label Studio ## Overview -This rule describes the process and best practices for adding a new storage provider to Label Studio using the declarative provider schema system. - -See comprehensive overview about storages @io_storages/README.md. - -## Architecture Overview +This document describes the process and best practices for adding a new storage provider to Label Studio using the declarative provider schema system. Label Studio supports 2 types of cloud storages: 1. **Import Storages** (Source Cloud Storages) - for importing tasks/data 2. **Export Storages** (Target Cloud Storages) - for exporting annotations -### Key Differences Between Storage Types - -| Aspect | Import Storage | Export Storage | -|--------|----------------|----------------| -| **Purpose** | Import tasks FROM cloud storage | Export annotations TO cloud storage | -| **Triggering** | Manual sync via API/UI | Automatic via Django signals | -| **Data Flow** | Storage → Label Studio | Label Studio → Storage | -| **Validation** | Must check prefix exists | No prefix check (auto-created) | -| **Primary Methods** | `iter_objects()`, `get_data()` | `save_annotation()`, `save_annotations()` | -| **Threading** | Single-threaded iteration | Multi-threaded export (max_workers) | - -Each storage type follows this inheritance hierarchy: -```mermaid -graph TD - Storage-->ImportStorage - Storage-->ExportStorage - - ProjectStorageMixin-->NewImportStorage - ImportStorage-->NewImportStorageBase - - NewStorageMixin-->NewExportStorage - ExportStorage-->NewExportStorage +See comprehensive overview about storages @io_storages/README.md. - NewImportStorageBase-->NewImportStorage - - subgraph New Provider - NewImportStorage - NewImportStorageBase - NewExportStorage - NewStorageMixin[NewProviderStorageMixin] - end -``` -## Key Export Storage Insights +## Implementation Checklist + +Follow all steps below to implement a new storage. More details follow after the checklist; review them all. Do it on your best, until all items are done and tests are passing. + +### 1. Exploration and preparation +1. [ ] Carefully read @io_storages/README.md +2. [ ] Search official documentation for the new storage you want to add + - [ ] Determine whether pre-signed URLs are supported, or whether only direct reads are possible + - [ ] Determine whether writes are supported, and how annotations will be stored (objects/blobs, files, rows/strings in a table, etc.) + - [ ] Understand the provider's Python API/SDK, especially how to read, write, and list objects +3. If the requester hasn't specified the target edition, recommend Open Source or Enterprise and confirm the choice +4. 
Check storage directory structure in `label_studio/io_storages` (or `label_studio_enterprise/lse_io_storages` for Enterprise) and the `s3` (or `s3s` for Enterprise) subfolder +5. [ ] Create the new provider directory structure based on the pattern you observed +6. [ ] Create a README.md file in the new provider folder +7. [ ] Add a brief Overview section about the new storage and your findings from step 2 +8. [ ] Add a section on how to configure the storage from scratch for users unfamiliar with it. Provide clear, correct, up-to-date steps with links to official docs to reduce manual QA time + +### 2. Backend Implementation +4. [ ] Implement storage mixin with common fields: + - [ ] Basic fields: bucket, prefix, regex_filter, use_blob_urls (pre-signed URLs on/off) + - [ ] URL resolution: presign, presign_ttl (if applicable to the storage) + - [ ] Provider credentials: api_key, secret_key, endpoint_url + - [ ] Common methods: get_client(), validate_connection() +5. [ ] Create import storage base class with required methods: + - [ ] `iter_objects()` - iterate over storage objects + - [ ] `get_data()` - load task data from objects + - [ ] `generate_http_url()` - create HTTP URLs + - [ ] `can_resolve_url()` - check URL resolution capability + - [ ] `validate_connection()` - validate credentials and that the prefix contains files +6. [ ] Create export storage class with required methods: + - [ ] `save_annotation()` - save single annotation to storage + - [ ] `delete_annotation()` - delete annotation from storage (optional) + - [ ] `validate_connection()` - validate credentials and bucket access (NO prefix check) +7. [ ] Create non-abstract provider-specific concrete classes for import and export +8. [ ] Implement storage link models: + - [ ] ImportStorageLink for tracking task imports + - [ ] ExportStorageLink for tracking annotation exports +9. [ ] **CRITICAL: Add `app_label = 'io_storages'` to Meta classes** - All concrete storage models (ImportStorage, ExportStorage, and StorageLink classes) must include `app_label = 'io_storages'` in their Meta class to avoid Django app registration errors. This is required because storage providers are in subdirectories of `io_storages` but need to be registered under the main `io_storages` app. **Note**: Enterprise providers do NOT need app_label - see enterprise guide. +10. [ ] Create serializers with validation logic +11. [ ] Implement API views following existing patterns +12. [ ] Register URLs in storage URL configuration +13. [ ] Add signal handlers for auto-export functionality: + - [ ] post_save signal for automatic annotation export + - [ ] pre_delete signal for automatic annotation deletion + - [ ] Async export functions with error handling +14. [ ] Create database migrations +15. [ ] Add basic pytests for newly added API calls + +### 3. Frontend Implementation +1. [ ] Check examples: for Open Source see: `label-studio/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/`, for Enterprise see: `label-studio-enterprise/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/` +2. [ ] Create a provider configuration file in `web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/` with: + - [ ] All required fields with proper types + - [ ] Zod validation schemas + - [ ] Meaningful labels and placeholders + - [ ] Proper field layout definition +3. [ ] Register provider in central registry +4. [ ] Mark credential fields with `accessKey: true` +5. [ ] Test form rendering and validation +6. 
[ ] Verify edit mode behavior for credentials + +### 4. Testing +- [ ] Write backend unit tests (see @backend-unit-tests.mdc for details) +- [ ] Test connection validation (validate_connection) +- [ ] Test object iteration and filtering (iter_objects) +- [ ] Test task data loading (get_data) +- [ ] Test frontend form functionality + +### 5. Documentation +- [ ] Add provider to storage documentation (docs/source/guide/storage.md) +- [ ] Update API documentation using `@method_decorator` for storage API classes (see @updating-label-studio-sdk.mdc) + +### 6. Integration & Deployment +These steps are for manual QA by the requester; remind them after you finish your work: +- [ ] Test end-to-end storage workflow + - [ ] Create a project, add a new import storage, sync it, and check Data Manager for new files + - [ ] Create a project, add a new export storage, create a few annotations, sync the storage, and check that files appear in the storage admin console +- [ ] Test URL resolution: verify that storage URIs like `s3://xxx/1.jpg` are resolved and load in the editor +- [ ] Test with both presigned URLs and proxy mode +- [ ] Test storage error and status reporting: if there are any errors, a user should be able to click "Failed - View logs" and see an error description + -Based on the implementation patterns in the codebase, here are the critical aspects specific to export storages (target storages): +## Decision: Open Source vs Enterprise -### 1. **Automatic vs Manual Operation** -- **Import storages**: Require manual sync via API calls -- **Export storages**: Automatically triggered by Django signals when annotations are created/updated +**CRITICAL FIRST DECISION**: Where should your new storage provider be implemented? -### 2. **Connection Validation Differences** -- **Import storages**: Must validate that prefix contains files during `validate_connection()` -- **Export storages**: Only validate bucket access, NOT prefix (prefixes are created automatically) +### Add to Open Source (`io_storages`) if: +- Basic authentication (API keys, service accounts) +- Standard file formats (JSON, JSONL, images) +- Community-focused features +- Simple cloud storage connectivity +- User request: the requester explicitly asks for Open Source -### 3. **Data Serialization** -Export storages use `_get_serialized_data()` which returns different formats based on feature flags: -- **Default**: Only annotation data (backward compatibility) -- **With `fflag_feat_optic_650_target_storage_task_format_long`**: Full task + annotations data +### Add to Enterprise (`lse_io_storages`) if you need: +- **Advanced Authentication**: IAM roles, Workload Identity Federation, cross-account access +- **Enterprise Security**: Server-side encryption, ACL controls, audit logging +- **Advanced Data Formats**: Parquet support, complex data transformations +- **Billing Restrictions**: Storage limits, organizational constraints +- **Advanced Review Workflows**: Review-based export triggers +- **User request**: The requester explicitly asks for Enterprise -### 4. **Built-in Threading** -- Export storages inherit `save_annotations()` with built-in parallel processing -- Uses ThreadPoolExecutor with configurable `max_workers` (default: min(8, cpu_count * 4)) -- Includes progress tracking and automatic batch processing +### Key Enterprise Advantages -### 5. 
**Storage Links & Key Generation** -- **Import links**: Track task imports with custom keys -- **Export links**: Track annotation exports with keys based on feature flags: - - Default: `annotation.id` - - With feature flag: `task.id` + optional `.json` extension +1. **No App Label Issues**: LSE uses proper app configuration, avoiding Django registration conflicts +2. **Advanced Authentication**: Support for IAM roles, WIF, cross-account access +3. **Enhanced Security**: Built-in encryption, ACL controls, audit capabilities +4. **Enterprise Features**: Parquet support, billing controls, review workflows +5. **Better Error Handling**: Enhanced logging, metrics, and monitoring +6. **Scalability**: Client caching, optimized performance patterns + +**Default Recommendation**: Most new storage providers should be added to Enterprise for better security, features, and future extensibility. -### 6. **Optional Deletion Support** -- Export storages can implement `delete_annotation()` -- Controlled by `can_delete_objects` field -- Automatically called when annotations are deleted from Label Studio ## Backend Implementation +Important note: if you implement a storage for Label Studio Enterprise, replace all paths `label_studio/io_storages` with `label_studio_enterprise/lse_io_storages`. + ### 1. Create Storage Models #### File Structure @@ -136,7 +183,7 @@ class YourProviderImportStorage(ProjectStorageMixin, YourProviderImportStorageBa #### Export Storage Class -**Reference Implementation**: Follow `io_storages/s3/models.py` `S3ExportStorage` class +**Reference Implementation**: Follow `io_storages/s3/models.py` `S3ExportStorage` class as an example **Required Class**: ```python @@ -230,8 +277,7 @@ Create API views in `label_studio/io_storages/yourprovider/api.py`: **Key Features**: 1. **OpenAPI Documentation**: Use `@method_decorator` with `extend_schema` for each endpoint -2. **Proper Tags**: Use `['Storage: YourProvider']` and `['Export Storage: YourProvider']` -3. **Queryset & Serializer**: Set `queryset` and `serializer_class` for each view +2. **Queryset & Serializer**: Set `queryset` and `serializer_class` for each view ### 4. Register URLs @@ -254,9 +300,9 @@ path('api/storages/yourprovider/', include(('io_storages.yourprovider.urls', 'io ### Create Provider Configuration -**Reference Implementation**: Follow `web/lib/app-common/src/blocks/StorageProviderForm/providers/s3.ts` +**Reference Implementation**: Follow `web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/s3.ts` -**Create**: `web/lib/app-common/src/blocks/StorageProviderForm/providers/yourprovider.ts` +**Create**: `web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/yourprovider.ts` **Required Structure**: ```ts @@ -285,7 +331,7 @@ const yourProviderProvider: ProviderConfig = { ## Testing -**Reference Implementation**: Follow `label_studio/io_storages/tests/test_s3.py` patterns +**Reference Implementation**: Follow tests in `label_studio/io_storages/tests/` and `label_studio_enterprise/lse_io_storages/tests/`. Useful examples include `test_import_storage_list_files_api.py`, `test_proxy_api.py`, and `test_get_bytes_stream.py`. 
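+
+As a rough starting point, a minimal connection-validation test could look like the sketch below. The `yourprovider` module path, the model name, the mocked `list_objects` client call, and the `project` fixture are all illustrative placeholders, not an existing API:
+
+```python
+from unittest import mock
+
+import pytest
+
+from io_storages.yourprovider.models import YourProviderImportStorage  # hypothetical provider module
+
+
+@pytest.mark.django_db
+def test_validate_connection_requires_files_in_prefix(project):  # `project` is an assumed fixture
+    storage = YourProviderImportStorage(project=project, bucket='my-bucket', prefix='tasks/')
+    # Patch the provider client so the test makes no network calls
+    with mock.patch.object(YourProviderImportStorage, 'get_client') as get_client:
+        get_client.return_value.list_objects.return_value = iter(['tasks/1.json'])
+        storage.validate_connection()  # should pass: the prefix contains at least one file
+
+        get_client.return_value.list_objects.return_value = iter([])
+        with pytest.raises(Exception):  # import storages must fail on an empty prefix
+            storage.validate_connection()
+```
+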
**Create**: `label_studio/io_storages/tests/test_yourprovider.py` @@ -300,72 +346,10 @@ const yourProviderProvider: ProviderConfig = { - `test_data_loading()` - Test task data loading from storage objects - `test_export_functionality()` - Test annotation export and deletion -## Implementation Checklist - -### Backend Implementation -- [ ] Create provider directory structure -- [ ] Implement storage mixin with common fields: - - [ ] Basic fields: bucket, prefix, regex_filter, use_blob_urls - - [ ] URL resolution: presign, presign_ttl - - [ ] Provider credentials: api_key, secret_key, endpoint_url - - [ ] Common methods: get_client(), validate_connection() -- [ ] Create import storage base class with required methods: - - [ ] `iter_objects()` - iterate over storage objects - - [ ] `get_data()` - load task data from objects - - [ ] `generate_http_url()` - create HTTP URLs - - [ ] `can_resolve_url()` - check URL resolution capability - - [ ] `validate_connection()` - validate credentials and prefix has files -- [ ] Create export storage class with required methods: - - [ ] `save_annotation()` - save single annotation to storage - - [ ] `delete_annotation()` - delete annotation from storage (optional) - - [ ] `validate_connection()` - validate credentials and bucket access (NO prefix check) -- [ ] Create concrete import/export storage classes -- [ ] Implement storage link models: - - [ ] ImportStorageLink for tracking task imports - - [ ] ExportStorageLink for tracking annotation exports -- [ ] **CRITICAL: Add `app_label = 'io_storages'` to Meta classes** - All concrete storage models (ImportStorage, ExportStorage, and StorageLink classes) must include `app_label = 'io_storages'` in their Meta class to avoid Django app registration errors. This is required because storage providers are in subdirectories of `io_storages` but need to be registered under the main `io_storages` app. **Note**: Enterprise providers do NOT need app_label - see enterprise guide. 
-- [ ] Create serializers with validation logic -- [ ] Implement API views following existing patterns -- [ ] Register URLs in storage URL configuration -- [ ] Add signal handlers for auto-export functionality: - - [ ] post_save signal for automatic annotation export - - [ ] pre_delete signal for automatic annotation deletion - - [ ] Async export functions with error handling -- [ ] Create database migrations -- [ ] Add basic pytests for newly added API calls - -### Frontend Implementation -- [ ] Create provider configuration file with: - - [ ] All required fields with proper types - - [ ] Zod validation schemas - - [ ] Meaningful labels and placeholders - - [ ] Proper field layout definition -- [ ] Register provider in central registry -- [ ] Mark credential fields with `accessKey: true` -- [ ] Test form rendering and validation -- [ ] Verify edit mode behavior for credentials - -### Testing & Documentation -- [ ] Write backend unit tests -- [ ] Test connection validation -- [ ] Test object iteration and filtering -- [ ] Test task data loading -- [ ] Test frontend form functionality -- [ ] Test both create and edit modes -- [ ] Update API documentation -- [ ] Add provider to storage documentation (docs/source/guide/storage.md) - -### Integration & Deployment -- [ ] Test end-to-end storage workflow -- [ ] Verify task import/export functionality -- [ ] Test URL resolution and proxy functionality -- [ ] Test with both presigned URLs and proxy mode -- [ ] Verify error handling and user feedback -- [ ] Test storage sync and status reporting - ## Common Issues & Solutions ### Django App Label Error + **Error**: `RuntimeError: Model class doesn't declare an explicit app_label and isn't in an application in INSTALLED_APPS` **Cause**: Storage provider models in subdirectories (e.g., `io_storages.databricks`) are not automatically recognized as belonging to the `io_storages` app. @@ -394,6 +378,7 @@ class YourProviderImportStorageLink(ImportStorageLink): **Note**: This is required for ALL concrete models (not abstract ones) including storage classes and link models. ### Django Model Conflict Error + **Error**: `RuntimeError: Conflicting 'providernameimportstorage' models in application 'io_storages'` **Cause**: Django is registering the same model through two different import paths: diff --git a/label_studio/io_storages/README.md b/label_studio/io_storages/README.md index d234ca216ab4..017673a2791a 100644 --- a/label_studio/io_storages/README.md +++ b/label_studio/io_storages/README.md @@ -1,21 +1,21 @@ # Cloud Storages -There are 3 basic types of cloud storages: +Cloud storage is used for importing tasks and exporting annotations in Label Studio. There are 2 basic types of cloud storages: 1. Import Storages (aka Source Cloud Storages) 2. Export Storages (aka Target Cloud Storages) -3. Dataset Storages (available in enterprise) Also Label Studio has Persistent storages where LS storage export files, user avatars and UI uploads. Do not confuse `Cloud Storages` and `Persistent Storage`, they have completely different codebase and tasks. Cloud Storages are implemented in `io_storages`, Persistent Storage uses django-storages and it is installed in Django settings environment variables (see `base.py`). +Note: Dataset Storages were implemented in the enterprise codebase only. They are **deprecated and not used**. +## Basic hierarchy +This section uses GCS storage as an example, and the same logic can be applied to other storages. 
-## Basic hierarchy - -### Import and Dataset Storages +### Import Storages -This diagram is based on Google Cloud Storage (GCS) and other storages are implemented the same way. +This storage type is designed for importing tasks FROM cloud storage to Label Studio. This diagram is based on Google Cloud Storage (GCS), and other storages are implemented in the same way: ```mermaid graph TD; @@ -28,7 +28,7 @@ This diagram is based on Google Cloud Storage (GCS) and other storages are imple GCSImportStorageBase-->GCSImportStorage; GCSImportStorageBase-->GCSDatasetStorage; - DatasetStorageMixin-->GCSDatasetStorage; + GCSImportStorageLink-->ImportStorageLink subgraph Google Cloud Storage GCSImportStorage; @@ -37,7 +37,52 @@ This diagram is based on Google Cloud Storage (GCS) and other storages are imple end ``` +- **Storage** (`label_studio/io_storages/base_models.py`): Abstract base for all storages. Inherits status/progress from `StorageInfo`. Defines `validate_connection()` contract and common metadata fields. + +- **ImportStorage** (`label_studio/io_storages/base_models.py`): Abstract base for source storages. Defines core contracts used by sync and proxy: + - `iter_objects()`, `iter_keys()` to enumerate objects + - `get_unified_metadata(obj)` to normalize provider metadata + - `get_data(key)` to produce `StorageObject`(s) for task creation + - `generate_http_url(url)` to resolve provider URL -> HTTP URL (presigned or direct) + - `resolve_uri(...)` and `can_resolve_url(...)` used by the Storage Proxy + - `scan_and_create_links()` to create `ImportStorageLink`s for tasks + +- **ImportStorageLink** (`label_studio/io_storages/base_models.py`): Link model created per-task for imported objects. Fields: `task` (1:1), `key` (external key), `row_group`/`row_index` (parquet/JSONL indices), `object_exists`, timestamps. Helpers: `n_tasks_linked(key, storage)` and `create(task, key, storage, row_index=None, row_group=None)`. + +- **ProjectStorageMixin** (`label_studio/io_storages/base_models.py`): Adds `project` FK and permission checks. Used by project-scoped storages (e.g., `GCSImportStorage`). + +- **GCSImportStorageBase** (`label_studio/io_storages/gcs/models.py`): GCS-specific import base. Sets `url_scheme='gs'`, implements listing (`iter_objects/iter_keys`), data loading (`get_data`), URL generation (`generate_http_url`), URL resolution checks, and metadata helpers. Reused by both project imports and enterprise datasets. + +- **GCSImportStorage** (`label_studio/io_storages/gcs/models.py`): Concrete project-scoped GCS import storage combining `ProjectStorageMixin` + `GCSImportStorageBase`. +- **GCSImportStorageLink** (`label_studio/io_storages/gcs/models.py`): Provider-specific `ImportStorageLink` with `storage` FK to `GCSImportStorage`. Created during sync to associate a task with the original GCS object key. + +### Export Storages + +This storage type is designed for exporting tasks or annotations FROM Label Studio to cloud storage. + +```mermaid + graph TD; + + Storage-->ExportStorage; + + ProjectStorageMixin-->ExportStorage; + ExportStorage-->GCSExportStorage; + GCSStorageMixin-->GCSExportStorage; + + ExportStorageLink-->GCSExportStorageLink; +``` + +- **ExportStorage** (`label_studio/io_storages/base_models.py`): Abstract base for target storages. Project-scoped; orchestrates export jobs and progress. 
Key methods: 
+  - `save_annotation(annotation)` provider-specific write
+  - `save_annotations(queryset)`, `save_all_annotations()`, `save_only_new_annotations()` helpers
+  - `sync(save_only_new_annotations=False)` background export via RQ
+
+- **GCSExportStorage** (`label_studio/io_storages/gcs/models.py`): Concrete target storage for GCS. Serializes data via `_get_serialized_data(...)`, computes key via `GCSExportStorageLink.get_key(...)`, uploads to GCS; can auto-export on annotation save when configured.
+
+- **ExportStorageLink** (`label_studio/io_storages/base_models.py`): Base link model connecting exported objects to `Annotation`s. Provides `get_key(annotation)` logic (task-based or annotation-based via FF) and `create(...)` helper.
+
+- **GCSExportStorageLink** (`label_studio/io_storages/gcs/models.py`): Provider-specific link model holding FK to `GCSExportStorage`.

## How validate_connection() works

@@ -50,32 +95,43 @@ Run this command with try/except:

Target storages use the same validate_connection() function, but without any prefix.

-## Google Cloud Storage (GCS)
+## Key Storage Insights
+
+### 1. **Primary Methods**
+- **Import storages**: `iter_objects()`, `get_data()`
+- **Export storages**: `save_annotation()`, `save_annotations()`
+
+### 2. **Automatic vs Manual Operation**
+- **Import storages**: Require manual sync via API calls or UI
+- **Export storages**: Manual sync via API/UI, plus automatic sync via Django signals when annotations are submitted or updated

-### Credentials
+### 3. **Connection Validation Differences**
+- **Import storages**: Must validate that prefix contains files during `validate_connection()`
+- **Export storages**: Only validate bucket access, NOT prefix (prefixes are created automatically)

-There are two methods for setting GCS credentials:
-1. Through the Project => Cloud Storage settings in the Label Studio user interface.
-2. Through Google Application Default Credentials (ADC). This involves the following steps:
+### 4. **Data Serialization**
+Export storages use `_get_serialized_data()` which returns different formats based on feature flags:
+- **Default**: Only annotation data (backward compatibility)
+- **With `fflag_feat_optic_650_target_storage_task_format_long` or `FUTURE_SAVE_TASK_TO_STORAGE`**: Full task + annotations data, instead of one annotation per output file.

-   2.1. Leave the Google Application Credentials field in the Label Studio UI blank.
-   
-   2.2. Set an environment variable which will apply to all Cloud Storages. This can be done using the following command:
-   ```bash
-   export GOOGLE_APPLICATION_CREDENTIALS=google_credentials.json
-   ```
-   2.3. Alternatively, use the following command:
-   ```bash
-   gcloud auth application-default login
-   ```
-   2.4. Another option is to use credentials provided by the Google App Engine or Google Compute Engine metadata server, if the code is running on either GAE or GCE.
+### 5. **Built-in Threading**
+- Export storages inherit `save_annotations()` with built-in parallel processing
+- Uses ThreadPoolExecutor with configurable `max_workers` (default: min(8, cpu_count * 4))
+- Includes progress tracking and automatic batch processing

-Note: If Cloud Storage credentials are set in the Label Studio UI, these will take precedence over other methods.
+### 6. 
**Storage Links & Key Generation** +- **Import links**: Track task imports with custom keys +- **Export links**: Track annotation exports with keys based on feature flags: + - Default: `annotation.id` + - With feature flag: `task.id` + optional `.json` extension - +### 7. **Optional Deletion Support** +- Export storages can implement `delete_annotation()` +- Controlled by `can_delete_objects` field +- Automatically called when annotations are deleted from Label Studio -## Storage statuses and how they are processed +## StorageInfo statuses and how they are processed Storage (Import and Export) have different statuses of synchronization (see `class StorageInfo.Status`): @@ -94,7 +150,7 @@ Storage (Import and Export) have different statuses of synchronization (see `cla InProgress-->Completed; ``` -Additionally, StorageInfo contains counters and debug information that will be displayed in storages: +Additionally, class **StorageInfo** contains counters and debug information that will be displayed in storages: * last_sync - time of the last successful sync * last_sync_count - number of objects that were successfully synced diff --git a/label_studio/io_storages/gcs/README.md b/label_studio/io_storages/gcs/README.md new file mode 100644 index 000000000000..48a947212a22 --- /dev/null +++ b/label_studio/io_storages/gcs/README.md @@ -0,0 +1,21 @@ +# Google Cloud Storage (GCS) + +## Credentials + +There are two methods for setting GCS credentials: +1. Through the Project => Cloud Storage settings in the Label Studio user interface. +2. Through Google Application Default Credentials (ADC). This involves the following steps: + + 2.1. Leave the Google Application Credentials field in the Label Studio UI blank. + + 2.2. Set an environment variable which will apply to all Cloud Storages. This can be done using the following command: + ```bash + export GOOGLE_APPLICATION_CREDENTIALS=google_credentials.json + ``` + 2.3. Alternatively, use the following command: + ```bash + gcloud auth application-default login + ``` + 2.4. Another option is to use credentials provided by the Google App Engine or Google Compute Engine metadata server, if the code is running on either GAE or GCE. + +Note: If Cloud Storage credentials are set in the Label Studio UI, these will take precedence over other methods. 
\ No newline at end of file From f43c2ca8b089d24223289c4370a29d1c0bbd14fe Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sat, 23 Aug 2025 00:47:28 +0000 Subject: [PATCH 02/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17169012251 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 3dea9fa02250..29ab1f60973c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "a80479402d230f5e097a3052f4fe39647e05250a.zip", hash = "sha256:b3e014d60804be801ff4982322e1e784233d75b391f53cfa45e142807c1188ff"}, + {file = "4af783a28749d0668b372f7bb80be627b2d48597.zip", hash = "sha256:1dd09454275d1bfc7c2a852a912a2fc38fab6938b5aa5c0a0e607e2b255e5830"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/a80479402d230f5e097a3052f4fe39647e05250a.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/4af783a28749d0668b372f7bb80be627b2d48597.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5037,4 +5037,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "acbf73d41870732a03b40dd25d5bc7ae486b2af716a9b8deea0c63a41982aa39" +content-hash = "9a85c8c75d3787e65bb0955f56a1b5569a4f089436789d0a7769bf296c6d316a" diff --git a/pyproject.toml b/pyproject.toml index 02ec77401317..b22064cedcec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,7 @@ dependencies = [ "djangorestframework-simplejwt[crypto] (>=5.4.0,<6.0.0)", "tldextract (>=5.1.3)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/a80479402d230f5e097a3052f4fe39647e05250a.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/4af783a28749d0668b372f7bb80be627b2d48597.zip", ## HumanSignal repo dependencies :end ] From 6f9027078c511abbc94323ba2243764c760f2eb3 Mon Sep 17 00:00:00 2001 From: makseq Date: Mon, 8 Sep 2025 20:26:58 +0300 Subject: [PATCH 03/26] Updates in rules --- .cursor/rules/storage-provider.mdc | 35 +++--- label_stream.md | 186 +++++++++++++++++++++++++++++ review_stream.md | 137 +++++++++++++++++++++ 3 files changed, 343 insertions(+), 15 deletions(-) create mode 100644 label_stream.md create mode 100644 review_stream.md diff --git a/.cursor/rules/storage-provider.mdc b/.cursor/rules/storage-provider.mdc index 5b569569b04a..31f97be508ae 100644 --- a/.cursor/rules/storage-provider.mdc +++ b/.cursor/rules/storage-provider.mdc @@ -23,7 +23,7 @@ Follow all steps below to implement a new storage. More details follow after the 2. [ ] Search official documentation for the new storage you want to add - [ ] Determine whether pre-signed URLs are supported, or whether only direct reads are possible - [ ] Determine whether writes are supported, and how annotations will be stored (objects/blobs, files, rows/strings in a table, etc.) - - [ ] Understand the provider's Python API/SDK, especially how to read, write, and list objects + - [ ] Understand the provider's Python API/SDK, especially how to read, write, and list objects. If SDK is available, use SDK 3. If the requester hasn't specified the target edition, recommend Open Source or Enterprise and confirm the choice 4. 
Check storage directory structure in `label_studio/io_storages` (or `label_studio_enterprise/lse_io_storages` for Enterprise) and the `s3` (or `s3s` for Enterprise) subfolder 5. [ ] Create the new provider directory structure based on the pattern you observed @@ -32,35 +32,36 @@ Follow all steps below to implement a new storage. More details follow after the 8. [ ] Add a section on how to configure the storage from scratch for users unfamiliar with it. Provide clear, correct, up-to-date steps with links to official docs to reduce manual QA time ### 2. Backend Implementation -4. [ ] Implement storage mixin with common fields: +1. [ ] Implement storage mixin with common fields: - [ ] Basic fields: bucket, prefix, regex_filter, use_blob_urls (pre-signed URLs on/off) - [ ] URL resolution: presign, presign_ttl (if applicable to the storage) - [ ] Provider credentials: api_key, secret_key, endpoint_url - [ ] Common methods: get_client(), validate_connection() -5. [ ] Create import storage base class with required methods: +2. [ ] Create import storage base class with required methods: - [ ] `iter_objects()` - iterate over storage objects - [ ] `get_data()` - load task data from objects - [ ] `generate_http_url()` - create HTTP URLs - [ ] `can_resolve_url()` - check URL resolution capability - [ ] `validate_connection()` - validate credentials and that the prefix contains files -6. [ ] Create export storage class with required methods: +3. [ ] Create export storage class with required methods: - [ ] `save_annotation()` - save single annotation to storage - [ ] `delete_annotation()` - delete annotation from storage (optional) - [ ] `validate_connection()` - validate credentials and bucket access (NO prefix check) -7. [ ] Create non-abstract provider-specific concrete classes for import and export -8. [ ] Implement storage link models: +4. [ ] Create non-abstract provider-specific concrete classes for import and export +5. [ ] Implement storage link models: - [ ] ImportStorageLink for tracking task imports - [ ] ExportStorageLink for tracking annotation exports -9. [ ] **CRITICAL: Add `app_label = 'io_storages'` to Meta classes** - All concrete storage models (ImportStorage, ExportStorage, and StorageLink classes) must include `app_label = 'io_storages'` in their Meta class to avoid Django app registration errors. This is required because storage providers are in subdirectories of `io_storages` but need to be registered under the main `io_storages` app. **Note**: Enterprise providers do NOT need app_label - see enterprise guide. -10. [ ] Create serializers with validation logic -11. [ ] Implement API views following existing patterns -12. [ ] Register URLs in storage URL configuration -13. [ ] Add signal handlers for auto-export functionality: +6. [ ] **CRITICAL: Add `app_label = 'io_storages'` to Meta classes** - All concrete storage models (ImportStorage, ExportStorage, and StorageLink classes) must include `app_label = 'io_storages'` in their Meta class to avoid Django app registration errors. This is required because storage providers are in subdirectories of `io_storages` but need to be registered under the main `io_storages` app. **Note**: Enterprise providers do NOT need app_label - see enterprise guide. +7. [ ] Create serializers with validation logic +8. [ ] Implement API views following existing patterns +9. [ ] Register URLs in storage URL configuration +10. 
[ ] Add signal handlers for auto-export functionality: - [ ] post_save signal for automatic annotation export - [ ] pre_delete signal for automatic annotation deletion - [ ] Async export functions with error handling -14. [ ] Create database migrations -15. [ ] Add basic pytests for newly added API calls +11. [ ] If you use SDK: add provider SDK library to pyproject.toml + - [ ] Make poetry lock: `poetry install && poetry lock` +12. [ ] Create database migrations using `poetry run python manage.py makemigrations` only! ### 3. Frontend Implementation 1. [ ] Check examples: for Open Source see: `label-studio/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/`, for Enterprise see: `label-studio-enterprise/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/` @@ -75,17 +76,21 @@ Follow all steps below to implement a new storage. More details follow after the 6. [ ] Verify edit mode behavior for credentials ### 4. Testing -- [ ] Write backend unit tests (see @backend-unit-tests.mdc for details) +- [ ] Write backend pytests for newly added API calls (see @backend-unit-tests.mdc for details) - [ ] Test connection validation (validate_connection) - [ ] Test object iteration and filtering (iter_objects) - [ ] Test task data loading (get_data) - [ ] Test frontend form functionality +- [ ] Critical: run all created tests, check how to run them in @backend-unit-tests.mdc ### 5. Documentation - [ ] Add provider to storage documentation (docs/source/guide/storage.md) - [ ] Update API documentation using `@method_decorator` for storage API classes (see @updating-label-studio-sdk.mdc) -### 6. Integration & Deployment +### 6. Git +- [ ] Commit all added and modified files related to the new storage into git + +### 7. Integration & Deployment These steps are for manual QA by the requester; remind them after you finish your work: - [ ] Test end-to-end storage workflow - [ ] Create a project, add a new import storage, sync it, and check Data Manager for new files diff --git a/label_stream.md b/label_stream.md new file mode 100644 index 000000000000..1b26e106cd88 --- /dev/null +++ b/label_stream.md @@ -0,0 +1,186 @@ +# Label Stream: Task Selection, Queues, and Sampling + +## Overview + +This document explains how Label Studio selects the next task for an annotator when they enter the Label Stream ("Label All Tasks"). It consolidates behavior from `label_studio/projects/functions/next_task.py` and relevant project settings described in docs. It covers: + +- Inputs and task/annotation states +- Queueing and prioritization rules +- Sampling strategies and fallbacks +- Skip/postpone flows +- Locking and concurrency +- Enterprise-specific agreement threshold behavior +- Feature flags that alter behavior + +## Key Concepts and Entities + +- **Task**: Unit of work with attributes such as `is_labeled`, `overlap`, predictions, timestamps, etc. +- **Annotation**: User submission for a task (may be cancelled/skipped). Multiple annotations per task for overlap. +- **Prepared tasks**: The base `QuerySet` of tasks eligible for the current session/filters (from Data Manager and/or assignments). +- **Assigned flag**: Indicates manual assignment; when set, assigned tasks take precedence over general queues. +- **DM queue**: Data Manager ordering is respected (e.g., user clicked "Label All Tasks" with a specific sort/filter selection). +- **Sampling**: Strategy to choose the next task among eligible ones: `SEQUENCE`, `UNCERTAINTY`, `UNIFORM`. 
+- **Locks**: Short-lived per-user locks to prevent concurrent selection of the same task.
+
+## Inputs to the Label Stream Engine
+
+- `user`: Current annotator.
+- `project`: Current project (includes settings, sampling mode, overlap rules, feature flags, enterprise fields).
+- `prepared_tasks`: QuerySet pre-filtered by Data Manager selection and visibility permissions.
+- `dm_queue`: Whether Data Manager queue ordering is actively used.
+- `assigned_flag`: Whether the annotator has manual assignments that must be served first.
+
+## Task and Annotation States
+
+- Task
+  - `is_labeled`: True if the task is considered complete for labeling (reaches required overlap or business completion criteria).
+  - `overlap`: Required number of annotations for completion (>= 1).
+  - Predictions: `predictions.model_version`, `predictions.cluster`, `predictions.score` (for uncertainty sampling).
+  - Locks: Labeling locks held per user.
+  - Flags: Included via settings such as `show_overlap_first`, `show_ground_truth_first`.
+- Annotation
+  - `completed_by`: Author.
+  - `was_cancelled`: True for skipped.
+  - Drafts: may be `was_postponed`.
+
+## High-Level Flow
+
+```mermaid
+flowchart TD
+    A[Start get_next_task] --> B[Compute not_solved_tasks]
+    B -->|assigned_flag True| C["Use assigned tasks (first)"]
+    C --> L["Return task (no lock set)"]
+    B -->|assigned_flag False| D[Check existing lock for user]
+    D -->|Lock exists| L
+    D --> E{Prioritized on low agreement?}
+    E -->|Yes| F[Pick first unlocked from low-agreement-ordered tasks]
+    E -->|No| G{show_ground_truth_first?}
+    F --> H
+    G -->|Yes| I[Try ground-truth tasks]
+    G -->|No| H{"maximum_annotations > 1?"}
+    I --> H
+    H -->|Yes| J["Try breadth-first (tasks with max annotations)"]
+    H -->|No| K{Feature flag overlap-first routing?}
+    J --> K
+    K -->|New overlap-first| O["Filter to overlap>1 then sample"]
+    K -->|Legacy/Disabled| P["Maybe pre-filter overlap>1 earlier"]
+    O --> Q[Sampling by project.sampling]
+    P --> Q
+    Q --> R{dm_queue?}
+    R -->|Yes and none yet| S["Use DM ordering: first()"]
+    R -->|No or already chosen| T[Postponed draft queue]
+    S --> T
+    T --> U[Skipped queue]
+    U --> V{Have task?}
+    V -->|Yes and lock needed| W[Set lock for user]
+    V -->|No| X[Return None]
+    W --> Y[Record stream history]
+    L --> Y
+    Y --> Z[Finish]
+```
+
+## Detailed Steps and Rules
+
+1. Build `not_solved_tasks`
+   - Start from `prepared_tasks` and exclude tasks already annotated by the user.
+   - Exclude the user’s postponed drafts when applicable.
+   - Enterprise (if feature flag and settings enabled): include tasks already labeled but with agreement below a threshold; otherwise filter to `is_labeled=False`.
+   - Optionally pre-filter to `overlap>1` first when configured and not already prioritized on low agreement.
+
+2. Early exits and locks
+   - If `assigned_flag` is set: return the first assigned task without setting a lock (manual queue).
+   - If the user already holds a task lock within `not_solved_tasks`: return it without setting a new lock.
+
+3. Priority queues before sampling
+   - Low agreement queue (Enterprise): if prioritized, pick the first unlocked task.
+   - Ground truth queue: if enabled, prefer tasks with ground-truth annotations.
+   - Breadth-first queue: when `maximum_annotations>1`, prefer tasks with the highest existing annotation count (finishing in-progress tasks sooner).
+
+4. Overlap-first routing (two modes)
+   - Legacy (pre-flag): pre-filter `overlap>1` in the `not_solved_tasks` stage. 
+ - New (flagged): construct the overlap>1 subset and perform sampling within that subset. If none found, fall back to the full set. + +5. Sampling strategies + - `SEQUENCE`: first unlocked task by ordering. + - `UNCERTAINTY`: + - Consider tasks with predictions matching current `project.model_version`. + - Cluster-aware de-biasing: score tasks by how many tasks the user already solved in the same cluster; prefer less-solved clusters and lower `predictions.score`. + - Randomize among the top N to reduce annotator collisions when many annotators are online. + - Fallback to random uniform sampling if no current predictions. + - `UNIFORM`: random unlocked within the candidate set. + +6. Postponed and skipped queues + - Postponed drafts: if the user has postponed drafts for this project, present them first; suppress postpone option for the returned task. + - Skipped (Requeue-for-me): if project skip policy is `REQUEUE_FOR_ME`, return previously skipped tasks by this user in FIFO order. + +7. Locking and finish + - If a task was selected via queues/sampling and a lock is needed, set a lock with TTL proportional to the average lead time. + - Append stream history for analytics/debugging; return task and a human-readable `queue_info` string indicating which path was used. + +## Enterprise Agreement Threshold Behavior + +When the Enterprise feature flag and project’s LSE fields are enabled: + +- The engine can include tasks that are already labeled but whose agreement is below `agreement_threshold`. +- It also caps the number of additional annotators per task via `max_additional_annotators_assignable` to avoid infinite reassignment. +- Low-agreement tasks can be prioritized by ordering the candidate set by `-is_labeled` then ascending agreement (labeled-low-agreement first), and sampling within this ordered sequence. + +## Skip/Postpone Semantics + +- Skip produces a cancelled annotation. If `SkipQueue.REQUEUE_FOR_ME` is configured, the task returns to the user later. +- Postpone records a draft flagged as postponed; postponed drafts are elevated in priority on re-entry into the stream. + +## Concurrency and Locks + +- Locks are set via `Task.set_lock(user)` and queried via `Task.get_locked_by(user)` and `task.has_lock(user)`. +- Database `select_for_update(skip_locked=True)` is used to reduce collisions when probing candidate tasks. + +```mermaid +sequenceDiagram + participant U as User + participant LS as Label Stream Engine + participant DB as DB + U->>LS: Request next task + LS->>DB: Probe candidate ids (ordered/randomized) + loop Until unlocked + LS->>DB: select_for_update(skip_locked) + alt Task unlocked & no user lock + LS->>DB: set_lock(user) + break + else Task locked + LS->>DB: try next id + end + end + LS-->>U: Return task + queue_info +``` + +## Feature Flags That Affect Behavior (non-exhaustive) + +- `fflag_fix_back_lsdv_4523_show_overlap_first_order_27022023_short`: Enables the newer overlap-first routing. +- `fflag_feat_optic_161_project_settings_for_low_agreement_threshold_score_short`: Enables low-agreement prioritization under LSE. +- `fflag_feat_all_leap_1825_annotator_evaluation_short`: Onboarding mode nuance for `is_labeled` filtering. +- `fflag_fix_back_dev_4185_next_task_additional_logging_long`: Adds verbose debug logging. 
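+
+To make the gating concrete, here is a simplified sketch of how one of these flags can route between the two overlap-first modes. The helper name is invented for illustration; the real checks live inside `get_next_task` and its helpers in `projects/functions/next_task.py`, and the exact `flag_set` signature should be treated as an assumption:
+
+```python
+from core.feature_flags import flag_set  # Label Studio's feature-flag helper
+
+
+def choose_overlap_candidates(user, not_solved_tasks):
+    """Illustrative only: pick the candidate set for overlap-first routing."""
+    if flag_set('fflag_fix_back_lsdv_4523_show_overlap_first_order_27022023_short', user=user):
+        # New behavior: sample within the overlap > 1 subset, falling back to the full set
+        overlap_first = not_solved_tasks.filter(overlap__gt=1)
+        return overlap_first if overlap_first.exists() else not_solved_tasks
+    # Legacy behavior: overlap > 1 pre-filtering already happened upstream
+    return not_solved_tasks
+```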
+ +## Settings That Influence Flow + +- Project-level + - `sampling`: `SEQUENCE` | `UNCERTAINTY` | `UNIFORM` + - `maximum_annotations`: upper bound for concurrency and breadth-first behavior + - `show_overlap_first`, `show_ground_truth_first` + - Skip policy: `SkipQueue.REQUEUE_FOR_ME` +- Enterprise project-level + - `agreement_threshold` + - `max_additional_annotators_assignable` + +## Edge Cases and Fallbacks + +- If no predictions exist for `UNCERTAINTY`, fallback to uniform random. +- If all candidates are locked, no task is returned; the client should retry. +- If manual assignments exist and `assigned_flag` is set, the assigned-first path is used. + +## Notes for Redesign + +- The current flow is a blend of priority queues (manual, ground-truth, low-agreement, breadth-first), a conditional overlap-first sub-pipeline, and a configurable sampler. This can be modeled as a modular pipeline with pluggable stages for filtering, prioritization, and selection, governed by project policy and feature flags. + + + diff --git a/review_stream.md b/review_stream.md new file mode 100644 index 000000000000..ec85d18df9dd --- /dev/null +++ b/review_stream.md @@ -0,0 +1,137 @@ +# Review Stream: Annotation Review Task Selection + +## Overview + +This document consolidates the reviewer flow based on `label_studio_enterprise/reviews/views/api.py` (notably `ReviewNextTaskAPI`) and related settings. It describes how the system selects the next annotation for review, including: + +- Inputs and project review settings +- Manual vs auto-assigned review +- Selection criteria depending on review policy +- Ordering and limits +- Reject-updated prioritization +- Locking to prevent collisions + +## Key Concepts and Entities + +- **Annotation Review**: Accept/Reject (plus optional comments) for a specific annotation. +- **Review criteria**: One of the policy values defined in `ReviewSettings`: + - `REVIEW_CRITERIA_ONE`: Mark task reviewed after at least one accepted annotation. + - `REVIEW_CRITERIA_ALL`: Mark task reviewed after every annotation in the task is processed. +- **Manual assignment**: Annotations explicitly assigned to a reviewer via assignments of type `REVIEW`. +- **Auto-assignment**: The system selects eligible annotations to review. +- **Only finished tasks**: Option to restrict review to tasks that are labeled/complete for labeling. +- **Updated after reject**: Prioritize annotations that have been updated following a rejection. + +## Inputs to the Review Engine + +- `user`: Current reviewer. +- `project`: Current project and its `review_settings`. +- `request.data`: May include `selectedItems`, indicating a Data Manager subset and ordering. +- Feature flags: Several flags toggle optimized queries and ordering behaviors. + +## High-Level Flow + +```mermaid +flowchart TD + A[Start ReviewNextTask] --> B[Collect candidate tasks/annotations] + B --> C{Manual assignments exist?} + C -->|Yes| D[Build manual annotations subset] + C -->|No| E{Review only manual?} + E -->|Yes| F[Stop: None] + E -->|No| G[Auto-assigned path] + D --> H[Apply criteria & limits] + G --> I[Updated-after-reject priority] + I -->|Found| J[Return prioritized annotation] + I -->|Not Found| K[Filter by only_finished_tasks] + K --> L[Apply review criteria] + L --> H + H --> M[Order, limit, then lock one] + M --> N[Return task + annotation] +``` + +## Detailed Steps and Rules + +1. Determine base scope + - If `selectedItems` are provided, use `get_prepared_queryset` to build task scope and subset of annotations in that scope. 
+ - Otherwise, default to the project-wide task set (filtered by permissions and optionally by `only_finished_tasks`). + +2. Manual vs. Auto + - If reviewer has manual assignments and manual subset has eligible annotations, take the manual path first. + - If there are no manual eligible annotations and `review_only_manual_assignments` is enabled, stop with "No more annotations". + - Otherwise proceed with the auto path. + +3. Updated-after-reject priority (Enterprise) + - When enabled by feature flag, prefer annotations that were updated following a reject. This provides fast feedback loops on corrections. + +4. Only finished tasks + - If `only_finished_tasks` is enabled, restrict the scope to tasks that are labeling-complete (`is_labeled=True`). + - Enterprise agreement threshold (optimized path): Optionally limit tasks to those meeting agreement thresholds or sufficient annotator counts. + +5. Apply review criteria + - `REVIEW_CRITERIA_ONE`: + - Exclude tasks that already have an accepted review for any of their annotations. + - Consider remaining annotations for these tasks. + - `REVIEW_CRITERIA_ALL`: + - Include tasks that still have at least one annotation with no review. + - Exclude annotations already reviewed by this reviewer. + +6. Ordering and limits + - Order by task and annotation id (or by updated_at when a flag is off), and cap by `REVIEW_TASK_LIMIT` and `REVIEW_ANNOTATION_LIMIT` to reduce query load. + - Use `.only('task','id')` to optimize fetching. + +7. Locking + +```mermaid +sequenceDiagram + participant R as Reviewer + participant RS as Review Engine + participant DB as DB + R->>RS: Request next review annotation + RS->>DB: Build limited candidate set + loop iterate candidates + RS->>DB: select_for_update().only('id') on Task + alt Task already has review lock by R + RS->>DB: skip and continue + else Task available + RS->>DB: set_review_lock(R) + break + end + end + RS-->>R: Return task + annotation + metadata +``` + +## Enterprise Agreement Threshold Behavior (Auto path) + +When enabled and using the optimized flow: + +- If `only_finished_tasks` is set, candidate tasks can be filtered by: + - `agreement >= agreement_threshold` and `is_labeled = True`, OR + - `annotators >= overlap + max_additional_annotators_assignable` (to avoid repeatedly queueing tasks for more reviews). +- After filtering tasks, annotations are selected/ordered from that reduced set. + +## Feature Flags (non-exhaustive) + +- `ff_back_DEV_3374_review_query_160922_short`: Enables optimized review queries. +- `fflag_fix_back_dev_3668_review_stream_optimizaion_short`: Enables further optimized path (`new_review_optimized`). +- `ff_back_DEV_1711_review_queue_140222_short`: Enables updated-after-reject prioritization. +- `fflag_feat_all_leap_1081_reviewer_flow_updates`: Adjusts ordering to be strictly by task, id. + +## Settings That Influence Flow + +- Project `review_settings`: + - `review_criteria`: `ONE` or `ALL`. + - `only_finished_tasks`: bool. + - `review_only_manual_assignments`: bool. +- Assignments: Existence of manual assignments shifts priority. + +## Edge Cases and Fallbacks + +- If all candidate tasks are locked, return None and the client should retry. +- If manual assignments exist but none are eligible under the chosen criteria, auto path is used (unless manual-only is set). + +## Notes for Redesign + +- The review flow is a pipeline: scope -> manual-or-auto -> prioritization (updated-after-reject) -> policy filter (criteria) -> ordering/limits -> lock. 
This structure is amenable to a configurable pipeline with reusable stages. + + + From ff564c90210ff36bf8f07b6972b87c30b8962dcb Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sat, 13 Sep 2025 00:16:21 +0000 Subject: [PATCH 04/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17688845812 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index e95ffcd71ba0..7458992f1772 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "4af783a28749d0668b372f7bb80be627b2d48597.zip", hash = "sha256:1dd09454275d1bfc7c2a852a912a2fc38fab6938b5aa5c0a0e607e2b255e5830"}, + {file = "e7304fafd65bbff1c289cae5d90060ff458e4952.zip", hash = "sha256:ef534d803cb023fafc635249a5b0af3da730d89b9ba752c9ecde283fec15a25c"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/4af783a28749d0668b372f7bb80be627b2d48597.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/e7304fafd65bbff1c289cae5d90060ff458e4952.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "9a85c8c75d3787e65bb0955f56a1b5569a4f089436789d0a7769bf296c6d316a" +content-hash = "dbf9f9c5b32c6b04b8d915a094cad46ceaedf3d1e554593b62e3aa99c5c626b9" diff --git a/pyproject.toml b/pyproject.toml index 11c529818490..f3beb159bbb0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/4af783a28749d0668b372f7bb80be627b2d48597.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/e7304fafd65bbff1c289cae5d90060ff458e4952.zip", ## HumanSignal repo dependencies :end ] From 653827cf9b079a24936a4102809b396d6c956f5f Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sat, 13 Sep 2025 00:22:41 +0000 Subject: [PATCH 05/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17688941514 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7458992f1772..cbac0b778d38 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "e7304fafd65bbff1c289cae5d90060ff458e4952.zip", hash = "sha256:ef534d803cb023fafc635249a5b0af3da730d89b9ba752c9ecde283fec15a25c"}, + {file = "666808c309f04870bd65dbb9b541e2387574e59e.zip", hash = "sha256:d59a7c377f27f884861f92ae6132364a25638862f42057fc162bb84ae1968b19"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/e7304fafd65bbff1c289cae5d90060ff458e4952.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/666808c309f04870bd65dbb9b541e2387574e59e.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "dbf9f9c5b32c6b04b8d915a094cad46ceaedf3d1e554593b62e3aa99c5c626b9" 
+content-hash = "f49ab12f7c5d9ee4bc084f41fc38699b2b7e28836d727435c663baa0a5b6cec2" diff --git a/pyproject.toml b/pyproject.toml index f3beb159bbb0..dccf495a8286 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/e7304fafd65bbff1c289cae5d90060ff458e4952.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/666808c309f04870bd65dbb9b541e2387574e59e.zip", ## HumanSignal repo dependencies :end ] From d2d26bc85519c0c3e9d044bc91c406041d34fc5d Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sat, 13 Sep 2025 01:33:33 +0000 Subject: [PATCH 06/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17689834997 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index cbac0b778d38..e545a3a51cd2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "666808c309f04870bd65dbb9b541e2387574e59e.zip", hash = "sha256:d59a7c377f27f884861f92ae6132364a25638862f42057fc162bb84ae1968b19"}, + {file = "d6889a8596e8345bc2adb8c98094aa53963a6237.zip", hash = "sha256:d822039e379040c1e367d1c03ccab27cc5e078ab9d7cd9ad080cd192c8fe3d50"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/666808c309f04870bd65dbb9b541e2387574e59e.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/d6889a8596e8345bc2adb8c98094aa53963a6237.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "f49ab12f7c5d9ee4bc084f41fc38699b2b7e28836d727435c663baa0a5b6cec2" +content-hash = "4d70e4863d87b343c63caa25146fd8c426c5d2cab44b6b692d892769a7da33c2" diff --git a/pyproject.toml b/pyproject.toml index dccf495a8286..e429ce462cb0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/666808c309f04870bd65dbb9b541e2387574e59e.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/d6889a8596e8345bc2adb8c98094aa53963a6237.zip", ## HumanSignal repo dependencies :end ] From f5689e4cad3eb8a68ce9b497c3adaa253894b2ed Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sat, 13 Sep 2025 02:56:32 +0000 Subject: [PATCH 07/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17690766570 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index e545a3a51cd2..675e9567f6e0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "d6889a8596e8345bc2adb8c98094aa53963a6237.zip", hash = "sha256:d822039e379040c1e367d1c03ccab27cc5e078ab9d7cd9ad080cd192c8fe3d50"}, + {file = "50fe64307c080d4f70daca2d4d64467d8a4b3b57.zip", hash = "sha256:9e9aa51306a91b3f7399d0ea5b7760c156ec88a6e563c8c99850dc2770c83e0c"}, ] 
[package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/d6889a8596e8345bc2adb8c98094aa53963a6237.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/50fe64307c080d4f70daca2d4d64467d8a4b3b57.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "4d70e4863d87b343c63caa25146fd8c426c5d2cab44b6b692d892769a7da33c2" +content-hash = "751cc4ee486f68e525dd4214345ae468d4af455290e3a570e63897c616d8c6ca" diff --git a/pyproject.toml b/pyproject.toml index e429ce462cb0..43bbe0d739be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/d6889a8596e8345bc2adb8c98094aa53963a6237.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/50fe64307c080d4f70daca2d4d64467d8a4b3b57.zip", ## HumanSignal repo dependencies :end ] From eea9de7f74bc70f25409b2d11a6b1f971abf116d Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sat, 13 Sep 2025 14:24:23 +0000 Subject: [PATCH 08/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17697804120 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 675e9567f6e0..70bc5b2c9917 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "50fe64307c080d4f70daca2d4d64467d8a4b3b57.zip", hash = "sha256:9e9aa51306a91b3f7399d0ea5b7760c156ec88a6e563c8c99850dc2770c83e0c"}, + {file = "ab74e9f5f179c48f05c580fb1ca2da1a587addd5.zip", hash = "sha256:645af89872d9caaed9a8bdf52667af5cbb34a47357265eee5e386e01b853fd4d"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/50fe64307c080d4f70daca2d4d64467d8a4b3b57.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/ab74e9f5f179c48f05c580fb1ca2da1a587addd5.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "751cc4ee486f68e525dd4214345ae468d4af455290e3a570e63897c616d8c6ca" +content-hash = "819a71f383518e701bc5d90344b1b7cb7c5a0525df54094fea79260109dd27e1" diff --git a/pyproject.toml b/pyproject.toml index 43bbe0d739be..8f32446786b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/50fe64307c080d4f70daca2d4d64467d8a4b3b57.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/ab74e9f5f179c48f05c580fb1ca2da1a587addd5.zip", ## HumanSignal repo dependencies :end ] From 6505c2684c45a11ba713d8b879d8d02fce5a5947 Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sat, 13 Sep 2025 14:34:27 +0000 Subject: [PATCH 09/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17697918188 --- poetry.lock | 6 +++--- pyproject.toml | 2 
+- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 70bc5b2c9917..fb033e7d7693 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "ab74e9f5f179c48f05c580fb1ca2da1a587addd5.zip", hash = "sha256:645af89872d9caaed9a8bdf52667af5cbb34a47357265eee5e386e01b853fd4d"}, + {file = "c08c1db419731bf8fedea87cda575c23a9f764a2.zip", hash = "sha256:6da563dcf37fb71c92f6043a035b22d6b24d816a1cfffa16e9a7b64ef5ea4b24"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/ab74e9f5f179c48f05c580fb1ca2da1a587addd5.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/c08c1db419731bf8fedea87cda575c23a9f764a2.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "819a71f383518e701bc5d90344b1b7cb7c5a0525df54094fea79260109dd27e1" +content-hash = "c152107325824bc9bd40076de20a31c0fa07161efa4f743c626fb75fb6eedc31" diff --git a/pyproject.toml b/pyproject.toml index 8f32446786b1..2386eb93732f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/ab74e9f5f179c48f05c580fb1ca2da1a587addd5.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/c08c1db419731bf8fedea87cda575c23a9f764a2.zip", ## HumanSignal repo dependencies :end ] From f1f96d4adfc5b231847602584682e8bca4e338c6 Mon Sep 17 00:00:00 2001 From: Max Tkachenko Date: Sat, 13 Sep 2025 18:09:15 +0300 Subject: [PATCH 10/26] Delete label_studio/io_storages/gcs/README.md --- label_studio/io_storages/gcs/README.md | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 label_studio/io_storages/gcs/README.md diff --git a/label_studio/io_storages/gcs/README.md b/label_studio/io_storages/gcs/README.md deleted file mode 100644 index 48a947212a22..000000000000 --- a/label_studio/io_storages/gcs/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Google Cloud Storage (GCS) - -## Credentials - -There are two methods for setting GCS credentials: -1. Through the Project => Cloud Storage settings in the Label Studio user interface. -2. Through Google Application Default Credentials (ADC). This involves the following steps: - - 2.1. Leave the Google Application Credentials field in the Label Studio UI blank. - - 2.2. Set an environment variable which will apply to all Cloud Storages. This can be done using the following command: - ```bash - export GOOGLE_APPLICATION_CREDENTIALS=google_credentials.json - ``` - 2.3. Alternatively, use the following command: - ```bash - gcloud auth application-default login - ``` - 2.4. Another option is to use credentials provided by the Google App Engine or Google Compute Engine metadata server, if the code is running on either GAE or GCE. - -Note: If Cloud Storage credentials are set in the Label Studio UI, these will take precedence over other methods. 
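For reference, the two credential paths described in the removed GCS README above map directly onto the client constructors below — a minimal sketch assuming the `google-cloud-storage` package, with an illustrative bucket name:

```python
# Minimal sketch of the two GCS credential paths (google-cloud-storage assumed).
from google.cloud import storage

# Path 1: explicit service-account key, as entered in the Label Studio UI.
ui_client = storage.Client.from_service_account_json("google_credentials.json")

# Path 2: Application Default Credentials (ADC) — resolved from
# GOOGLE_APPLICATION_CREDENTIALS, `gcloud auth application-default login`,
# or the GAE/GCE metadata server when no key is passed explicitly.
adc_client = storage.Client()

# Either client lists objects the same way ("my-bucket" is illustrative).
for blob in adc_client.list_blobs("my-bucket", prefix="tasks/"):
    print(blob.name)
```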
\ No newline at end of file From 84c58d2cab4cf78254510590b53a41f2c3e623de Mon Sep 17 00:00:00 2001 From: Max Tkachenko Date: Sat, 13 Sep 2025 18:09:43 +0300 Subject: [PATCH 11/26] Delete label_stream.md --- label_stream.md | 186 ------------------------------------------------ 1 file changed, 186 deletions(-) delete mode 100644 label_stream.md diff --git a/label_stream.md b/label_stream.md deleted file mode 100644 index 1b26e106cd88..000000000000 --- a/label_stream.md +++ /dev/null @@ -1,186 +0,0 @@ -# Label Stream: Task Selection, Queues, and Sampling - -## Overview - -This document explains how Label Studio selects the next task for an annotator when they enter the Label Stream ("Label All Tasks"). It consolidates behavior from `label_studio/projects/functions/next_task.py` and relevant project settings described in docs. It covers: - -- Inputs and task/annotation states -- Queueing and prioritization rules -- Sampling strategies and fallbacks -- Skip/postpone flows -- Locking and concurrency -- Enterprise-specific agreement threshold behavior -- Feature flags that alter behavior - -## Key Concepts and Entities - -- **Task**: Unit of work with attributes such as `is_labeled`, `overlap`, predictions, timestamps, etc. -- **Annotation**: User submission for a task (may be cancelled/skipped). Multiple annotations per task for overlap. -- **Prepared tasks**: The base `QuerySet` of tasks eligible for the current session/filters (from Data Manager and/or assignments). -- **Assigned flag**: Indicates manual assignment; when set, assigned tasks take precedence over general queues. -- **DM queue**: Data Manager ordering is respected (e.g., user clicked "Label All Tasks" with a specific sort/filter selection). -- **Sampling**: Strategy to choose the next task among eligible ones: `SEQUENCE`, `UNCERTAINTY`, `UNIFORM`. -- **Locks**: Short-lived per-user locks to prevent concurrent selection of the same task. - -## Inputs to the Label Stream Engine - -- `user`: Current annotator. -- `project`: Current project (includes settings, sampling mode, overlap rules, feature flags, enterprise fields). -- `prepared_tasks`: QuerySet pre-filtered by Data Manager selection and visibility permissions. -- `dm_queue`: Whether Data Manager queue ordering is actively used. -- `assigned_flag`: Whether the annotator has manual assignments that must be served first. - -## Task and Annotation States - -- Task - - `is_labeled`: True if the task is considered complete for labeling (reaches required overlap or business completion criteria). - - `overlap`: Required number of annotations for completion (>= 1). - - Predictions: `predictions.model_version`, `predictions.cluster`, `predictions.score` (for uncertainty sampling). - - Locks: Labeling locks held per user. - - Flags: Included via settings such as `show_overlap_first`, `show_ground_truth_first`. -- Annotation - - `completed_by`: Author. - - `was_cancelled`: True for skipped. - - Drafts: may be `was_postponed`. 
- -## High-Level Flow - -```mermaid -flowchart TD - A[Start get_next_task] --> B[Compute not_solved_tasks] - B -->|assigned_flag True| C[Use assigned tasks (first)] - C --> L[Return task (no lock set)] - B -->|assigned_flag False| D[Check existing lock for user] - D -->|Lock exists| L - D --> E{Prioritized on low agreement?} - E -->|Yes| F[Pick first unlocked from low-agreement-ordered tasks] - E -->|No| G{show_ground_truth_first?} - F --> H - G -->|Yes| I[Try ground-truth tasks] - G -->|No| H{maximum_annotations > 1?} - I --> H - H -->|Yes| J[Try breadth-first (tasks with max annotations)] - H -->|No| K{Feature flag overlap-first routing?} - J --> K - K -->|New overlap-first| O[Filter to overlap>1 then sample] - K -->|Legacy/Disabled| P[Maybe pre-filter overlap>1 earlier] - O --> Q[Sampling by project.sampling] - P --> Q - Q --> R{dm_queue?} - R -->|Yes and none yet| S[Use DM ordering: first()] - R -->|No or already chosen| T[Postponed draft queue] - S --> T - T --> U[Skipped queue] - U --> V{Have task?} - V -->|Yes and lock needed| W[Set lock for user] - V -->|No| X[Return None] - W --> Y[Record stream history] - L --> Y - Y --> Z[Finish] -``` - -## Detailed Steps and Rules - -1. Build `not_solved_tasks` - - Start from `prepared_tasks` and exclude tasks already annotated by the user. - - Exclude the user’s postponed drafts when applicable. - - Enterprise (if feature flag and settings enabled): include tasks already labeled but with agreement below a threshold; otherwise filter to `is_labeled=False`. - - Optionally pre-filter to `overlap>1` first when configured and not already prioritized on low agreement. - -2. Early exits and locks - - If `assigned_flag` is set: return the first assigned task without setting a lock (manual queue). - - If the user already holds a task lock within `not_solved_tasks`: return it without setting a new lock. - -3. Priority queues before sampling - - Low agreement queue (Enterprise): if prioritized, pick the first unlocked task. - - Ground truth queue: if enabled, prefer tasks with ground-truth annotations. - - Breadth-first queue: when `maximum_annotations>1`, prefer tasks with the highest existing annotation count (finishing in-progress tasks sooner). - -4. Overlap-first routing (two modes) - - Legacy (pre-flag): pre-filter `overlap>1` in the `not_solved_tasks` stage. - - New (flagged): construct the overlap>1 subset and perform sampling within that subset. If none found, fall back to the full set. - -5. Sampling strategies - - `SEQUENCE`: first unlocked task by ordering. - - `UNCERTAINTY`: - - Consider tasks with predictions matching current `project.model_version`. - - Cluster-aware de-biasing: score tasks by how many tasks the user already solved in the same cluster; prefer less-solved clusters and lower `predictions.score`. - - Randomize among the top N to reduce annotator collisions when many annotators are online. - - Fallback to random uniform sampling if no current predictions. - - `UNIFORM`: random unlocked within the candidate set. - -6. Postponed and skipped queues - - Postponed drafts: if the user has postponed drafts for this project, present them first; suppress postpone option for the returned task. - - Skipped (Requeue-for-me): if project skip policy is `REQUEUE_FOR_ME`, return previously skipped tasks by this user in FIFO order. - -7. Locking and finish - - If a task was selected via queues/sampling and a lock is needed, set a lock with TTL proportional to the average lead time. 
- - Append stream history for analytics/debugging; return task and a human-readable `queue_info` string indicating which path was used. - -## Enterprise Agreement Threshold Behavior - -When the Enterprise feature flag and project’s LSE fields are enabled: - -- The engine can include tasks that are already labeled but whose agreement is below `agreement_threshold`. -- It also caps the number of additional annotators per task via `max_additional_annotators_assignable` to avoid infinite reassignment. -- Low-agreement tasks can be prioritized by ordering the candidate set by `-is_labeled` then ascending agreement (labeled-low-agreement first), and sampling within this ordered sequence. - -## Skip/Postpone Semantics - -- Skip produces a cancelled annotation. If `SkipQueue.REQUEUE_FOR_ME` is configured, the task returns to the user later. -- Postpone records a draft flagged as postponed; postponed drafts are elevated in priority on re-entry into the stream. - -## Concurrency and Locks - -- Locks are set via `Task.set_lock(user)` and queried via `Task.get_locked_by(user)` and `task.has_lock(user)`. -- Database `select_for_update(skip_locked=True)` is used to reduce collisions when probing candidate tasks. - -```mermaid -sequenceDiagram - participant U as User - participant LS as Label Stream Engine - participant DB as DB - U->>LS: Request next task - LS->>DB: Probe candidate ids (ordered/randomized) - loop Until unlocked - LS->>DB: select_for_update(skip_locked) - alt Task unlocked & no user lock - LS->>DB: set_lock(user) - break - else Task locked - LS->>DB: try next id - end - end - LS-->>U: Return task + queue_info -``` - -## Feature Flags That Affect Behavior (non-exhaustive) - -- `fflag_fix_back_lsdv_4523_show_overlap_first_order_27022023_short`: Enables the newer overlap-first routing. -- `fflag_feat_optic_161_project_settings_for_low_agreement_threshold_score_short`: Enables low-agreement prioritization under LSE. -- `fflag_feat_all_leap_1825_annotator_evaluation_short`: Onboarding mode nuance for `is_labeled` filtering. -- `fflag_fix_back_dev_4185_next_task_additional_logging_long`: Adds verbose debug logging. - -## Settings That Influence Flow - -- Project-level - - `sampling`: `SEQUENCE` | `UNCERTAINTY` | `UNIFORM` - - `maximum_annotations`: upper bound for concurrency and breadth-first behavior - - `show_overlap_first`, `show_ground_truth_first` - - Skip policy: `SkipQueue.REQUEUE_FOR_ME` -- Enterprise project-level - - `agreement_threshold` - - `max_additional_annotators_assignable` - -## Edge Cases and Fallbacks - -- If no predictions exist for `UNCERTAINTY`, fallback to uniform random. -- If all candidates are locked, no task is returned; the client should retry. -- If manual assignments exist and `assigned_flag` is set, the assigned-first path is used. - -## Notes for Redesign - -- The current flow is a blend of priority queues (manual, ground-truth, low-agreement, breadth-first), a conditional overlap-first sub-pipeline, and a configurable sampler. This can be modeled as a modular pipeline with pluggable stages for filtering, prioritization, and selection, governed by project policy and feature flags. 
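The "Concurrency and Locks" section of the deleted document above pairs naturally with a sketch of the probe-and-lock loop it describes — hedged code against the Django ORM, assuming a `Task` model exposing the `has_lock()`/`set_lock()` helpers named in the text, not the actual Label Studio implementation:

```python
# Sketch of the skip-locked probing loop described in label_stream.md.
from django.db import transaction

# from tasks.models import Task  # assumed model with has_lock()/set_lock()

def acquire_next_task(candidate_ids, user):
    """Try candidates in order; return the first task we can lock."""
    for task_id in candidate_ids:
        with transaction.atomic():
            # skip_locked avoids blocking on rows another worker is probing
            task = (
                Task.objects.select_for_update(skip_locked=True)
                .filter(id=task_id, is_labeled=False)
                .first()
            )
            if task is not None and not task.has_lock(user):
                task.set_lock(user)
                return task
    return None  # all candidates locked; the client should retry
```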
- - - From 85edbfa15de1f850718656cb823bf7173f217102 Mon Sep 17 00:00:00 2001 From: Max Tkachenko Date: Sat, 13 Sep 2025 18:10:04 +0300 Subject: [PATCH 12/26] Delete review_stream.md --- review_stream.md | 137 ----------------------------------------------- 1 file changed, 137 deletions(-) delete mode 100644 review_stream.md diff --git a/review_stream.md b/review_stream.md deleted file mode 100644 index ec85d18df9dd..000000000000 --- a/review_stream.md +++ /dev/null @@ -1,137 +0,0 @@ -# Review Stream: Annotation Review Task Selection - -## Overview - -This document consolidates the reviewer flow based on `label_studio_enterprise/reviews/views/api.py` (notably `ReviewNextTaskAPI`) and related settings. It describes how the system selects the next annotation for review, including: - -- Inputs and project review settings -- Manual vs auto-assigned review -- Selection criteria depending on review policy -- Ordering and limits -- Reject-updated prioritization -- Locking to prevent collisions - -## Key Concepts and Entities - -- **Annotation Review**: Accept/Reject (plus optional comments) for a specific annotation. -- **Review criteria**: One of the policy values defined in `ReviewSettings`: - - `REVIEW_CRITERIA_ONE`: Mark task reviewed after at least one accepted annotation. - - `REVIEW_CRITERIA_ALL`: Mark task reviewed after every annotation in the task is processed. -- **Manual assignment**: Annotations explicitly assigned to a reviewer via assignments of type `REVIEW`. -- **Auto-assignment**: The system selects eligible annotations to review. -- **Only finished tasks**: Option to restrict review to tasks that are labeled/complete for labeling. -- **Updated after reject**: Prioritize annotations that have been updated following a rejection. - -## Inputs to the Review Engine - -- `user`: Current reviewer. -- `project`: Current project and its `review_settings`. -- `request.data`: May include `selectedItems`, indicating a Data Manager subset and ordering. -- Feature flags: Several flags toggle optimized queries and ordering behaviors. - -## High-Level Flow - -```mermaid -flowchart TD - A[Start ReviewNextTask] --> B[Collect candidate tasks/annotations] - B --> C{Manual assignments exist?} - C -->|Yes| D[Build manual annotations subset] - C -->|No| E{Review only manual?} - E -->|Yes| F[Stop: None] - E -->|No| G[Auto-assigned path] - D --> H[Apply criteria & limits] - G --> I[Updated-after-reject priority] - I -->|Found| J[Return prioritized annotation] - I -->|Not Found| K[Filter by only_finished_tasks] - K --> L[Apply review criteria] - L --> H - H --> M[Order, limit, then lock one] - M --> N[Return task + annotation] -``` - -## Detailed Steps and Rules - -1. Determine base scope - - If `selectedItems` are provided, use `get_prepared_queryset` to build task scope and subset of annotations in that scope. - - Otherwise, default to the project-wide task set (filtered by permissions and optionally by `only_finished_tasks`). - -2. Manual vs. Auto - - If reviewer has manual assignments and manual subset has eligible annotations, take the manual path first. - - If there are no manual eligible annotations and `review_only_manual_assignments` is enabled, stop with "No more annotations". - - Otherwise proceed with the auto path. - -3. Updated-after-reject priority (Enterprise) - - When enabled by feature flag, prefer annotations that were updated following a reject. This provides fast feedback loops on corrections. - -4. 
Only finished tasks - - If `only_finished_tasks` is enabled, restrict the scope to tasks that are labeling-complete (`is_labeled=True`). - - Enterprise agreement threshold (optimized path): Optionally limit tasks to those meeting agreement thresholds or sufficient annotator counts. - -5. Apply review criteria - - `REVIEW_CRITERIA_ONE`: - - Exclude tasks that already have an accepted review for any of their annotations. - - Consider remaining annotations for these tasks. - - `REVIEW_CRITERIA_ALL`: - - Include tasks that still have at least one annotation with no review. - - Exclude annotations already reviewed by this reviewer. - -6. Ordering and limits - - Order by task and annotation id (or by updated_at when a flag is off), and cap by `REVIEW_TASK_LIMIT` and `REVIEW_ANNOTATION_LIMIT` to reduce query load. - - Use `.only('task','id')` to optimize fetching. - -7. Locking - -```mermaid -sequenceDiagram - participant R as Reviewer - participant RS as Review Engine - participant DB as DB - R->>RS: Request next review annotation - RS->>DB: Build limited candidate set - loop iterate candidates - RS->>DB: select_for_update().only('id') on Task - alt Task already has review lock by R - RS->>DB: skip and continue - else Task available - RS->>DB: set_review_lock(R) - break - end - end - RS-->>R: Return task + annotation + metadata -``` - -## Enterprise Agreement Threshold Behavior (Auto path) - -When enabled and using the optimized flow: - -- If `only_finished_tasks` is set, candidate tasks can be filtered by: - - `agreement >= agreement_threshold` and `is_labeled = True`, OR - - `annotators >= overlap + max_additional_annotators_assignable` (to avoid repeatedly queueing tasks for more reviews). -- After filtering tasks, annotations are selected/ordered from that reduced set. - -## Feature Flags (non-exhaustive) - -- `ff_back_DEV_3374_review_query_160922_short`: Enables optimized review queries. -- `fflag_fix_back_dev_3668_review_stream_optimizaion_short`: Enables further optimized path (`new_review_optimized`). -- `ff_back_DEV_1711_review_queue_140222_short`: Enables updated-after-reject prioritization. -- `fflag_feat_all_leap_1081_reviewer_flow_updates`: Adjusts ordering to be strictly by task, id. - -## Settings That Influence Flow - -- Project `review_settings`: - - `review_criteria`: `ONE` or `ALL`. - - `only_finished_tasks`: bool. - - `review_only_manual_assignments`: bool. -- Assignments: Existence of manual assignments shifts priority. - -## Edge Cases and Fallbacks - -- If all candidate tasks are locked, return None and the client should retry. -- If manual assignments exist but none are eligible under the chosen criteria, auto path is used (unless manual-only is set). - -## Notes for Redesign - -- The review flow is a pipeline: scope -> manual-or-auto -> prioritization (updated-after-reject) -> policy filter (criteria) -> ordering/limits -> lock. This structure is amenable to a configurable pipeline with reusable stages. - - - From 2f8bd963246d43d284854ccf7c4d377d0334f704 Mon Sep 17 00:00:00 2001 From: makseq Date: Sun, 14 Sep 2025 00:15:13 +0100 Subject: [PATCH 13/26] Add icon. 
Add datetime fix --- .cursor/rules/storage-provider.mdc | 9 ++++++--- label_studio/io_storages/proxy_api.py | 7 ++++++- .../ui/src/assets/icons/cloud-provider-databricks.svg | 1 + web/libs/ui/src/assets/icons/index.ts | 1 + 4 files changed, 14 insertions(+), 4 deletions(-) create mode 100644 web/libs/ui/src/assets/icons/cloud-provider-databricks.svg diff --git a/.cursor/rules/storage-provider.mdc b/.cursor/rules/storage-provider.mdc index 31f97be508ae..193a711b0a8e 100644 --- a/.cursor/rules/storage-provider.mdc +++ b/.cursor/rules/storage-provider.mdc @@ -21,7 +21,7 @@ Follow all steps below to implement a new storage. More details follow after the ### 1. Exploration and preparation 1. [ ] Carefully read @io_storages/README.md 2. [ ] Search official documentation for the new storage you want to add - - [ ] Determine whether pre-signed URLs are supported, or whether only direct reads are possible + - [ ] Determine whether pre-signed URLs are supported, or whether only direct reads are possible. In case of direct reads, we should hide pre-signed URLs toggle and use Label Studio proxy. - [ ] Determine whether writes are supported, and how annotations will be stored (objects/blobs, files, rows/strings in a table, etc.) - [ ] Understand the provider's Python API/SDK, especially how to read, write, and list objects. If SDK is available, use SDK 3. If the requester hasn't specified the target edition, recommend Open Source or Enterprise and confirm the choice @@ -33,7 +33,7 @@ Follow all steps below to implement a new storage. More details follow after the ### 2. Backend Implementation 1. [ ] Implement storage mixin with common fields: - - [ ] Basic fields: bucket, prefix, regex_filter, use_blob_urls (pre-signed URLs on/off) + - [ ] Basic fields: bucket, prefix, regex_filter, use_blob_urls (pre-signed URLs on/off), recursive_scan (if applicable) - [ ] URL resolution: presign, presign_ttl (if applicable to the storage) - [ ] Provider credentials: api_key, secret_key, endpoint_url - [ ] Common methods: get_client(), validate_connection() @@ -62,6 +62,7 @@ Follow all steps below to implement a new storage. More details follow after the 11. [ ] If you use SDK: add provider SDK library to pyproject.toml - [ ] Make poetry lock: `poetry install && poetry lock` 12. [ ] Create database migrations using `poetry run python manage.py makemigrations` only! +13. [ ] Ensure that you correctly handle token and security fields; they should not be displayed on the frontend or backend after they are initially entered and saved. Verify how this works with other storage codes. ### 3. Frontend Implementation 1. [ ] Check examples: for Open Source see: `label-studio/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/`, for Enterprise see: `label-studio-enterprise/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/` @@ -81,6 +82,8 @@ Follow all steps below to implement a new storage. More details follow after the - [ ] Test object iteration and filtering (iter_objects) - [ ] Test task data loading (get_data) - [ ] Test frontend form functionality +- [ ] Test export annotations on Sync button click and when Submit button clicked (post save signal) +- [ ] Test delete exported annotation - [ ] Critical: run all created tests, check how to run them in @backend-unit-tests.mdc ### 5. Documentation @@ -88,7 +91,7 @@ Follow all steps below to implement a new storage. 
More details follow after the - [ ] Update API documentation using `@method_decorator` for storage API classes (see @updating-label-studio-sdk.mdc) ### 6. Git -- [ ] Commit all added and modified files related to the new storage into git +- [ ] Commit all added and modified files related to the new storage into git, use `git add ` and never use `git commit -a`. ### 7. Integration & Deployment These steps are for manual QA by the requester; remind them after you finish your work: diff --git a/label_studio/io_storages/proxy_api.py b/label_studio/io_storages/proxy_api.py index 0e7b12817204..432ab77dc275 100644 --- a/label_studio/io_storages/proxy_api.py +++ b/label_studio/io_storages/proxy_api.py @@ -199,7 +199,12 @@ def prepare_headers(self, response, metadata, request, project): if metadata.get('ContentRange'): response.headers['Content-Range'] = metadata['ContentRange'] if metadata.get('LastModified'): - response.headers['Last-Modified'] = metadata['LastModified'].strftime('%a, %d %b %Y %H:%M:%S GMT') + last_mod = metadata['LastModified'] + # Accept either datetime-like (has strftime) or preformatted string + if hasattr(last_mod, 'strftime'): + response.headers['Last-Modified'] = last_mod.strftime('%a, %d %b %Y %H:%M:%S GMT') + else: + response.headers['Last-Modified'] = str(last_mod) # Always enable range requests response.headers['Accept-Ranges'] = 'bytes' diff --git a/web/libs/ui/src/assets/icons/cloud-provider-databricks.svg b/web/libs/ui/src/assets/icons/cloud-provider-databricks.svg new file mode 100644 index 000000000000..41633407f177 --- /dev/null +++ b/web/libs/ui/src/assets/icons/cloud-provider-databricks.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/web/libs/ui/src/assets/icons/index.ts b/web/libs/ui/src/assets/icons/index.ts index 135e3b0200c8..ba5ae22e4deb 100644 --- a/web/libs/ui/src/assets/icons/index.ts +++ b/web/libs/ui/src/assets/icons/index.ts @@ -261,3 +261,4 @@ export { ReactComponent as IconCloudProviderS3 } from "./cloud-provider-s3.svg"; export { ReactComponent as IconCloudProviderRedis } from "./cloud-provider-redis.svg"; export { ReactComponent as IconCloudProviderGCS } from "./cloud-provider-gcs.svg"; export { ReactComponent as IconCloudProviderAzure } from "./cloud-provider-azure.svg"; +export { ReactComponent as IconDatabricks } from "./cloud-provider-databricks.svg"; From a2bd2606254f8faf92a50168f596de6c77b11b2c Mon Sep 17 00:00:00 2001 From: makseq Date: Sun, 14 Sep 2025 00:33:32 +0100 Subject: [PATCH 14/26] Fix name for icon --- web/libs/ui/src/assets/icons/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/libs/ui/src/assets/icons/index.ts b/web/libs/ui/src/assets/icons/index.ts index ba5ae22e4deb..2d22d9de089f 100644 --- a/web/libs/ui/src/assets/icons/index.ts +++ b/web/libs/ui/src/assets/icons/index.ts @@ -261,4 +261,4 @@ export { ReactComponent as IconCloudProviderS3 } from "./cloud-provider-s3.svg"; export { ReactComponent as IconCloudProviderRedis } from "./cloud-provider-redis.svg"; export { ReactComponent as IconCloudProviderGCS } from "./cloud-provider-gcs.svg"; export { ReactComponent as IconCloudProviderAzure } from "./cloud-provider-azure.svg"; -export { ReactComponent as IconDatabricks } from "./cloud-provider-databricks.svg"; +export { ReactComponent as IconCloudProviderDatabricks } from "./cloud-provider-databricks.svg"; From e7e03150b6b069be0a7f21760b1800daff6f0093 Mon Sep 17 00:00:00 2001 From: makseq Date: Sun, 14 Sep 2025 00:59:00 +0100 Subject: [PATCH 15/26] Add docs --- 
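Checklist item 13 in the storage-provider rule above (saved tokens must never be echoed back to the frontend or backend) is the kind of requirement usually met with write-only serializer fields. A minimal DRF sketch — the model and field names are assumptions, not the project's actual serializer:

```python
# Sketch: keep credentials write-only so they are accepted on create/update
# but never rendered in API responses (Django REST Framework).
from rest_framework import serializers

class DatabricksStorageSerializer(serializers.ModelSerializer):
    # write_only: the saved token is never serialized back to clients
    token = serializers.CharField(write_only=True, required=False)

    class Meta:
        model = DatabricksImportStorage  # assumed model name
        fields = ["id", "title", "catalog", "schema", "volume", "prefix", "token"]
```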
docs/source/guide/storage.md | 74 +++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/docs/source/guide/storage.md b/docs/source/guide/storage.md index 7627d744a3fa..b91a7ae6fe23 100644 --- a/docs/source/guide/storage.md +++ b/docs/source/guide/storage.md @@ -19,6 +19,8 @@ Set up the following cloud and other storage systems with Label Studio: - [Microsoft Azure Blob storage](#Microsoft-Azure-Blob-storage) - [Redis database](#Redis-database) - [Local storage](#Local-storage)
(for On-prem only)
+- [Databricks Files (UC Volumes)](#Databricks-Files-UC-Volumes) (for Enterprise only)
+ ## Troubleshooting @@ -43,6 +45,7 @@ For more troubleshooting information, see [Troubleshooting Import, Export, & Sto + ## How external storage connections and sync work You can add source storage connections to sync data from an external source to a Label Studio project, and add target storage connections to sync annotations from Label Studio to external storage. Each source and target storage setup is project-specific. You can connect multiple buckets, containers, databases, or directories as source or target storage for a project. @@ -1483,7 +1486,76 @@ You can also create a storage connection using the Label Studio API. If you're using Label Studio in Docker, you need to mount the local directory that you want to access as a volume when you start the Docker container. See [Run Label Studio on Docker and use local storage](https://labelstud.io/guide/start#Run-Label-Studio-on-Docker-and-use-Local-Storage). -### Troubleshooting cloud storage + + +## Databricks Files (UC Volumes) + +
+
+Connect Label Studio Enterprise to Databricks Unity Catalog (UC) Volumes to import files as tasks and export annotations as JSON back to your volumes. This connector uses the Databricks Files API and operates only in proxy mode (no presigned URLs are supported by Databricks).
+
+### Prerequisites
+- A Databricks workspace URL (Workspace Host), for example `https://adb-12345678901234.1.databricks.com` (or the corresponding Azure domain)
+- A Databricks Personal Access Token (PAT) with permission to access the Files API
+- A UC Volume path under `/Volumes/<catalog>/<schema>/<volume>` with files you want to label
+
+References:
+- Databricks workspace: https://docs.databricks.com/en/getting-started/index.html
+- Personal access tokens: https://docs.databricks.com/en/dev-tools/auth/pat.html
+- Unity Catalog and Volumes: https://docs.databricks.com/en/files/volumes.html
+
+### Set up connection in the Label Studio UI
+1. Open Label Studio → project → **Settings > Cloud Storage**.
+2. Click **Add Source Storage**. Select **Databricks Files (UC Volumes)**.
+3. Configure the connection:
+   - Workspace Host: your Databricks workspace base URL (no trailing slash)
+   - Access Token: your PAT
+   - Catalog / Schema / Volume: Unity Catalog coordinates
+   - Click **Next** to open Import Settings & Preview
+4. Import Settings & Preview:
+   - Bucket Prefix (optional): relative subpath under the volume (e.g., `images/train`)
+   - File Name Filter (optional): regex to filter files (e.g., `.*\.json$`)
+   - Scan all sub-folders: enable for recursive listing; disable to list only the current folder
+   - Click **Load preview** to verify files
+5. Click **Save** (or **Save & Sync**) to create the connection and sync tasks.
+
+### Target storage (export)
+1. Open **Settings > Cloud Storage** → **Add Target Storage** → **Databricks Files (UC Volumes)**.
+2. Use the same Workspace Host/Token and UC coordinates.
+3. Set an Export Prefix (e.g., `exports/${project_id}`).
+4. Click **Save** and then **Sync** to push annotations as JSON files to your volume.
+
+### Security and proxy streaming
+- This connector streams data through the Label Studio backend with HTTP Range support; no presigned URLs are generated.
+- Credentials are stored server-side; tokens are write-only in the API and never returned to the client.
+- Path traversal and catalog/schema/volume mismatches are rejected.
+
+### Troubleshooting
+- If listing returns zero files, verify the path under `/Volumes/<catalog>/<schema>/<volume>/<path>` and your PAT permissions.
+- Ensure the Workspace Host has no trailing slash and matches your workspace domain.
+- If previews work but media fails to load, confirm proxy mode is allowed for your organization and network egress allows Label Studio to reach Databricks.
+
+
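To make the proxy-only listing flow above concrete, here is a hedged sketch of enumerating a UC Volume directory with plain `requests` against the Databricks Files API; the host, token, and path below are placeholders, and the response shape follows the public Files API documentation:

```python
# Sketch: list a UC Volume directory via the Databricks Files API.
import requests

HOST = "https://adb-12345678901234.1.databricks.com"  # no trailing slash
TOKEN = "dapi-..."                                    # placeholder PAT
DIRECTORY = "/Volumes/main/default/dataset/images"    # placeholder path

resp = requests.get(
    f"{HOST}/api/2.0/fs/directories{DIRECTORY}",
    headers={"Authorization": f"Bearer {TOKEN}"},
)
resp.raise_for_status()
for entry in resp.json().get("contents", []):
    # entries carry path, is_directory, file_size, last_modified
    print(entry["path"], entry.get("file_size"))
```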
+ +
+
+### Use Databricks Files in Label Studio Enterprise
+
+Databricks Unity Catalog (UC) Volumes integration is available in Label Studio Enterprise. It lets you:
+
+- Import files directly from UC Volumes under `/Volumes/<catalog>/<schema>/<volume>`
+- Stream media securely via the platform proxy (no presigned URLs)
+- Export annotations back to your Databricks Volume as JSON
+
+Learn more and see the full setup guide in the Enterprise documentation: [Databricks Files (UC Volumes)](https://docs.humansignal.com/guide/storage#Databricks-Files-UC-Volumes).
+
+If your organization needs governed access to Databricks data with Unity Catalog and proxy streaming, consider [Label Studio Enterprise](https://humansignal.com/label-studio-enterprise/).
+
+
+ + + +## Troubleshooting cloud storage
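The export half described in the Target storage section above boils down to a PUT against the same Files API. A hedged companion sketch (placeholder host/token/paths; `overwrite=true` replaces an existing object), illustrative rather than the connector's actual code:

```python
# Sketch: write an annotation JSON into a UC Volume via the Files API.
import json
import requests

HOST = "https://adb-12345678901234.1.databricks.com"
TOKEN = "dapi-..."  # placeholder PAT
TARGET = "/Volumes/main/default/dataset/exports/annotation-1.json"

payload = {"task": 1, "result": []}  # toy annotation body
resp = requests.put(
    f"{HOST}/api/2.0/fs/files{TARGET}",
    params={"overwrite": "true"},
    headers={"Authorization": f"Bearer {TOKEN}"},
    data=json.dumps(payload).encode("utf-8"),
)
resp.raise_for_status()  # the Files API returns 204 No Content on success
```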
From 02893d8bb397926edb734315b742e59408c26f44 Mon Sep 17 00:00:00 2001 From: makseq Date: Sun, 14 Sep 2025 13:08:32 +0100 Subject: [PATCH 16/26] Fix docs --- docs/source/guide/storage.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/docs/source/guide/storage.md b/docs/source/guide/storage.md index b91a7ae6fe23..3bc9f8a96f3f 100644 --- a/docs/source/guide/storage.md +++ b/docs/source/guide/storage.md @@ -19,7 +19,7 @@ Set up the following cloud and other storage systems with Label Studio: - [Microsoft Azure Blob storage](#Microsoft-Azure-Blob-storage) - [Redis database](#Redis-database) - [Local storage](#Local-storage)
(for On-prem only)
-- [Databricks Files (UC Volumes)](#Databricks-Files-UC-Volumes) (for Enterprise only)
+- [Databricks Files (UC Volumes)](#Databricks-Files-UC-Volumes) ## Troubleshooting @@ -1525,15 +1525,13 @@ References: 3. Set an Export Prefix (e.g., `exports/${project_id}`). 4. Click **Save** and then **Sync** to push annotations as JSON files to your volume. -### Security and proxy streaming -- This connector streams data through the Label Studio backend with HTTP Range support; no presigned URLs are generated. -- Credentials are stored server-side; tokens are write-only in the API and never returned to the client. -- Path traversal and catalog/schema/volume mismatches are rejected. +!!! warning "Proxy and security" + This connector streams data **through the Label Studio backend** with HTTP Range support. Databricks does not support presigned URLs, so this option is also not available in Label Studio. -### Troubleshooting -- If listing returns zero files, verify the path under `/Volumes////` and your PAT permissions. -- Ensure the Workspace Host has no trailing slash and matches your workspace domain. -- If previews work but media fails to load, confirm proxy mode is allowed for your organization and network egress allows Label Studio to reach Databricks. +!!! note + - If listing returns zero files, verify the path under `/Volumes////` and your PAT permissions. + - Ensure the Workspace Host has no trailing slash and matches your workspace domain. + - If previews work but media fails to load, confirm proxy mode is allowed for your organization in Label Studio and network egress allows Label Studio to reach Databricks.
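The Range behavior that the warning above relies on can be exercised directly — a hedged sketch assuming the Files API download endpoint honors standard `Range` headers, with placeholder values throughout:

```python
# Sketch: ranged read of a volume file, mirroring what the proxy forwards.
import requests

HOST = "https://adb-12345678901234.1.databricks.com"
TOKEN = "dapi-..."  # placeholder PAT
FILE = "/Volumes/main/default/dataset/images/1.jpg"

resp = requests.get(
    f"{HOST}/api/2.0/fs/files{FILE}",
    headers={
        "Authorization": f"Bearer {TOKEN}",
        "Range": "bytes=0-1023",  # request only the first KiB
    },
)
# A server that honors the header answers 206 Partial Content.
print(resp.status_code, len(resp.content))
```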
From 0ad74ceb809f64e02ffa2374e42d5b758ed5a324 Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sun, 14 Sep 2025 13:36:38 +0000 Subject: [PATCH 17/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17711932861 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index fb033e7d7693..2d22ac26d583 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "c08c1db419731bf8fedea87cda575c23a9f764a2.zip", hash = "sha256:6da563dcf37fb71c92f6043a035b22d6b24d816a1cfffa16e9a7b64ef5ea4b24"}, + {file = "564ddc5d2d5ff26919f07cb65e2d1ab29c4b52b7.zip", hash = "sha256:743781731d853b705df6ed8adbb52e7b6cf22d444b0b36d1b51d825bd6086fd5"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/c08c1db419731bf8fedea87cda575c23a9f764a2.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/564ddc5d2d5ff26919f07cb65e2d1ab29c4b52b7.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "c152107325824bc9bd40076de20a31c0fa07161efa4f743c626fb75fb6eedc31" +content-hash = "996100c6b80f7fb6e14446ba90445df965e034df7a75f6ce46dcbe04f9424a6e" diff --git a/pyproject.toml b/pyproject.toml index 2386eb93732f..41544c325759 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/c08c1db419731bf8fedea87cda575c23a9f764a2.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/564ddc5d2d5ff26919f07cb65e2d1ab29c4b52b7.zip", ## HumanSignal repo dependencies :end ] From 7cc936b1ab1509c5a50a524b686090d45c6329fd Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sun, 14 Sep 2025 15:54:23 +0000 Subject: [PATCH 18/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17713381479 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2d22ac26d583..f561df806f7b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "564ddc5d2d5ff26919f07cb65e2d1ab29c4b52b7.zip", hash = "sha256:743781731d853b705df6ed8adbb52e7b6cf22d444b0b36d1b51d825bd6086fd5"}, + {file = "75991748869f6970e7a2f2d2c1081d6c030c8772.zip", hash = "sha256:0f4b336e3eafaaada07a3fb0fae27f1e3de3ded8392ad69dc38f7509dbac14da"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/564ddc5d2d5ff26919f07cb65e2d1ab29c4b52b7.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/75991748869f6970e7a2f2d2c1081d6c030c8772.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "996100c6b80f7fb6e14446ba90445df965e034df7a75f6ce46dcbe04f9424a6e" +content-hash = "5439624ba7fa392fe9f38c6711df622419b0a9eb919940ca54d100b454394458" 
diff --git a/pyproject.toml b/pyproject.toml index 41544c325759..0ad8a7f2ecc6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/564ddc5d2d5ff26919f07cb65e2d1ab29c4b52b7.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/75991748869f6970e7a2f2d2c1081d6c030c8772.zip", ## HumanSignal repo dependencies :end ] From 07801b7abf6dc488122036325db8735ab096b77b Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sun, 14 Sep 2025 15:57:41 +0000 Subject: [PATCH 19/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17713411292 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index f561df806f7b..9570ebdb2109 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "75991748869f6970e7a2f2d2c1081d6c030c8772.zip", hash = "sha256:0f4b336e3eafaaada07a3fb0fae27f1e3de3ded8392ad69dc38f7509dbac14da"}, + {file = "13b2362044211599a0c026c2dd4bbdae9cc95ae2.zip", hash = "sha256:debcf96b6e92f01acd43174cff38d9ef39e20b31f70be7e8c21978119243b545"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/75991748869f6970e7a2f2d2c1081d6c030c8772.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/13b2362044211599a0c026c2dd4bbdae9cc95ae2.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "5439624ba7fa392fe9f38c6711df622419b0a9eb919940ca54d100b454394458" +content-hash = "786552c264d61d7bb72e838cdbe1b5e81061d815ad1264e896855495bf0a131f" diff --git a/pyproject.toml b/pyproject.toml index 0ad8a7f2ecc6..c0cbf2aada63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/75991748869f6970e7a2f2d2c1081d6c030c8772.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/13b2362044211599a0c026c2dd4bbdae9cc95ae2.zip", ## HumanSignal repo dependencies :end ] From ea2335867458b72e973b4b1b20ae06808119b227 Mon Sep 17 00:00:00 2001 From: makseq Date: Sun, 14 Sep 2025 19:33:29 +0100 Subject: [PATCH 20/26] Add path in docs --- docs/source/guide/storage.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/source/guide/storage.md b/docs/source/guide/storage.md index 3bc9f8a96f3f..f5486f63b1f1 100644 --- a/docs/source/guide/storage.md +++ b/docs/source/guide/storage.md @@ -1525,14 +1525,28 @@ References: 3. Set an Export Prefix (e.g., `exports/${project_id}`). 4. Click **Save** and then **Sync** to push annotations as JSON files to your volume. -!!! warning "Proxy and security" - This connector streams data **through the Label Studio backend** with HTTP Range support. Databricks does not support presigned URLs, so this option is also not available in Label Studio. +!!! 
note "URI schema" + To reference Databricks files directly in task JSON (without using an Import Storage), use Label Studio’s Databricks URI scheme: + + `dbx://Volumes////` + + Example: + + ``` + { "image": "dbx://Volumes/main/default/dataset/images/1.jpg" } + ``` -!!! note + +!!! note "Troubleshooting" - If listing returns zero files, verify the path under `/Volumes////` and your PAT permissions. - Ensure the Workspace Host has no trailing slash and matches your workspace domain. - If previews work but media fails to load, confirm proxy mode is allowed for your organization in Label Studio and network egress allows Label Studio to reach Databricks. + +!!! warning "Proxy and security" + This connector streams data **through the Label Studio backend** with HTTP Range support. Databricks does not support presigned URLs, so this option is also not available in Label Studio. + +
From 4587772219f97d96ebcf5c1c4da56acff5a47571 Mon Sep 17 00:00:00 2001 From: makseq Date: Sun, 14 Sep 2025 20:46:35 +0100 Subject: [PATCH 21/26] Fix limits for file previews --- docs/source/guide/storage.md | 4 +--- label_studio/io_storages/api.py | 2 +- .../src/blocks/StorageProviderForm/hooks/useStorageApi.ts | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/source/guide/storage.md b/docs/source/guide/storage.md index f5486f63b1f1..b18d8877234c 100644 --- a/docs/source/guide/storage.md +++ b/docs/source/guide/storage.md @@ -1559,9 +1559,7 @@ Databricks Unity Catalog (UC) Volumes integration is available in Label Studio E - Stream media securely via the platform proxy (no presigned URLs) - Export annotations back to your Databricks Volume as JSON -Learn more and see the full setup guide in the Enterprise documentation: [Databricks Files (UC Volumes)](https://docs.humansignal.com/guide/storage#Databricks-Files-UC-Volumes). - -If your organization needs governed access to Databricks data with Unity Catalog and proxy streaming, consider [Label Studio Enterprise](https://humansignal.com/label-studio-enterprise/). +Learn more and see the full setup guide in the Enterprise documentation: [Databricks Files (UC Volumes)](https://docs.humansignal.com/guide/storage#Databricks-Files-UC-Volumes). If your organization needs governed access to Databricks data with Unity Catalog, consider [Label Studio Enterprise](https://humansignal.com/).
diff --git a/label_studio/io_storages/api.py b/label_studio/io_storages/api.py index 09b780a9bda3..6fc6c786417d 100644 --- a/label_studio/io_storages/api.py +++ b/label_studio/io_storages/api.py @@ -174,7 +174,7 @@ def create(self, request, *args, **kwargs): from .functions import validate_storage_instance instance = validate_storage_instance(request, self.serializer_class) - limit = request.data.get('limit', settings.DEFAULT_STORAGE_LIST_LIMIT) + limit = int(request.GET.get('limit', settings.DEFAULT_STORAGE_LIST_LIMIT)) try: files = [] diff --git a/web/libs/app-common/src/blocks/StorageProviderForm/hooks/useStorageApi.ts b/web/libs/app-common/src/blocks/StorageProviderForm/hooks/useStorageApi.ts index e6c9418e5356..79d4d9eebfa1 100644 --- a/web/libs/app-common/src/blocks/StorageProviderForm/hooks/useStorageApi.ts +++ b/web/libs/app-common/src/blocks/StorageProviderForm/hooks/useStorageApi.ts @@ -191,7 +191,7 @@ export const useStorageApi = ({ target, storage, project, onSubmit, onClose }: U return api.callApi<{ files: any[] }>("storageFiles", { params: { - limit: 10, + limit: 30, target, type: previewData.provider, }, From dc720438ee984d8b74ddb14bfe35cdb378a64225 Mon Sep 17 00:00:00 2001 From: makseq Date: Sun, 14 Sep 2025 23:20:05 +0100 Subject: [PATCH 22/26] Fix limit again --- label_studio/io_storages/api.py | 2 +- .../src/blocks/StorageProviderForm/hooks/useStorageApi.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/label_studio/io_storages/api.py b/label_studio/io_storages/api.py index 6fc6c786417d..c45b34171edc 100644 --- a/label_studio/io_storages/api.py +++ b/label_studio/io_storages/api.py @@ -174,7 +174,7 @@ def create(self, request, *args, **kwargs): from .functions import validate_storage_instance instance = validate_storage_instance(request, self.serializer_class) - limit = int(request.GET.get('limit', settings.DEFAULT_STORAGE_LIST_LIMIT)) + limit = int(request.data.get('limit', settings.DEFAULT_STORAGE_LIST_LIMIT)) try: files = [] diff --git a/web/libs/app-common/src/blocks/StorageProviderForm/hooks/useStorageApi.ts b/web/libs/app-common/src/blocks/StorageProviderForm/hooks/useStorageApi.ts index 79d4d9eebfa1..08c4ef7bdc31 100644 --- a/web/libs/app-common/src/blocks/StorageProviderForm/hooks/useStorageApi.ts +++ b/web/libs/app-common/src/blocks/StorageProviderForm/hooks/useStorageApi.ts @@ -187,11 +187,11 @@ export const useStorageApi = ({ target, storage, project, onSubmit, onClose }: U if (isDefined(storage?.id)) { body.id = storage.id; + body.limit = 30; } return api.callApi<{ files: any[] }>("storageFiles", { params: { - limit: 30, target, type: previewData.provider, }, From 8ca7d22b481ff594c18d2bd1acf111e2d299249b Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sun, 14 Sep 2025 22:32:26 +0000 Subject: [PATCH 23/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17717501145 From 835a379924b13e4f362c508c354b71455a1f821e Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sun, 14 Sep 2025 22:48:11 +0000 Subject: [PATCH 24/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17717656097 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9570ebdb2109..bc7022900fd5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = 
"13b2362044211599a0c026c2dd4bbdae9cc95ae2.zip", hash = "sha256:debcf96b6e92f01acd43174cff38d9ef39e20b31f70be7e8c21978119243b545"}, + {file = "b1e30f1bcdb3592a631a793b032ef9fd814e9618.zip", hash = "sha256:aaab25beb777511ea41a3e761e46c0fba3768506e7a50dca51150f86e49091e2"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/13b2362044211599a0c026c2dd4bbdae9cc95ae2.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/b1e30f1bcdb3592a631a793b032ef9fd814e9618.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "786552c264d61d7bb72e838cdbe1b5e81061d815ad1264e896855495bf0a131f" +content-hash = "6c2df350ecaf0c14d9a7ae8a38620fdc6397acc223805c4a5fcdfb8884614d33" diff --git a/pyproject.toml b/pyproject.toml index c0cbf2aada63..d9a6552d6335 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/13b2362044211599a0c026c2dd4bbdae9cc95ae2.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/b1e30f1bcdb3592a631a793b032ef9fd814e9618.zip", ## HumanSignal repo dependencies :end ] From 3858a8c94c2a49651cb45dab9e85b061ece79d14 Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sun, 14 Sep 2025 22:58:17 +0000 Subject: [PATCH 25/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17717740567 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index bc7022900fd5..12fe56500216 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "b1e30f1bcdb3592a631a793b032ef9fd814e9618.zip", hash = "sha256:aaab25beb777511ea41a3e761e46c0fba3768506e7a50dca51150f86e49091e2"}, + {file = "94e1c419d41f0c307360410e5cc7566135342934.zip", hash = "sha256:bbb71b32776388678dd58deecb9c26084704b5d9c645eaa6bb44194fb5ba3ffd"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/b1e30f1bcdb3592a631a793b032ef9fd814e9618.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/94e1c419d41f0c307360410e5cc7566135342934.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "6c2df350ecaf0c14d9a7ae8a38620fdc6397acc223805c4a5fcdfb8884614d33" +content-hash = "2397b773c1bf290209ecdb00ab9014a0da770605209411b997ceb3710515e17b" diff --git a/pyproject.toml b/pyproject.toml index d9a6552d6335..3f393dbcb003 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/b1e30f1bcdb3592a631a793b032ef9fd814e9618.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/94e1c419d41f0c307360410e5cc7566135342934.zip", ## HumanSignal repo dependencies :end ] From 
3465f494bc72419c29bf8725035d3a4c1a01a3f8 Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Sun, 14 Sep 2025 23:05:05 +0000 Subject: [PATCH 26/26] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/17717804535 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 12fe56500216..5194a477f4c8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2136,7 +2136,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "94e1c419d41f0c307360410e5cc7566135342934.zip", hash = "sha256:bbb71b32776388678dd58deecb9c26084704b5d9c645eaa6bb44194fb5ba3ffd"}, + {file = "77b0c0abd2847c914096e6054b6f1b1805ee1b7a.zip", hash = "sha256:a29f9b9db793edfb97cbf384a2c9aac780656ca37893c998e412577f4e9277ad"}, ] [package.dependencies] @@ -2164,7 +2164,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/94e1c419d41f0c307360410e5cc7566135342934.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/77b0c0abd2847c914096e6054b6f1b1805ee1b7a.zip" [[package]] name = "launchdarkly-server-sdk" @@ -5109,4 +5109,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "2397b773c1bf290209ecdb00ab9014a0da770605209411b997ceb3710515e17b" +content-hash = "27d1096f39b9864ce20797fcaabcdf470de9b87aedf0b2bfe0b7b109096320a0" diff --git a/pyproject.toml b/pyproject.toml index 3f393dbcb003..cb1a83035a58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ dependencies = [ "tldextract (>=5.1.3)", "uuid-utils (>=0.11.0,<1.0.0)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/94e1c419d41f0c307360410e5cc7566135342934.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/77b0c0abd2847c914096e6054b6f1b1805ee1b7a.zip", ## HumanSignal repo dependencies :end ]