
Commit 7a8c5ee

Merge branch 'main' into ilongin/1519-flush-insert-buffer-based-on-time
2 parents: b30110e + 0007b0d

8 files changed (+244, -34 lines)


docs/commands/job/run.md

Lines changed: 3 additions & 1 deletion
@@ -14,7 +14,7 @@ usage: datachain job run [-h] [-v] [-q] [--team TEAM] [--env-file ENV_FILE]
                          [--req-file REQ_FILE] [--req REQ [REQ ...]]
                          [--priority PRIORITY]
                          [--start-time START_TIME] [--cron CRON]
-                         [--no-wait]
+                         [--no-wait] [--ignore-checkpoints]
                          file
 ```

@@ -43,6 +43,7 @@ This command runs a job in Studio using the specified query file. You can config
 * `--start-time START_TIME` - Time to schedule the task in YYYY-MM-DDTHH:mm format or natural language.
 * `--cron CRON` - Cron expression for the cron task.
 * `--no-wait` - Do not wait for the job to finish.
+* `--ignore-checkpoints` - Ignore existing checkpoints and run from scratch.
 * `-h`, `--help` - Show the help message and exit.
 * `-v`, `--verbose` - Be verbose.
 * `-q`, `--quiet` - Be quiet.

@@ -156,6 +157,7 @@ datachain job run query.py --no-wait
 
 ## Notes
 
+* **Checkpoints**: Running the same script multiple times via `datachain job run` automatically links jobs together, enabling checkpoint reuse. If a previous run of the same script (by absolute path) exists, DataChain will resume from where it left off.
 * Closing the logs command (e.g., with Ctrl+C) will only stop displaying the logs but will not cancel the job execution
 * To cancel a running job, use the `datachain job cancel` command
 * The job will continue running in Studio even after you stop viewing the logs

docs/guide/checkpoints.md

Lines changed: 13 additions & 2 deletions
@@ -20,13 +20,24 @@ This means that if your script creates multiple datasets and fails partway throu
 
 ### Studio Runs
 
-When running jobs on Studio, the checkpoint workflow is managed through the UI:
+#### Using `datachain job run` CLI
+
+When you run `datachain job run my_script.py`, DataChain automatically:
+
+1. **Links jobs** by finding previous runs of the same script (by absolute path) that were also executed in Studio
+2. **Passes checkpoint context** to Studio, enabling checkpoint reuse across runs
+
+This means running the same script multiple times via `datachain job run` will automatically benefit from checkpoints without any additional configuration.
+
+#### Using Studio UI
+
+When triggering jobs through the Studio interface:
 
 1. **Job execution** is triggered using the Run button in the Studio interface
 2. **Checkpoint control** is explicit - you choose between:
    - **Run from scratch**: Ignores any existing checkpoints and recreates all datasets
    - **Continue from last checkpoint**: Resumes from the last successful checkpoint, skipping already-completed stages
-3. **Parent-child job linking** is handled automatically by the system - no need for script path matching or job name conventions
+3. **Parent-child job linking** is handled automatically by the system
 4. **Checkpoint behavior** during execution is the same as local runs: datasets are saved at each `.save()` call and can be reused on retry
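The "links jobs" step in the new docs maps directly onto the `get_last_job_by_name(..., is_remote_execution=True)` lookup added below in `src/datachain/studio.py` and `metastore.py`. A minimal sketch of that idea with an in-memory stand-in (the `JobRecord` class and `find_rerun_parent` helper are illustrative, not DataChain APIs):

```python
import os
from dataclasses import dataclass


@dataclass
class JobRecord:  # hypothetical stand-in for datachain.job.Job
    id: str
    name: str  # absolute script path for CLI-submitted jobs
    is_remote_execution: bool


def find_rerun_parent(jobs: list[JobRecord], script: str) -> str | None:
    """Return the id of the newest remote job with the same absolute path."""
    path = os.path.abspath(script)
    # Jobs are assumed newest-last here; the real query orders by created_at DESC.
    for job in reversed(jobs):
        if job.name == path and job.is_remote_execution:
            return job.id
    return None


history = [
    JobRecord("j1", os.path.abspath("my_script.py"), is_remote_execution=True),
    JobRecord("j2", os.path.abspath("other.py"), is_remote_execution=True),
]
assert find_rerun_parent(history, "my_script.py") == "j1"
```

Filtering on `is_remote_execution` keeps local runs of the same script from being picked up as Studio parents.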

src/datachain/cli/parser/job.py

Lines changed: 5 additions & 0 deletions
@@ -122,6 +122,11 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         action="store_true",
         help="Do not wait for the job to finish",
     )
+    studio_run_parser.add_argument(
+        "--ignore-checkpoints",
+        action="store_true",
+        help="Ignore existing checkpoints and run from scratch",
+    )
 
     studio_ls_help = "List jobs in Studio"
     studio_ls_description = "List jobs in Studio."
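The new flag is plain argparse; a self-contained sketch of the same `store_true` pattern (the parser built here is illustrative, not DataChain's actual parser tree):

```python
import argparse

parser = argparse.ArgumentParser(prog="datachain job run")
parser.add_argument(
    "--ignore-checkpoints",
    action="store_true",
    help="Ignore existing checkpoints and run from scratch",
)

# argparse converts the dashed flag to an underscored attribute, which is
# how studio.py later reads it as args.ignore_checkpoints.
args = parser.parse_args(["--ignore-checkpoints"])
assert args.ignore_checkpoints is True
assert parser.parse_args([]).ignore_checkpoints is False
```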

src/datachain/data_storage/metastore.py

Lines changed: 33 additions & 18 deletions
@@ -474,6 +474,8 @@ def create_job(
         parent_job_id: str | None = None,
         rerun_from_job_id: str | None = None,
         run_group_id: str | None = None,
+        is_remote_execution: bool = False,
+        job_id: str | None = None,
     ) -> str:
         """
         Creates a new job.

@@ -511,7 +513,9 @@ def get_job_status(self, job_id: str) -> JobStatus | None:
         """Returns the status of the given job."""
 
     @abstractmethod
-    def get_last_job_by_name(self, name: str, conn=None) -> "Job | None":
+    def get_last_job_by_name(
+        self, name: str, is_remote_execution: bool = False, conn=None
+    ) -> "Job | None":
         """Returns the last job with the given name, ordered by created_at."""
 
     #

@@ -1877,6 +1881,7 @@ def _jobs_columns() -> "list[SchemaItem]":
         Column("parent_job_id", Text, nullable=True),
         Column("rerun_from_job_id", Text, nullable=True),
         Column("run_group_id", Text, nullable=True),
+        Column("is_remote_execution", Boolean, nullable=False, default=False),
         Index("idx_jobs_parent_job_id", "parent_job_id"),
         Index("idx_jobs_rerun_from_job_id", "rerun_from_job_id"),
         Index("idx_jobs_run_group_id", "run_group_id"),

@@ -1918,10 +1923,13 @@ def list_jobs_by_ids(self, ids: list[str], conn=None) -> Iterator["Job"]:
         query = self._jobs_query().where(self._jobs.c.id.in_(ids))
         yield from self._parse_jobs(self.db.execute(query, conn=conn))
 
-    def get_last_job_by_name(self, name: str, conn=None) -> "Job | None":
+    def get_last_job_by_name(
+        self, name: str, is_remote_execution: bool = False, conn=None
+    ) -> "Job | None":
         query = (
             self._jobs_query()
             .where(self._jobs.c.name == name)
+            .where(self._jobs.c.is_remote_execution == is_remote_execution)
             .order_by(self._jobs.c.created_at.desc())
             .limit(1)
         )

@@ -1942,29 +1950,35 @@ def create_job(
         parent_job_id: str | None = None,
         rerun_from_job_id: str | None = None,
         run_group_id: str | None = None,
+        is_remote_execution: bool = False,
+        job_id: str | None = None,
         conn: Any = None,
     ) -> str:
         """
         Creates a new job.
         Returns the job id.
+
+        Args:
+            job_id: If provided, uses this ID instead of generating a new one.
+                Used for saving Studio jobs locally with their original IDs.
         """
-        job_id = str(uuid4())
-
-        # Validate run_group_id and rerun_from_job_id consistency
-        if rerun_from_job_id:
-            # Rerun job: run_group_id should be provided by caller
-            # If run_group_id is None, parent is a legacy job without run_group_id
-            # In this case, treat current job as first job in a new chain
-            # and break the link to the legacy parent
-            if run_group_id is None:
+        if job_id is None:
+            job_id = str(uuid4())
+            # Validate run_group_id and rerun_from_job_id consistency for local jobs
+            if rerun_from_job_id:
+                # Rerun job: run_group_id should be provided by caller
+                # If run_group_id is None, parent is a legacy job without run_group_id
+                # In this case, treat current job as first job in a new chain
+                # and break the link to the legacy parent
+                if run_group_id is None:
+                    run_group_id = job_id
+                    rerun_from_job_id = None
+            else:
+                assert run_group_id is None, (
+                    "run_group_id should not be provided when rerun_from_job_id"
+                    " is not set"
+                )
                 run_group_id = job_id
-                rerun_from_job_id = None
-        else:
-            # First job: run_group_id should not be provided (we set it here)
-            assert run_group_id is None, (
-                "run_group_id should not be provided when rerun_from_job_id is not set"
-            )
-            run_group_id = job_id
 
         self.db.execute(
             self._jobs_insert().values(

@@ -1983,6 +1997,7 @@ def create_job(
                 parent_job_id=parent_job_id,
                 rerun_from_job_id=rerun_from_job_id,
                 run_group_id=run_group_id,
+                is_remote_execution=is_remote_execution,
             ),
             conn=conn,
         )
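The reshuffled branching is easier to follow in isolation. A sketch of the same ID resolution, stripped of the SQLAlchemy insert (the `resolve_job_ids` helper is hypothetical):

```python
from uuid import uuid4


def resolve_job_ids(
    rerun_from_job_id: str | None = None,
    run_group_id: str | None = None,
    job_id: str | None = None,
) -> tuple[str, str | None, str | None]:
    """Returns (job_id, rerun_from_job_id, run_group_id), mirroring create_job."""
    if job_id is not None:
        # Studio-originated job: keep the caller's IDs verbatim.
        return job_id, rerun_from_job_id, run_group_id
    job_id = str(uuid4())
    if rerun_from_job_id:
        if run_group_id is None:
            # Legacy parent without a group: start a fresh chain, break the link.
            return job_id, None, job_id
        return job_id, rerun_from_job_id, run_group_id
    assert run_group_id is None, "run_group_id requires rerun_from_job_id"
    # First job in a chain: the group is keyed by the job's own ID.
    return job_id, None, job_id


# First run starts its own group...
jid, rerun, group = resolve_job_ids()
assert rerun is None and group == jid
# ...and a rerun of a legacy (group-less) parent breaks the link.
jid2, rerun2, group2 = resolve_job_ids(rerun_from_job_id=jid)
assert rerun2 is None and group2 == jid2
```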

src/datachain/job.py

Lines changed: 4 additions & 1 deletion
@@ -26,9 +26,10 @@ class Job:
     parent_job_id: str | None = None
     rerun_from_job_id: str | None = None
     run_group_id: str | None = None
+    is_remote_execution: bool = False
 
     @classmethod
-    def parse(
+    def parse(  # noqa: PLR0913
         cls,
         id: str | uuid.UUID,
         name: str,

@@ -46,6 +47,7 @@ def parse(
         parent_job_id: str | None,
         rerun_from_job_id: str | None,
         run_group_id: str | None,
+        is_remote_execution: bool = False,
     ) -> "Job":
         return cls(
             str(id),

@@ -64,4 +66,5 @@ def parse(
             str(parent_job_id) if parent_job_id else None,
             str(rerun_from_job_id) if rerun_from_job_id else None,
             str(run_group_id) if run_group_id else None,
+            is_remote_execution,
         )
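Appending `is_remote_execution` as the last field with a default keeps existing `parse` call sites working unchanged; a trimmed sketch of that backward-compatibility pattern (fields reduced for brevity):

```python
from dataclasses import dataclass


@dataclass
class Job:  # heavily trimmed; the real class carries many more fields
    id: str
    name: str
    run_group_id: str | None = None
    is_remote_execution: bool = False  # new trailing field, defaulted

    @classmethod
    def parse(cls, id, name, run_group_id, is_remote_execution=False) -> "Job":
        return cls(str(id), name, run_group_id, is_remote_execution)


# Old-style callers that omit the flag still parse, defaulting to a local job.
assert Job.parse(1, "job", None).is_remote_execution is False
assert Job.parse(2, "job", None, True).is_remote_execution is True
```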

src/datachain/remote/studio.py

Lines changed: 4 additions & 0 deletions
@@ -452,6 +452,8 @@ def create_job(
         environment: str | None = None,
         workers: int | None = None,
         query_name: str | None = None,
+        rerun_from_job_id: str | None = None,
+        reset: bool = False,
         files: list[str] | None = None,
         python_version: str | None = None,
         requirements: str | None = None,

@@ -468,6 +470,8 @@ def create_job(
             "environment": environment,
             "workers": workers,
             "query_name": query_name,
+            "rerun_from_job_id": rerun_from_job_id,
+            "reset": reset,
             "files": files,
             "python_version": python_version,
             "requirements": requirements,

src/datachain/studio.py

Lines changed: 40 additions & 3 deletions
@@ -9,8 +9,9 @@
 import requests
 import tabulate
 
+from datachain.catalog import get_catalog
 from datachain.config import Config, ConfigLevel
-from datachain.data_storage.job import JobStatus
+from datachain.data_storage.job import JobQueryType, JobStatus
 from datachain.dataset import (
     QUERY_DATASET_PREFIX,
     parse_dataset_name,

@@ -58,6 +59,7 @@ def process_jobs_args(args: "Namespace"):
             args.cron,
             args.no_wait,
             args.credentials_name,
+            args.ignore_checkpoints,
         )
 
     if args.cmd == "cancel":

@@ -422,7 +424,7 @@ async def _run():
     return exit_code_by_status.get(final_status.upper(), 0) if final_status else 0
 
 
-def create_job(
+def create_job(  # noqa: PLR0913
     query_file: str,
     team_name: str | None,
     env_file: str | None = None,

@@ -439,7 +441,10 @@ def create_job(
     cron: str | None = None,
     no_wait: bool | None = False,
     credentials_name: str | None = None,
+    ignore_checkpoints: bool = False,
 ):
+    catalog = get_catalog()
+
     query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
     with open(query_file) as f:
         query = f.read()

@@ -455,6 +460,15 @@ def create_job(
         with open(req_file) as f:
             requirements = f.read() + "\n" + requirements
 
+    script_path = os.path.abspath(query_file)
+
+    rerun_from_job_id = None
+    rerun_from_job = catalog.metastore.get_last_job_by_name(
+        script_path, is_remote_execution=True
+    )
+    if rerun_from_job:
+        rerun_from_job_id = rerun_from_job.id
+
     client = StudioClient(team=team_name)
     file_ids = upload_files(client, files) if files else []

@@ -469,6 +483,8 @@ def create_job(
         environment=environment,
         workers=workers,
         query_name=os.path.basename(query_file),
+        rerun_from_job_id=rerun_from_job_id,
+        reset=ignore_checkpoints,
         files=file_ids,
         python_version=python_version,
         repository=repository,

@@ -486,13 +502,34 @@ def create_job(
         raise DataChainError("Failed to create job")
 
     job_id = response.data.get("id")
+    job_data = response.data
+
+    query_type_value = (
+        JobQueryType.PYTHON if query_type == "PYTHON" else JobQueryType.SHELL
+    )
+    catalog.metastore.create_job(
+        name=script_path,  # Use local script path, not Studio's query_name
+        query=query,
+        query_type=query_type_value,
+        status=JobStatus.CREATED,
+        workers=job_data.get("workers", 0),
+        python_version=job_data.get("python_version"),
+        params=job_data.get("params", {}),
+        parent_job_id=job_data.get("parent_job_id"),
+        rerun_from_job_id=job_data.get("rerun_from_job_id"),
+        run_group_id=job_data.get("run_group_id"),
+        is_remote_execution=True,
+        job_id=str(job_id),  # Use Studio's job ID
+    )
+
+    catalog.close()
 
     if parsed_start_time or cron:
         print(f"Job {job_id} is scheduled as a task in Studio.")
         return 0
 
     print(f"Job {job_id} created")
-    print("Open the job in Studio at", response.data.get("url"))
+    print("Open the job in Studio at", job_data.get("url"))
     print("=" * 40)
 
     return 0 if no_wait else show_logs_from_client(client, job_id)
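Taken together, `create_job` now runs three steps: look up the previous remote run of the script, submit with that checkpoint context, and mirror Studio's job record locally under Studio's own ID so the next run can find it. A condensed, runnable sketch with in-memory stand-ins (`FakeMetastore` and `submit_to_studio` are illustrative, not real APIs):

```python
import os
from uuid import uuid4


class FakeMetastore:
    """In-memory stand-in for catalog.metastore (illustrative only)."""

    def __init__(self):
        self.jobs: list[dict] = []

    def get_last_job_by_name(self, name, is_remote_execution=False):
        matches = [
            j for j in self.jobs
            if j["name"] == name and j["is_remote_execution"] == is_remote_execution
        ]
        return matches[-1] if matches else None  # jobs are appended newest-last

    def create_job(self, **job):
        self.jobs.append(job)


def submit_to_studio(rerun_from_job_id, reset):
    # Stand-in for StudioClient.create_job; Studio assigns the job ID.
    return {"id": str(uuid4()), "rerun_from_job_id": rerun_from_job_id, "reset": reset}


def run_job(metastore, query_file, ignore_checkpoints=False):
    script_path = os.path.abspath(query_file)
    # 1. Find the previous Studio run of this exact script, if any.
    parent = metastore.get_last_job_by_name(script_path, is_remote_execution=True)
    # 2. Submit with the checkpoint context; reset=True skips checkpoint reuse.
    job = submit_to_studio(parent["id"] if parent else None, ignore_checkpoints)
    # 3. Mirror the job locally under Studio's ID so the next run finds it.
    metastore.create_job(name=script_path, is_remote_execution=True, id=job["id"])
    return job


ms = FakeMetastore()
first = run_job(ms, "my_script.py")
second = run_job(ms, "my_script.py")
assert second["rerun_from_job_id"] == first["id"]  # consecutive runs are chained
```

Storing the local mirror under Studio's job ID rather than a fresh UUID is what makes step 1 of the next invocation resolve to a job Studio actually knows about.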
