Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 63 additions & 17 deletions api/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,22 @@

import json
import logging
import re
import threading
import time
import uuid
from pathlib import Path
from flask import Blueprint, jsonify, request, send_file

def _uppercase_track_in_instruction(instruction):
"""Uppercase TRACK_NAME in 'Generate the X track ...' to match ACE-Step (cli.py _default_instruction_for_task)."""
if not instruction or " track " not in instruction:
return instruction
m = re.search(r"(\bthe\s+)(\w+)(\s+track\b)", instruction, re.IGNORECASE)
if m:
return instruction[: m.start(2)] + m.group(2).upper() + instruction[m.end(2) :]
return instruction

from cdmf_paths import get_output_dir, get_user_data_dir, load_config
from cdmf_tracks import list_lora_adapters, load_track_meta, save_track_meta
from cdmf_generation_job import GenerationCancelled
Expand Down Expand Up @@ -122,22 +132,32 @@ def _run_generation(job_id: str) -> None:
if task not in allowed_tasks:
task = "text2music"
# Single style/caption field drives all text conditioning (ACE-Step caption).
# Simple mode: songDescription. Advanced mode: style. Both can have key, time sig, vocal language.
prompt = (params.get("style") or "").strip() if custom_mode else (params.get("songDescription") or "").strip()
# Simple mode: songDescription. Advanced mode: style. Lego/extract/complete: instruction + caption only (no metas; source sets context).
if task in ("lego", "extract", "complete"):
instruction = (params.get("instruction") or "").strip()
caption = (params.get("style") or "").strip()
if not instruction and not caption:
instruction = "Generate an instrument track based on the audio context:"
prompt = None # built below after we have duration/bpm/metas
else:
instruction = None
caption = None
prompt = (params.get("style") or "").strip() if custom_mode else (params.get("songDescription") or "").strip()
key_scale = (params.get("keyScale") or "").strip()
time_sig = (params.get("timeSignature") or "").strip()
vocal_lang = (params.get("vocalLanguage") or "").strip().lower()
extra_bits = []
if key_scale:
extra_bits.append(f"key {key_scale}")
if time_sig:
extra_bits.append(f"time signature {time_sig}")
if vocal_lang and vocal_lang not in ("unknown", ""):
extra_bits.append(f"vocal language {vocal_lang}")
if extra_bits:
prompt = f"{prompt}, {', '.join(extra_bits)}" if prompt else ", ".join(extra_bits)
# When user explicitly chose English, reinforce in caption so model conditions on it
if vocal_lang == "en" and prompt:
if task != "lego":
if key_scale:
extra_bits.append(f"key {key_scale}")
if time_sig:
extra_bits.append(f"time signature {time_sig}")
if vocal_lang and vocal_lang not in ("unknown", ""):
extra_bits.append(f"vocal language {vocal_lang}")
if extra_bits:
prompt = f"{prompt}, {', '.join(extra_bits)}" if prompt else ", ".join(extra_bits)
# When user explicitly chose English, reinforce in caption so model conditions on it (skip for lego)
if task != "lego" and vocal_lang == "en" and prompt:
if not prompt.lower().startswith("english"):
prompt = f"English vocals, {prompt}"
if not prompt:
Expand Down Expand Up @@ -190,21 +210,32 @@ def _run_generation(job_id: str) -> None:
bpm = None
except (TypeError, ValueError):
bpm = None
# Lego/extract/complete: instruction (uppercase track) + caption only. No metas — BPM/key/time signature
# should match the input backing; passing them would be for cover/target-style mode.
if task in ("lego", "extract", "complete"):
instruction = _uppercase_track_in_instruction(
instruction or "Generate an instrument track based on the audio context:"
)
prompt = (instruction + "\n\n" + (caption or "")).strip() or instruction
title = (params.get("title") or "Untitled").strip() or "Track"
reference_audio_url = (params.get("referenceAudioUrl") or params.get("reference_audio_path") or "").strip()
source_audio_url = (params.get("sourceAudioUrl") or params.get("src_audio_path") or "").strip()
# For cover/retake use source-first (song to cover); for style/reference use reference-first
if task in ("cover", "retake"):
# For cover/retake/lego use source-first (backing/song to cover); for style/reference use reference-first
if task in ("cover", "retake", "lego"):
resolved = _resolve_audio_url_to_path(source_audio_url) if source_audio_url else None
src_audio_path = resolved or (_resolve_audio_url_to_path(reference_audio_url) if reference_audio_url else None)
else:
resolved = _resolve_audio_url_to_path(reference_audio_url) if reference_audio_url else None
src_audio_path = resolved or (_resolve_audio_url_to_path(source_audio_url) if source_audio_url else None)

# When reference/source audio is provided, enable Audio2Audio so ACE-Step uses it (cover/retake/repaint).
# See docs/ACE-Step-INFERENCE.md: audio_cover_strength 1.0 = strong adherence; 0.5–0.8 = more caption influence.
# When reference/source audio is provided, enable Audio2Audio so ACE-Step uses it (cover/retake/repaint/lego).
# Lego/extract/complete: use ref_audio so the model gets backing as context; use LOW ref_audio_strength
# (e.g. 0.3) so diffusion starts from noisy backing and denoises toward the prompt (new instrument), not a copy.
# See docs/ACE-Step-INFERENCE.md: audio_cover_strength 1.0 = strong adherence; lower = more prompt influence.
audio2audio_enable = bool(src_audio_path)
ref_default = 0.8 if task in ("cover", "retake", "audio2audio") else 0.7
if task in ("lego", "extract", "complete"):
ref_default = 0.3 # low strength so output follows prompt (instrument) while matching backing timing
ref_audio_strength = float(params.get("audioCoverStrength") or params.get("ref_audio_strength") or ref_default)
ref_audio_strength = max(0.0, min(1.0, ref_audio_strength))

Expand All @@ -231,6 +262,10 @@ def _run_generation(job_id: str) -> None:
thinking = bool(params.get("thinking", False))
use_cot_metas = bool(params.get("useCotMetas", True))
use_cot_caption = bool(params.get("useCotCaption", True))
# Lego/extract/complete: instruction must stay verbatim ("Generate the X track based on the audio context:").
# LM refinement would rephrase and can drop the track-type instruction, so disable CoT caption for these tasks.
if task in ("lego", "extract", "complete"):
use_cot_caption = False
use_cot_language = bool(params.get("useCotLanguage", True))
try:
lm_temperature = float(params.get("lmTemperature") or params.get("lm_temperature") or 0.85)
Expand Down Expand Up @@ -403,7 +438,18 @@ def create_job():
data = raw if isinstance(raw, dict) else {}
logging.info("[API generate] Request body keys: %s", list(data.keys()) if data else [])

if not data.get("customMode") and not data.get("songDescription"):
task_for_validation = (data.get("taskType") or "text2music").strip().lower()
base_only_tasks = ("lego", "extract", "complete")
if task_for_validation in base_only_tasks:
# Lego/extract/complete: require source audio and caption/instruction (no songDescription)
src_audio = (data.get("sourceAudioUrl") or data.get("source_audio_path") or "").strip()
instruction = (data.get("instruction") or "").strip()
style = (data.get("style") or "").strip()
if not src_audio:
return jsonify({"error": "Backing/source audio required for Lego (and extract/complete)"}), 400
if not instruction and not style:
return jsonify({"error": "Describe the track (caption) or instruction required for Lego"}), 400
elif not data.get("customMode") and not data.get("songDescription"):
return jsonify({"error": "Song description required for simple mode"}), 400
# Custom mode: require at least one of style, lyrics, reference audio, or source audio
if data.get("customMode"):
Expand Down
19 changes: 17 additions & 2 deletions cdmf_pipeline_ace_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -2052,23 +2052,38 @@ def __call__(
preprocess_time_cost = end_time - start_time
start_time = end_time

# Lego/extract/complete: generate NEW track from prompt only; use source only for duration (no repaint/retake).
# Repaint/retake/extend: use src_latents in diffusion.
add_retake_noise = task in ("retake", "repaint", "extend")
# retake equal to repaint
if task == "retake":
repaint_start = 0
repaint_end = audio_duration
if task in ("lego", "extract", "complete"):
repaint_start = 0
repaint_end = audio_duration

src_latents = None
if src_audio_path is not None:
assert src_audio_path is not None and task in (
"repaint",
"edit",
"extend",
), "src_audio_path is required for retake/repaint/extend task"
"lego",
"extract",
"complete",
), "src_audio_path is required for repaint/extend/lego/extract/complete task"
assert os.path.exists(
src_audio_path
), f"src_audio_path {src_audio_path} does not exist"
src_latents = self.infer_latents(src_audio_path)
src_latents_inferred = self.infer_latents(src_audio_path)
if task in ("lego", "extract", "complete"):
# Use source only to set output duration; do not pass latents into diffusion (generate from scratch with prompt).
num_frames = src_latents_inferred.shape[-1]
audio_duration = num_frames * 512 * 8 / 44100.0
src_latents = None # no repaint for lego
else:
src_latents = src_latents_inferred

ref_latents = None
if ref_audio_input is not None and audio2audio_enable:
Expand Down
16 changes: 10 additions & 6 deletions docs/ACEFORGE_API.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,9 @@ ACE-Step text-to-music (and related tasks). Jobs are queued and run one at a tim
- `inferenceSteps`: int (e.g. 55).
- `guidanceScale`: float (e.g. 6.0).
- `seed`: int; if `randomSeed` is true, server may override with random.
- `taskType`: `"text2music"` | `"retake"` | `"repaint"` | `"extend"` | `"cover"` | `"audio2audio"`.
- `referenceAudioUrl`, `sourceAudioUrl`: URLs like `/audio/refs/...` or `/audio/<filename>` for reference/cover.
- `taskType`: `"text2music"` | `"retake"` | `"repaint"` | `"extend"` | `"cover"` | `"audio2audio"` | `"lego"` | `"extract"` | `"complete"`. **Lego**, **extract**, and **complete** require the ACE-Step **Base** DiT model (see Preferences and ACE-Step models).
- `instruction`: optional; for `taskType` **lego** (and extract/complete), task-specific instruction (e.g. `"Generate the guitar track based on the audio context:"`). If omitted for lego, the server builds one from track name/caption.
- `referenceAudioUrl`, `sourceAudioUrl`: URLs like `/audio/refs/...` or `/audio/<filename>` for reference/cover. For **lego**, **extract**, and **complete**, **sourceAudioUrl** is the backing/source audio (required).
- `audioCoverStrength` / `ref_audio_strength`: 0–1.
- `repaintingStart`, `repaintingEnd`: for repaint task.
- `title`: base name for output file.
Expand All @@ -135,6 +136,8 @@ ACE-Step text-to-music (and related tasks). Jobs are queued and run one at a tim
- `loraNameOrPath`: optional; folder name from LoRA list or path to adapter (see `GET /api/generate/lora_adapters`).
- `loraWeight`: optional; 0–2, default 0.75.

**Base-only tasks (lego, extract, complete):** Require `ace_step_dit_model: "base"` in preferences and the Base model to be installed (Settings or `GET /api/ace-step/models`). For **lego**: send `taskType: "lego"`, `sourceAudioUrl` (backing audio), `instruction` (e.g. `"Generate the <track> track based on the audio context:"`), and `style` as the track description (caption). Supported track names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`, `keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`. See `docs/ACE-Step-INFERENCE.md` for extract/complete parameters.

**Response (POST):** `{ "jobId": "<uuid>", "status": "queued", "queuePosition": 1 }`

**Status response:** `{ "jobId", "status": "queued"|"running"|"succeeded"|"failed"|"cancelled", "queuePosition"?, "etaSeconds"?, "result"?, "error"? }`. On success, `result` includes e.g. `audioUrls`, `duration`, `status`. Cancelled jobs have `status: "cancelled"` and `error: "Cancelled by user"`.
Expand Down Expand Up @@ -203,11 +206,12 @@ List available DiT/LM models and trigger downloads. **The ACE-Step 1.5 downloade

| Method | Path | Description |
|--------|------|-------------|
| GET | `/api/ace-step/models` | List DiT and LM models with `installed` status, plus `discovered_models`: all model directories found under the checkpoints folder (including custom trained models). Response includes `dit_models`, `lm_models`, `discovered_models` (id, label, path, custom), `acestep_download_available`, `checkpoints_path`. |
| POST | `/api/ace-step/models/download` | Start download. Body: `{ "model": "turbo" | "turbo-shift1" | "sft" | "base" | "0.6B" | "1.7B" | "4B" }`. Uses bundled downloader (or `acestep-download` on PATH if not bundled). Returns `{ "ok", "model", "path" }` or `{ "error", "hint" }`. |
| GET | `/api/ace-step/models/status` | Download progress: `{ "running", "model", "progress", "error" }`. |
| GET | `/api/ace-step/models` | List DiT and LM models with `installed` status, plus `discovered_models`: all model directories found under the checkpoints folder (including custom trained models). Response includes `dit_models`, `lm_models`, `discovered_models` (id, label, path, custom), `acestep_download_available`, `checkpoints_path`. Use this to verify the **Base** model is installed before starting a lego/extract/complete job. |
| POST | `/api/ace-step/models/download` | Start download. Body: `{ "model": "turbo" \| "turbo-shift1" \| "turbo-shift3" \| "turbo-continuous" \| "sft" \| "base" \| "0.6B" \| "1.7B" \| "4B" }`. Uses bundled downloader (or `acestep-download` on PATH if not bundled). Returns `{ "ok", "model", "path" }` or `{ "error", "hint" }`. |
| GET | `/api/ace-step/models/status` | Download progress: `{ "running", "model", "progress", "error", "current_file", "file_index", "total_files", "eta_seconds", "cancelled" }`. |
| POST | `/api/ace-step/models/download/cancel` | Request cancellation of the current download. Returns `{ "cancelled", "message" }`. |

**Task types:** Generation accepts `taskType`: `text2music`, `cover`, `audio2audio`, `repaint`, `extend`, and (ACE-Step 1.5 Base) `lego`, `extract`, `complete`. Lego/extract/complete require the Base model and full 1.5 integration (planned).
**Task → model:** Generation accepts `taskType`: `text2music`, `cover`, `audio2audio`, `repaint`, `extend`, and (Base-only) `lego`, `extract`, `complete`. **Lego**, **extract**, and **complete** require the **Base** DiT model: set `ace_step_dit_model` to `"base"` in preferences and ensure the Base model is installed (download via Settings or `POST /api/ace-step/models/download` with `"model": "base"`). The UI checks `GET /api/ace-step/models` for `dit_models[].installed` before allowing these tasks.

---

Expand Down
25 changes: 15 additions & 10 deletions generate_ace.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,15 +592,18 @@ def _prepare_reference_audio(
if audio2audio_enable and task_norm == "text2music":
task_norm = "retake"

# Any of the edit-style tasks imply some form of Audio2Audio.
# Any of the edit-style tasks imply some form of Audio2Audio or source-backed (lego/extract/complete).
audio2audio_flag = bool(
audio2audio_enable or task_norm in ("retake", "repaint", "extend")
)
needs_src_path = audio2audio_flag or task_norm in ("lego", "extract", "complete")

# If we *think* we're in an edit / audio2audio mode but there's no
# reference audio path at all, don't crash — just fall back to
# plain text2music.
if audio2audio_flag and not src_audio_path:
# If we need source/reference audio but none was provided, fall back to text2music (or fail for lego/extract/complete).
if needs_src_path and not src_audio_path:
if task_norm in ("lego", "extract", "complete"):
raise ValueError(
f"Task '{task_norm}' requires backing/source audio. Please provide it in the Lego tab or Custom audio card."
)
print(
"[ACE] Audio2Audio / edit task requested but no reference audio "
"was provided — falling back to plain text2music.",
Expand All @@ -612,6 +615,8 @@ def _prepare_reference_audio(

if audio2audio_flag:
ref_path = _ensure_reference_wav(src_audio_path)
elif task_norm in ("lego", "extract", "complete"):
ref_path = _ensure_reference_wav(src_audio_path) # pipeline uses this as src_audio_path
else:
ref_path = None

Expand Down Expand Up @@ -978,10 +983,10 @@ def _run_ace_text2music(

# Wire up reference vs source audio per ACE-Step pipeline:
#
# - retake / cover / audio2audio: use ref_audio_input (pipeline sets task to
# "audio2audio" and uses ref_latents). Do NOT pass src_audio_path.
# - repaint / extend: use src_audio_path (pipeline uses src_latents for the
# segment to repaint or extend). Do NOT pass ref_audio_input for this path.
# - retake / cover / audio2audio / lego / extract / complete: use ref_audio_input so the pipeline
# gets backing latents. For lego we use LOW ref_audio_strength (API default 0.3) so diffusion
# starts from noisy backing and denoises toward the prompt (new instrument), matching timing.
# - repaint / extend: use src_audio_path (pipeline uses src_latents for repaint/extend segment).
# - text2music: leave both unset (None).
if not src_audio_path:
call_kwargs["ref_audio_input"] = None
Expand All @@ -990,7 +995,7 @@ def _run_ace_text2music(
call_kwargs["src_audio_path"] = src_audio_path
call_kwargs["ref_audio_input"] = None
else:
# retake (including cover/audio2audio from UI)
# retake, cover, audio2audio, lego, extract, complete: backing as ref (lego uses low ref_audio_strength)
call_kwargs["ref_audio_input"] = src_audio_path
call_kwargs["src_audio_path"] = None

Expand Down
1 change: 1 addition & 0 deletions ui/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,7 @@ export default function App() {
onGenerate={handleGenerate}
isGenerating={isGenerating}
initialData={reuseData}
onOpenSettings={() => setShowSettingsModal(true)}
/>
</div>

Expand Down
Loading