Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 63 additions & 17 deletions api/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,22 @@

import json
import logging
import re
import threading
import time
import uuid
from pathlib import Path
from flask import Blueprint, jsonify, request, send_file

def _uppercase_track_in_instruction(instruction):
"""Uppercase TRACK_NAME in 'Generate the X track ...' to match ACE-Step (cli.py _default_instruction_for_task)."""
if not instruction or " track " not in instruction:
return instruction
m = re.search(r"(\bthe\s+)(\w+)(\s+track\b)", instruction, re.IGNORECASE)
if m:
return instruction[: m.start(2)] + m.group(2).upper() + instruction[m.end(2) :]
return instruction

from cdmf_paths import get_output_dir, get_user_data_dir, load_config
from cdmf_tracks import list_lora_adapters, load_track_meta, save_track_meta
from cdmf_generation_job import GenerationCancelled
Expand Down Expand Up @@ -122,22 +132,32 @@ def _run_generation(job_id: str) -> None:
if task not in allowed_tasks:
task = "text2music"
# Single style/caption field drives all text conditioning (ACE-Step caption).
# Simple mode: songDescription. Advanced mode: style. Both can have key, time sig, vocal language.
prompt = (params.get("style") or "").strip() if custom_mode else (params.get("songDescription") or "").strip()
# Simple mode: songDescription. Advanced mode: style. Lego/extract/complete: instruction + caption only (no metas; source sets context).
if task in ("lego", "extract", "complete"):
instruction = (params.get("instruction") or "").strip()
caption = (params.get("style") or "").strip()
if not instruction and not caption:
instruction = "Generate an instrument track based on the audio context:"
prompt = None # built below after we have duration/bpm/metas
else:
instruction = None
caption = None
prompt = (params.get("style") or "").strip() if custom_mode else (params.get("songDescription") or "").strip()
key_scale = (params.get("keyScale") or "").strip()
time_sig = (params.get("timeSignature") or "").strip()
vocal_lang = (params.get("vocalLanguage") or "").strip().lower()
extra_bits = []
if key_scale:
extra_bits.append(f"key {key_scale}")
if time_sig:
extra_bits.append(f"time signature {time_sig}")
if vocal_lang and vocal_lang not in ("unknown", ""):
extra_bits.append(f"vocal language {vocal_lang}")
if extra_bits:
prompt = f"{prompt}, {', '.join(extra_bits)}" if prompt else ", ".join(extra_bits)
# When user explicitly chose English, reinforce in caption so model conditions on it
if vocal_lang == "en" and prompt:
if task != "lego":
if key_scale:
extra_bits.append(f"key {key_scale}")
if time_sig:
extra_bits.append(f"time signature {time_sig}")
if vocal_lang and vocal_lang not in ("unknown", ""):
extra_bits.append(f"vocal language {vocal_lang}")
if extra_bits:
prompt = f"{prompt}, {', '.join(extra_bits)}" if prompt else ", ".join(extra_bits)
# When user explicitly chose English, reinforce in caption so model conditions on it (skip for lego)
if task != "lego" and vocal_lang == "en" and prompt:
if not prompt.lower().startswith("english"):
prompt = f"English vocals, {prompt}"
if not prompt:
Expand Down Expand Up @@ -190,21 +210,32 @@ def _run_generation(job_id: str) -> None:
bpm = None
except (TypeError, ValueError):
bpm = None
# Lego/extract/complete: instruction (uppercase track) + caption only. No metas — BPM/key/time signature
# should match the input backing; passing them would be for cover/target-style mode.
if task in ("lego", "extract", "complete"):
instruction = _uppercase_track_in_instruction(
instruction or "Generate an instrument track based on the audio context:"
)
prompt = (instruction + "\n\n" + (caption or "")).strip() or instruction
title = (params.get("title") or "Untitled").strip() or "Track"
reference_audio_url = (params.get("referenceAudioUrl") or params.get("reference_audio_path") or "").strip()
source_audio_url = (params.get("sourceAudioUrl") or params.get("src_audio_path") or "").strip()
# For cover/retake use source-first (song to cover); for style/reference use reference-first
if task in ("cover", "retake"):
# For cover/retake/lego use source-first (backing/song to cover); for style/reference use reference-first
if task in ("cover", "retake", "lego"):
resolved = _resolve_audio_url_to_path(source_audio_url) if source_audio_url else None
src_audio_path = resolved or (_resolve_audio_url_to_path(reference_audio_url) if reference_audio_url else None)
else:
resolved = _resolve_audio_url_to_path(reference_audio_url) if reference_audio_url else None
src_audio_path = resolved or (_resolve_audio_url_to_path(source_audio_url) if source_audio_url else None)

# When reference/source audio is provided, enable Audio2Audio so ACE-Step uses it (cover/retake/repaint).
# See docs/ACE-Step-INFERENCE.md: audio_cover_strength 1.0 = strong adherence; 0.5–0.8 = more caption influence.
# When reference/source audio is provided, enable Audio2Audio so ACE-Step uses it (cover/retake/repaint/lego).
# Lego/extract/complete: use ref_audio so the model gets backing as context; use LOW ref_audio_strength
# (e.g. 0.3) so diffusion starts from noisy backing and denoises toward the prompt (new instrument), not a copy.
# See docs/ACE-Step-INFERENCE.md: audio_cover_strength 1.0 = strong adherence; lower = more prompt influence.
audio2audio_enable = bool(src_audio_path)
ref_default = 0.8 if task in ("cover", "retake", "audio2audio") else 0.7
if task in ("lego", "extract", "complete"):
ref_default = 0.3 # low strength so output follows prompt (instrument) while matching backing timing
ref_audio_strength = float(params.get("audioCoverStrength") or params.get("ref_audio_strength") or ref_default)
ref_audio_strength = max(0.0, min(1.0, ref_audio_strength))

Expand All @@ -231,6 +262,10 @@ def _run_generation(job_id: str) -> None:
thinking = bool(params.get("thinking", False))
use_cot_metas = bool(params.get("useCotMetas", True))
use_cot_caption = bool(params.get("useCotCaption", True))
# Lego/extract/complete: instruction must stay verbatim ("Generate the X track based on the audio context:").
# LM refinement would rephrase and can drop the track-type instruction, so disable CoT caption for these tasks.
if task in ("lego", "extract", "complete"):
use_cot_caption = False
use_cot_language = bool(params.get("useCotLanguage", True))
try:
lm_temperature = float(params.get("lmTemperature") or params.get("lm_temperature") or 0.85)
Expand Down Expand Up @@ -403,7 +438,18 @@ def create_job():
data = raw if isinstance(raw, dict) else {}
logging.info("[API generate] Request body keys: %s", list(data.keys()) if data else [])

if not data.get("customMode") and not data.get("songDescription"):
task_for_validation = (data.get("taskType") or "text2music").strip().lower()
base_only_tasks = ("lego", "extract", "complete")
if task_for_validation in base_only_tasks:
# Lego/extract/complete: require source audio and caption/instruction (no songDescription)
src_audio = (data.get("sourceAudioUrl") or data.get("source_audio_path") or "").strip()
instruction = (data.get("instruction") or "").strip()
style = (data.get("style") or "").strip()
if not src_audio:
return jsonify({"error": "Backing/source audio required for Lego (and extract/complete)"}), 400
if not instruction and not style:
return jsonify({"error": "Describe the track (caption) or instruction required for Lego"}), 400
elif not data.get("customMode") and not data.get("songDescription"):
return jsonify({"error": "Song description required for simple mode"}), 400
# Custom mode: require at least one of style, lyrics, reference audio, or source audio
if data.get("customMode"):
Expand Down
19 changes: 17 additions & 2 deletions cdmf_pipeline_ace_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -2052,23 +2052,38 @@ def __call__(
preprocess_time_cost = end_time - start_time
start_time = end_time

# Lego/extract/complete: generate NEW track from prompt only; use source only for duration (no repaint/retake).
# Repaint/retake/extend: use src_latents in diffusion.
add_retake_noise = task in ("retake", "repaint", "extend")
# retake equal to repaint
if task == "retake":
repaint_start = 0
repaint_end = audio_duration
if task in ("lego", "extract", "complete"):
repaint_start = 0
repaint_end = audio_duration

src_latents = None
if src_audio_path is not None:
assert src_audio_path is not None and task in (
"repaint",
"edit",
"extend",
), "src_audio_path is required for retake/repaint/extend task"
"lego",
"extract",
"complete",
), "src_audio_path is required for repaint/extend/lego/extract/complete task"
assert os.path.exists(
src_audio_path
), f"src_audio_path {src_audio_path} does not exist"
src_latents = self.infer_latents(src_audio_path)
src_latents_inferred = self.infer_latents(src_audio_path)
if task in ("lego", "extract", "complete"):
# Use source only to set output duration; do not pass latents into diffusion (generate from scratch with prompt).
num_frames = src_latents_inferred.shape[-1]
audio_duration = num_frames * 512 * 8 / 44100.0
src_latents = None # no repaint for lego
else:
src_latents = src_latents_inferred

ref_latents = None
if ref_audio_input is not None and audio2audio_enable:
Expand Down
16 changes: 10 additions & 6 deletions docs/ACEFORGE_API.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,9 @@ ACE-Step text-to-music (and related tasks). Jobs are queued and run one at a tim
- `inferenceSteps`: int (e.g. 55).
- `guidanceScale`: float (e.g. 6.0).
- `seed`: int; if `randomSeed` is true, server may override with random.
- `taskType`: `"text2music"` | `"retake"` | `"repaint"` | `"extend"` | `"cover"` | `"audio2audio"`.
- `referenceAudioUrl`, `sourceAudioUrl`: URLs like `/audio/refs/...` or `/audio/<filename>` for reference/cover.
- `taskType`: `"text2music"` | `"retake"` | `"repaint"` | `"extend"` | `"cover"` | `"audio2audio"` | `"lego"` | `"extract"` | `"complete"`. **Lego**, **extract**, and **complete** require the ACE-Step **Base** DiT model (see Preferences and ACE-Step models).
- `instruction`: optional; for `taskType` **lego** (and extract/complete), task-specific instruction (e.g. `"Generate the guitar track based on the audio context:"`). If omitted for lego, the server builds one from track name/caption.
- `referenceAudioUrl`, `sourceAudioUrl`: URLs like `/audio/refs/...` or `/audio/<filename>` for reference/cover. For **lego**, **extract**, and **complete**, **sourceAudioUrl** is the backing/source audio (required).
- `audioCoverStrength` / `ref_audio_strength`: 0–1.
- `repaintingStart`, `repaintingEnd`: for repaint task.
- `title`: base name for output file.
Expand All @@ -135,6 +136,8 @@ ACE-Step text-to-music (and related tasks). Jobs are queued and run one at a tim
- `loraNameOrPath`: optional; folder name from LoRA list or path to adapter (see `GET /api/generate/lora_adapters`).
- `loraWeight`: optional; 0–2, default 0.75.

**Base-only tasks (lego, extract, complete):** Require `ace_step_dit_model: "base"` in preferences and the Base model to be installed (Settings or `GET /api/ace-step/models`). For **lego**: send `taskType: "lego"`, `sourceAudioUrl` (backing audio), `instruction` (e.g. `"Generate the <track> track based on the audio context:"`), and `style` as the track description (caption). Supported track names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`, `keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`. See `docs/ACE-Step-INFERENCE.md` for extract/complete parameters.

**Response (POST):** `{ "jobId": "<uuid>", "status": "queued", "queuePosition": 1 }`

**Status response:** `{ "jobId", "status": "queued"|"running"|"succeeded"|"failed"|"cancelled", "queuePosition"?, "etaSeconds"?, "result"?, "error"? }`. On success, `result` includes e.g. `audioUrls`, `duration`, `status`. Cancelled jobs have `status: "cancelled"` and `error: "Cancelled by user"`.
Expand Down Expand Up @@ -203,11 +206,12 @@ List available DiT/LM models and trigger downloads. **The ACE-Step 1.5 downloade

| Method | Path | Description |
|--------|------|-------------|
| GET | `/api/ace-step/models` | List DiT and LM models with `installed` status, plus `discovered_models`: all model directories found under the checkpoints folder (including custom trained models). Response includes `dit_models`, `lm_models`, `discovered_models` (id, label, path, custom), `acestep_download_available`, `checkpoints_path`. |
| POST | `/api/ace-step/models/download` | Start download. Body: `{ "model": "turbo" | "turbo-shift1" | "sft" | "base" | "0.6B" | "1.7B" | "4B" }`. Uses bundled downloader (or `acestep-download` on PATH if not bundled). Returns `{ "ok", "model", "path" }` or `{ "error", "hint" }`. |
| GET | `/api/ace-step/models/status` | Download progress: `{ "running", "model", "progress", "error" }`. |
| GET | `/api/ace-step/models` | List DiT and LM models with `installed` status, plus `discovered_models`: all model directories found under the checkpoints folder (including custom trained models). Response includes `dit_models`, `lm_models`, `discovered_models` (id, label, path, custom), `acestep_download_available`, `checkpoints_path`. Use this to verify the **Base** model is installed before starting a lego/extract/complete job. |
| POST | `/api/ace-step/models/download` | Start download. Body: `{ "model": "turbo" \| "turbo-shift1" \| "turbo-shift3" \| "turbo-continuous" \| "sft" \| "base" \| "0.6B" \| "1.7B" \| "4B" }`. Uses bundled downloader (or `acestep-download` on PATH if not bundled). Returns `{ "ok", "model", "path" }` or `{ "error", "hint" }`. |
| GET | `/api/ace-step/models/status` | Download progress: `{ "running", "model", "progress", "error", "current_file", "file_index", "total_files", "eta_seconds", "cancelled" }`. |
| POST | `/api/ace-step/models/download/cancel` | Request cancellation of the current download. Returns `{ "cancelled", "message" }`. |

**Task types:** Generation accepts `taskType`: `text2music`, `cover`, `audio2audio`, `repaint`, `extend`, and (ACE-Step 1.5 Base) `lego`, `extract`, `complete`. Lego/extract/complete require the Base model and full 1.5 integration (planned).
**Task → model:** Generation accepts `taskType`: `text2music`, `cover`, `audio2audio`, `repaint`, `extend`, and (Base-only) `lego`, `extract`, `complete`. **Lego**, **extract**, and **complete** require the **Base** DiT model: set `ace_step_dit_model` to `"base"` in preferences and ensure the Base model is installed (download via Settings or `POST /api/ace-step/models/download` with `"model": "base"`). The UI checks `GET /api/ace-step/models` for `dit_models[].installed` before allowing these tasks.

---

Expand Down
25 changes: 15 additions & 10 deletions generate_ace.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,15 +592,18 @@ def _prepare_reference_audio(
if audio2audio_enable and task_norm == "text2music":
task_norm = "retake"

# Any of the edit-style tasks imply some form of Audio2Audio.
# Any of the edit-style tasks imply some form of Audio2Audio or source-backed (lego/extract/complete).
audio2audio_flag = bool(
audio2audio_enable or task_norm in ("retake", "repaint", "extend")
)
needs_src_path = audio2audio_flag or task_norm in ("lego", "extract", "complete")

# If we *think* we're in an edit / audio2audio mode but there's no
# reference audio path at all, don't crash — just fall back to
# plain text2music.
if audio2audio_flag and not src_audio_path:
# If we need source/reference audio but none was provided, fall back to text2music (or fail for lego/extract/complete).
if needs_src_path and not src_audio_path:
if task_norm in ("lego", "extract", "complete"):
raise ValueError(
f"Task '{task_norm}' requires backing/source audio. Please provide it in the Lego tab or Custom audio card."
)
print(
"[ACE] Audio2Audio / edit task requested but no reference audio "
"was provided — falling back to plain text2music.",
Expand All @@ -612,6 +615,8 @@ def _prepare_reference_audio(

if audio2audio_flag:
ref_path = _ensure_reference_wav(src_audio_path)
elif task_norm in ("lego", "extract", "complete"):
ref_path = _ensure_reference_wav(src_audio_path) # pipeline uses this as src_audio_path
else:
ref_path = None

Expand Down Expand Up @@ -978,10 +983,10 @@ def _run_ace_text2music(

# Wire up reference vs source audio per ACE-Step pipeline:
#
# - retake / cover / audio2audio: use ref_audio_input (pipeline sets task to
# "audio2audio" and uses ref_latents). Do NOT pass src_audio_path.
# - repaint / extend: use src_audio_path (pipeline uses src_latents for the
# segment to repaint or extend). Do NOT pass ref_audio_input for this path.
# - retake / cover / audio2audio / lego / extract / complete: use ref_audio_input so the pipeline
# gets backing latents. For lego we use LOW ref_audio_strength (API default 0.3) so diffusion
# starts from noisy backing and denoises toward the prompt (new instrument), matching timing.
# - repaint / extend: use src_audio_path (pipeline uses src_latents for repaint/extend segment).
# - text2music: leave both unset (None).
if not src_audio_path:
call_kwargs["ref_audio_input"] = None
Expand All @@ -990,7 +995,7 @@ def _run_ace_text2music(
call_kwargs["src_audio_path"] = src_audio_path
call_kwargs["ref_audio_input"] = None
else:
# retake (including cover/audio2audio from UI)
# retake, cover, audio2audio, lego, extract, complete: backing as ref (lego uses low ref_audio_strength)
call_kwargs["ref_audio_input"] = src_audio_path
call_kwargs["src_audio_path"] = None

Expand Down
1 change: 1 addition & 0 deletions ui/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,7 @@ export default function App() {
onGenerate={handleGenerate}
isGenerating={isGenerating}
initialData={reuseData}
onOpenSettings={() => setShowSettingsModal(true)}
/>
</div>

Expand Down
Loading