Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,14 @@ METERED_TURN_CREDENTIAL=

# https://developers.cloudflare.com/realtime/turn/
CLOUDFLARE_TURN_TOKEN=
CLOUDFLARE_TURN_API_TOKEN=
CLOUDFLARE_TURN_API_TOKEN=

# Cloudflare D1 (serverless SQL database) credentials
CLOUDFLARE_ACCOUNT_ID=
CLOUDFLARE_API_KEY=
PODCAST_D1_DB_ID=
# Cloudflare R2 (S3-compatible object storage) credentials
CLOUDFLARE_ACCESS_KEY=
CLOUDFLARE_SECRET_KEY=
CLOUDFLARE_REGION=apac
S3_BUCKET_URL=
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,6 @@
[submodule "deps/HiggsAudio"]
path = deps/HiggsAudio
url = https://github.com/weedge/higgs-audio.git
[submodule "deps/StepAudio2"]
path = deps/StepAudio2
url = https://github.com/weedge/Step-Audio2.git
4 changes: 2 additions & 2 deletions demo/cloudflare/rest_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def d1_table_query(db_id: str, sql: str, sql_params: List[str] = []) -> dict:
data = res.read().decode("utf-8")
# print(data)
json_data = json.loads(data)
# logging.info(f"body:{body}, db_id:{db_id}, query res:{json_data}")
logging.debug(f"body:{body}, db_id:{db_id}, query res:{json_data}")
return json_data


Expand Down Expand Up @@ -85,7 +85,7 @@ def d1_db(db_id: str) -> dict:
"""
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
level=logging.DEBUG,
format="%(asctime)s - %(name)s - %(levelname)s - %(pathname)s:%(lineno)d - %(funcName)s - %(message)s",
handlers=[logging.StreamHandler()],
)
Expand Down
9 changes: 9 additions & 0 deletions demo/content_parser_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,15 @@ async def gen_podcast_tts_audios(
podcast_index, role_index = (0, 0)
pre_role = ""
pre_cn, cur_cn = (0, 0)
title = ""
description = ""
for extraction in data_models:
if title == "" and extraction.description:
title = extraction.title
print(f"title:{title}\n")
if description == "" and extraction.roles:
description = extraction.description
print(f"description:{description}\n")
if not extraction.roles:
continue
p_save_dir = os.path.join(save_dir, str(podcast_index))
Expand Down Expand Up @@ -111,6 +119,7 @@ async def gen_podcast_tts_audios(
role_index += 1
await edge_tts_conversion(role.content, output_file, voice)

# print(extraction)
return extraction


Expand Down
1 change: 1 addition & 0 deletions demo/gen_podcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def run(
save_dir=save_dir,
)
for data in data_list:
print(data)
source = data[0]
extraction: podcast.Podcast = data[1]
audio_output_file = data[2]
Expand Down
11 changes: 11 additions & 0 deletions demo/insert_podcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ def get_podcast(
raise # 如果达到最大重试次数,抛出异常

gen_img_prompt = f"podcast cover image which content is about {en_title}"
print(f"{gen_img_prompt}")
img_file = save_gen_image(gen_img_prompt, uuid.uuid4().hex)
cover_img_url = r2_upload("podcast", img_file)

Expand Down Expand Up @@ -201,11 +202,20 @@ def insert_podcast_to_d1(
formatted_time,
podcast.audio_size,
]
# ====================
# debug_sql = sql.replace("?", "{}").format(
# *[f"'{p}'" if isinstance(p, str) else str(p) for p in sql_params]
# )
# print(f"Debug SQL: {debug_sql}")
# ====================

res = d1_table_query(db_id, sql, sql_params)
if res["success"] is True:
logging.info(
f"insert podcast success, url: https://podcast-997.pages.dev/podcast/{podcast.pid}"
)
else:
logging.error(f"insert podcast failed, res: {res}")
return res["success"]


Expand All @@ -224,6 +234,7 @@ def update_podcast_cover_to_d1(
pid,
]
res = d1_table_query(db_id, sql, sql_params)
print(res)
return res["success"]


Expand Down
157 changes: 157 additions & 0 deletions deploy/modal/src/fastapi_webrtc_step2_voice_bot_serve.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import os

import modal


# achatbot release line to install into the image (override via ACHATBOT_VERSION).
achatbot_version = os.getenv("ACHATBOT_VERSION", "0.0.25")
app = modal.App("step-audio2-voice-bot")
# fastapi_webrtc_bots | fastapi_webrtc_single_bot server
SERVER_TAG = os.getenv("SERVER_TAG", "fastapi_webrtc_bots")
# NOTE(review): this defaults to "L4", but the @app.cls decorator below re-reads
# IMAGE_GPU from the environment with a default of None (CPU-only) — confirm
# which default is intended; this constant is otherwise unused by the code here.
IMAGE_GPU = os.getenv("IMAGE_GPU", "L4")
# Container image: NVIDIA CUDA 12.9.1 cuDNN devel base with Python 3.10,
# plus achatbot server extras and the Step-Audio-2 runtime dependencies.
img = (
    # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags
    modal.Image.from_registry(
        "nvcr.io/nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04",
        add_python="3.10",
    )
    .apt_install("git", "git-lfs", "ffmpeg")
    .pip_install(
        [
            # Adjacent string literals concatenate into a single requirement
            # spec: achatbot[fastapi_bot_server,...,queue]~=<version>
            "achatbot["
            "fastapi_bot_server,"
            "livekit,livekit-api,daily,agora,"
            "silero_vad_analyzer,"
            "sense_voice_asr,deepgram_asr_processor,"
            "tts_edge,"
            "queue"
            f"]~={achatbot_version}",
        ],
        extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://pypi.org/simple/"),
    )
    # Extra dependencies for the Step-Audio-2 model stack.
    .pip_install(
        "transformers==4.49.0",
        "torchaudio",
        "librosa",
        "onnxruntime",
        "s3tokenizer",
        "diffusers",
        "hyperpyyaml",
        "huggingface_hub",
    )
    .env(
        {
            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
            "ACHATBOT_PKG": "1",
            "SERVER_TAG": SERVER_TAG,
            # Bot config consumed by the single-bot server; the path lives on
            # the "config" volume mounted at /root/.achatbot/config below.
            "CONFIG_FILE": os.getenv(
                "CONFIG_FILE",
                "/root/.achatbot/config/bots/daily_step_audio2_aqaa_bot.json",
            ),
        }
    )
)

# Dev toggle: uncomment to layer a pre-release achatbot build from test.pypi
# on top of the image instead of the pinned release above.
# img = img.pip_install(
#     f"achatbot==0.0.25.dev57",
#     extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://test.pypi.org/simple/"),
# )


# Persistent Modal volumes mounted into the container at the paths below
# (created on first use so a fresh deploy works without manual setup).
HF_MODEL_DIR = "/root/.achatbot/models"  # downloaded model weights
hf_model_vol = modal.Volume.from_name("models", create_if_missing=True)
ASSETS_DIR = "/root/.achatbot/assets"  # reference audio / prompt assets
assets_vol = modal.Volume.from_name("assets", create_if_missing=True)
CONFIG_DIR = "/root/.achatbot/config"  # bot JSON configs (see CONFIG_FILE env)
config_vol = modal.Volume.from_name("config", create_if_missing=True)
RECORDS_DIR = "/root/.achatbot/records"  # session recordings
records_vol = modal.Volume.from_name("records", create_if_missing=True)

TORCH_CACHE_DIR = "/root/.cache/torch"  # torch hub/compile cache across runs
torch_cache_vol = modal.Volume.from_name("torch_cache", create_if_missing=True)

# 128 MiB of memory and 0.125 CPU cores by default container runtime
@app.cls(
image=img,
gpu=os.getenv("IMAGE_GPU", None),
secrets=[modal.Secret.from_name("achatbot")],
volumes={
HF_MODEL_DIR: hf_model_vol,
ASSETS_DIR: assets_vol,
CONFIG_DIR: config_vol,
RECORDS_DIR: records_vol,
TORCH_CACHE_DIR: torch_cache_vol,
},
cpu=2.0,
timeout=1200, # default 300s
scaledown_window=1200,
max_containers=1,
# allow_concurrent_inputs=int(os.getenv("IMAGE_CONCURRENT_CN", "1")),
)
@modal.concurrent(max_inputs=int(os.getenv("IMAGE_CONCURRENT_CN", "1"))) # inputs per container
class Srv:
@modal.enter()
def enter(self):
# run container runtime to enter when container is starting
import subprocess
import torch

subprocess.run("nvidia-smi --version", shell=True)
gpu_prop = None
if torch.cuda.is_available():
gpu_prop = torch.cuda.get_device_properties("cuda:0")
print(gpu_prop)
torch.multiprocessing.set_start_method("spawn", force=True)
else:
print("CUDA is not available.")

@modal.asgi_app()
def app(self):
SERVER_TAG = os.getenv("SERVER_TAG", "fastapi_webrtc_bots")
if SERVER_TAG == "fastapi_webrtc_single_bot":
from achatbot.cmd.http.server.fastapi_room_bot_serve import app as fastapi_app

print("run fastapi_room_bot_serve(single bot)")
else:
from achatbot.cmd.http.server.fastapi_daily_bot_serve import app as fastapi_app

print("run fastapi_daily_bot_serve(multi bots)")

return fastapi_app


"""
# 0. download models and assets
modal run src/download_models.py --repo-ids "stepfun-ai/Step-Audio-2-mini"
modal run src/download_assets.py --asset-urls "https://raw.githubusercontent.com/stepfun-ai/Step-Audio2/refs/heads/main/assets/default_male.wav"
modal run src/download_assets.py --asset-urls "https://raw.githubusercontent.com/stepfun-ai/Step-Audio2/refs/heads/main/assets/default_female.wav"

# 1. run webrtc room http bots server

IMAGE_GPU=L4 SERVER_TAG=fastapi_webrtc_bots \
ACHATBOT_VERSION=0.0.25 \
modal serve src/fastapi_webrtc_step2_voice_bot_serve.py

# 2. run webrtc room http signal bot server

modal volume create config

modal volume put config ./config/bots/daily_step_audio2_aqaa_bot.json /bots/ -f
modal volume put config ./config/bots/daily_step_audio2_aqaa_tools_bot.json /bots/ -f

# run container with gpu
IMAGE_GPU=L4 SERVER_TAG=fastapi_webrtc_single_bot \
ACHATBOT_VERSION=0.0.25 \
CONFIG_FILE=/root/.achatbot/config/bots/daily_step_audio2_aqaa_bot.json \
modal serve src/fastapi_webrtc_step2_voice_bot_serve.py
IMAGE_GPU=L4 SERVER_TAG=fastapi_webrtc_single_bot \
ACHATBOT_VERSION=0.0.25 \
CONFIG_FILE=/root/.achatbot/config/bots/daily_step_audio2_aqaa_tools_bot.json \
modal serve src/fastapi_webrtc_step2_voice_bot_serve.py

# cold start fastapi webrtc http server
curl -v -XGET "https://weedge--step-audio2-voice-bot-srv-app-dev.modal.run/health"

# run bot
curl -XPOST "https://weedge--step-audio2-voice-bot-srv-app-dev.modal.run/bot_join/chat-room/DailyStepAudio2AQAABot"
"""
Loading