Ace-Step-v1.5

Sleeping

App Files Files Community

ChuxiJ committed 21 days ago

Commit

1e0d19a

1 Parent(s): 54f2bd3

add docs and serve mode

Browse files

Files changed (17) hide show

.env.example +2 -2
.gitignore +2 -1
acestep/acestep_v15_pipeline.py +49 -0
acestep/gradio_ui/events/results_handlers.py +19 -8
acestep/gradio_ui/interfaces/__init__.py +2 -1
acestep/gradio_ui/interfaces/generation.py +30 -21
acestep/gradio_ui/interfaces/training.py +6 -2
acestep/inference.py +1 -1
API.md → docs/en/API.md +349 -54
docs/en/GRADIO_GUIDE.md +551 -0
INFERENCE.md → docs/en/INFERENCE.md +386 -70
docs/ja/API.md +570 -0
docs/ja/GRADIO_GUIDE.md +551 -0
docs/ja/INFERENCE.md +739 -0
docs/zh/API.md +570 -0
docs/zh/GRADIO_GUIDE.md +551 -0
docs/zh/INFERENCE.md +1049 -0

.env.example CHANGED Viewed

@@ -1,4 +1,4 @@
-ACESTEP_CONFIG_PATH=acestep-v15-turbo-rl
-ACESTEP_LM_MODEL_PATH=acestep-5Hz-lm-0.6B-v3
 ACESTEP_DEVICE=auto
 ACESTEP_LM_BACKEND=vllm

+ACESTEP_CONFIG_PATH=acestep-v15-turbo
+ACESTEP_LM_MODEL_PATH=acestep-5Hz-lm-1.7B
 ACESTEP_DEVICE=auto
 ACESTEP_LM_BACKEND=vllm

.gitignore CHANGED Viewed

@@ -224,4 +224,5 @@ scripts/
 checkpoints_legacy/
 lora_output/
 datasets/
-python_embeded/

 checkpoints_legacy/
 lora_output/
 datasets/
+python_embeded/
+checkpoints_pack/

acestep/acestep_v15_pipeline.py CHANGED Viewed

@@ -5,6 +5,27 @@ Handler wrapper connecting model and UI
 import os
 import sys
 # Clear proxy settings that may affect Gradio
 for proxy_var in ['http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY', 'ALL_PROXY']:
     os.environ.pop(proxy_var, None)
@@ -101,6 +122,10 @@ def main():
     parser.add_argument("--server-name", type=str, default="127.0.0.1", help="Server name (default: 127.0.0.1, use 0.0.0.0 for all interfaces)")
     parser.add_argument("--language", type=str, default="en", choices=["en", "zh", "ja"], help="UI language: en (English), zh (中文), ja (日本語)")
     # Service initialization arguments
     parser.add_argument("--init_service", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Initialize service on startup (default: False)")
     parser.add_argument("--checkpoint", type=str, default=None, help="Checkpoint file path (optional, for display purposes)")
@@ -115,6 +140,29 @@ def main():
     args = parser.parse_args()
     try:
         init_params = None
@@ -198,6 +246,7 @@ def main():
             # Prepare initialization parameters for UI
             init_params = {
                 'pre_initialized': True,
                 'checkpoint': args.checkpoint,
                 'config_path': args.config_path,
                 'device': args.device,

 import os
 import sys
+# Load environment variables from .env file in project root
+# This allows configuration without hardcoding values
+# Falls back to .env.example if .env is not found
+try:
+    from dotenv import load_dotenv
+    # Get project root directory
+    _current_file = os.path.abspath(__file__)
+    _project_root = os.path.dirname(os.path.dirname(_current_file))
+    _env_path = os.path.join(_project_root, '.env')
+    _env_example_path = os.path.join(_project_root, '.env.example')
+    if os.path.exists(_env_path):
+        load_dotenv(_env_path)
+        print(f"Loaded configuration from {_env_path}")
+    elif os.path.exists(_env_example_path):
+        load_dotenv(_env_example_path)
+        print(f"Loaded configuration from {_env_example_path} (fallback)")
+except ImportError:
+    # python-dotenv not installed, skip loading .env
+    pass
 # Clear proxy settings that may affect Gradio
 for proxy_var in ['http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY', 'ALL_PROXY']:
     os.environ.pop(proxy_var, None)
     parser.add_argument("--server-name", type=str, default="127.0.0.1", help="Server name (default: 127.0.0.1, use 0.0.0.0 for all interfaces)")
     parser.add_argument("--language", type=str, default="en", choices=["en", "zh", "ja"], help="UI language: en (English), zh (中文), ja (日本語)")
+    # Service mode argument
+    parser.add_argument("--service_mode", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False,
+                       help="Enable service mode (default: False). When enabled, uses preset models and restricts UI options.")
     # Service initialization arguments
     parser.add_argument("--init_service", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Initialize service on startup (default: False)")
     parser.add_argument("--checkpoint", type=str, default=None, help="Checkpoint file path (optional, for display purposes)")
     args = parser.parse_args()
+    # Service mode defaults (can be configured via .env file)
+    if args.service_mode:
+        print("Service mode enabled - applying preset configurations...")
+        # Force init_service in service mode
+        args.init_service = True
+        # Default DiT model for service mode (from env or fallback)
+        if args.config_path is None:
+            args.config_path = os.environ.get(
+                "SERVICE_MODE_DIT_MODEL",
+                "acestep-v15-turbo-fix-inst-shift-dynamic"
+            )
+        # Default LM model for service mode (from env or fallback)
+        if args.lm_model_path is None:
+            args.lm_model_path = os.environ.get(
+                "SERVICE_MODE_LM_MODEL",
+                "acestep-5Hz-lm-1.7B-v4-fix"
+            )
+        # Backend for service mode (from env or fallback to vllm)
+        args.backend = os.environ.get("SERVICE_MODE_BACKEND", "vllm")
+        print(f"  DiT model: {args.config_path}")
+        print(f"  LM model: {args.lm_model_path}")
+        print(f"  Backend: {args.backend}")
     try:
         init_params = None
             # Prepare initialization parameters for UI
             init_params = {
                 'pre_initialized': True,
+                'service_mode': args.service_mode,
                 'checkpoint': args.checkpoint,
                 'config_path': args.config_path,
                 'device': args.device,

acestep/gradio_ui/events/results_handlers.py CHANGED Viewed

@@ -266,7 +266,18 @@ def _build_generation_info(
     """
     info_parts = []
-    # Part 1: LM-generated metadata (if available)
     if lm_metadata:
         metadata_lines = []
         if lm_metadata.get('bpm'):
@@ -288,7 +299,7 @@ def _build_generation_info(
             metadata_section = "**🤖 LM-Generated Metadata:**\n" + "\n".join(metadata_lines)
             info_parts.append(metadata_section)
-    # Part 2: Time costs (formatted and beautified)
     if time_costs:
         time_lines = []
@@ -337,16 +348,11 @@ def _build_generation_info(
             if auto_lrc_time > 0:
                 time_lines.append(f"  - Auto LRC: {auto_lrc_time:.2f}s")
-        # Pipeline total
-        pipeline_total = time_costs.get('pipeline_total_time', 0.0)
-        if pipeline_total > 0:
-            time_lines.append(f"\n**⏱️ Pipeline Total: {pipeline_total:.2f}s**")
         if time_lines:
             time_section = "\n".join(time_lines)
             info_parts.append(time_section)
-    # Part 3: Generation summary
     summary_lines = [
         "**🎵 Generation Complete**",
         f"  - **Seeds:** {seed_value}",
@@ -355,6 +361,11 @@ def _build_generation_info(
     ]
     info_parts.append("\n".join(summary_lines))
     # Combine all parts
     return "\n\n".join(info_parts)

     """
     info_parts = []
+    # Part 1: Per-track average time (prominently displayed at the top)
+    # Only count model time (LM + DiT), not post-processing like audio conversion
+    if time_costs and num_audios > 0:
+        lm_total = time_costs.get('lm_total_time', 0.0)
+        dit_total = time_costs.get('dit_total_time_cost', 0.0)
+        model_total = lm_total + dit_total
+        if model_total > 0:
+            avg_time_per_track = model_total / num_audios
+            avg_section = f"**🎯 Average Time per Track: {avg_time_per_track:.2f}s** ({num_audios} track(s))"
+            info_parts.append(avg_section)
+    # Part 2: LM-generated metadata (if available)
     if lm_metadata:
         metadata_lines = []
         if lm_metadata.get('bpm'):
             metadata_section = "**🤖 LM-Generated Metadata:**\n" + "\n".join(metadata_lines)
             info_parts.append(metadata_section)
+    # Part 3: Time costs breakdown (formatted and beautified)
     if time_costs:
         time_lines = []
             if auto_lrc_time > 0:
                 time_lines.append(f"  - Auto LRC: {auto_lrc_time:.2f}s")
         if time_lines:
             time_section = "\n".join(time_lines)
             info_parts.append(time_section)
+    # Part 4: Generation summary
     summary_lines = [
         "**🎵 Generation Complete**",
         f"  - **Seeds:** {seed_value}",
     ]
     info_parts.append("\n".join(summary_lines))
+    # Part 5: Pipeline total time (at the end)
+    pipeline_total = time_costs.get('pipeline_total_time', 0.0) if time_costs else 0.0
+    if pipeline_total > 0:
+        info_parts.append(f"**⏱️ Total Time: {pipeline_total:.2f}s**")
     # Combine all parts
     return "\n\n".join(info_parts)

acestep/gradio_ui/interfaces/__init__.py CHANGED Viewed

@@ -78,7 +78,8 @@ def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_para
         results_section = create_results_section(dit_handler)
         # Training Section (LoRA training and dataset builder)
-        training_section = create_training_section(dit_handler, llm_handler)
         # Connect event handlers
         setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section)

         results_section = create_results_section(dit_handler)
         # Training Section (LoRA training and dataset builder)
+        # Pass init_params to support hiding in service mode
+        training_section = create_training_section(dit_handler, llm_handler, init_params=init_params)
         # Connect event handlers
         setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section)

acestep/gradio_ui/interfaces/generation.py CHANGED Viewed

@@ -26,6 +26,9 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
     # Check if service is pre-initialized
     service_pre_initialized = init_params is not None and init_params.get('pre_initialized', False)
     # Get current language from init_params if available
     current_language = init_params.get('language', language) if init_params else language
@@ -175,9 +178,11 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
             with gr.Column(scale=2):
                 with gr.Accordion(t("generation.required_inputs"), open=True):
                     # Task type
-                    # Determine initial task_type choices based on default model
-                    default_model_lower = (default_model or "").lower()
-                    if "turbo" in default_model_lower:
                         initial_task_choices = TASK_TYPES_TURBO
                     else:
                         initial_task_choices = TASK_TYPES_BASE
@@ -277,19 +282,20 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                             )
                     # Simple/Custom Mode Toggle
-                    with gr.Row():
                         generation_mode = gr.Radio(
                             choices=[
                                 (t("generation.mode_simple"), "simple"),
                                 (t("generation.mode_custom"), "custom"),
                             ],
-                            value="simple",
                             label=t("generation.mode_label"),
                             info=t("generation.mode_info"),
                         )
-                    # Simple Mode Components - visible only in Simple mode
-                    with gr.Group(visible=True) as simple_mode_group:
                         with gr.Row(equal_height=True):
                             simple_query_input = gr.Textbox(
                                 label=t("generation.simple_query_label"),
@@ -332,7 +338,8 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                     simple_sample_created = gr.State(value=False)
                 # Music Caption - wrapped in accordion that can be collapsed in Simple mode
-                with gr.Accordion(t("generation.caption_title"), open=False) as caption_accordion:
                     with gr.Row(equal_height=True):
                         captions = gr.Textbox(
                             label=t("generation.caption_label"),
@@ -349,7 +356,8 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                                 scale=2,
                             )
                 # Lyrics - wrapped in accordion that can be collapsed in Simple mode
-                with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
                     lyrics = gr.Textbox(
                         label=t("generation.lyrics_label"),
                         placeholder=t("generation.lyrics_placeholder"),
@@ -388,7 +396,8 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                         )
                 # Optional Parameters
-                with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
                     with gr.Row():
                         bpm = gr.Number(
                             label=t("generation.bpm_label"),
@@ -423,7 +432,8 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                             minimum=1,
                             maximum=8,
                             step=1,
-                            info=t("generation.batch_size_info")
                         )
         # Advanced Settings
@@ -463,7 +473,8 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                     choices=["mp3", "flac"],
                     value="mp3",
                     label=t("generation.audio_format_label"),
-                    info=t("generation.audio_format_info")
                 )
             with gr.Row():
@@ -583,6 +594,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                     value=False,
                     info=t("generation.constrained_debug_info"),
                     scale=1,
                 )
             with gr.Row():
@@ -591,12 +603,14 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                     value=False,
                     info=t("generation.auto_score_info"),
                     scale=1,
                 )
                 auto_lrc = gr.Checkbox(
                     label=t("generation.auto_lrc_label"),
                     value=False,
                     info=t("generation.auto_lrc_info"),
                     scale=1,
                 )
                 lm_batch_chunk_size = gr.Number(
                     label=t("generation.lm_batch_chunk_label"),
@@ -606,6 +620,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                     step=1,
                     info=t("generation.lm_batch_chunk_info"),
                     scale=1,
                 )
             with gr.Row():
@@ -626,13 +641,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                     label=t("generation.score_sensitivity_label"),
                     info=t("generation.score_sensitivity_info"),
                     scale=1,
-                )
-                output_alignment_preference = gr.Checkbox(
-                    label=t("generation.attention_focus_label"),
-                    value=False,
-                    info=t("generation.attention_focus_info"),
-                    interactive=False,
-                    scale=1,
                 )
         # Set generate_btn to interactive if service is pre-initialized
@@ -654,8 +663,9 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
             with gr.Column(scale=1, variant="compact"):
                 autogen_checkbox = gr.Checkbox(
                     label=t("generation.autogen_label"),
-                    value=True,
                     scale=1,
                 )
                 use_cot_caption = gr.Checkbox(
                     label=t("generation.caption_rewrite_label"),
@@ -741,7 +751,6 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
         "infer_method": infer_method,
         "custom_timesteps": custom_timesteps,
         "audio_format": audio_format,
-        "output_alignment_preference": output_alignment_preference,
         "think_checkbox": think_checkbox,
         "autogen_checkbox": autogen_checkbox,
         "generate_btn": generate_btn,

     # Check if service is pre-initialized
     service_pre_initialized = init_params is not None and init_params.get('pre_initialized', False)
+    # Check if running in service mode (restricted UI)
+    service_mode = init_params is not None and init_params.get('service_mode', False)
     # Get current language from init_params if available
     current_language = init_params.get('language', language) if init_params else language
             with gr.Column(scale=2):
                 with gr.Accordion(t("generation.required_inputs"), open=True):
                     # Task type
+                    # Determine initial task_type choices based on actual model in use
+                    # When service is pre-initialized, use config_path from init_params
+                    actual_model = init_params.get('config_path', default_model) if service_pre_initialized else default_model
+                    actual_model_lower = (actual_model or "").lower()
+                    if "turbo" in actual_model_lower:
                         initial_task_choices = TASK_TYPES_TURBO
                     else:
                         initial_task_choices = TASK_TYPES_BASE
                             )
                     # Simple/Custom Mode Toggle
+                    # In service mode: only Custom mode, hide the toggle
+                    with gr.Row(visible=not service_mode):
                         generation_mode = gr.Radio(
                             choices=[
                                 (t("generation.mode_simple"), "simple"),
                                 (t("generation.mode_custom"), "custom"),
                             ],
+                            value="custom" if service_mode else "simple",
                             label=t("generation.mode_label"),
                             info=t("generation.mode_info"),
                         )
+                    # Simple Mode Components - hidden in service mode
+                    with gr.Group(visible=not service_mode) as simple_mode_group:
                         with gr.Row(equal_height=True):
                             simple_query_input = gr.Textbox(
                                 label=t("generation.simple_query_label"),
                     simple_sample_created = gr.State(value=False)
                 # Music Caption - wrapped in accordion that can be collapsed in Simple mode
+                # In service mode: auto-expand
+                with gr.Accordion(t("generation.caption_title"), open=service_mode) as caption_accordion:
                     with gr.Row(equal_height=True):
                         captions = gr.Textbox(
                             label=t("generation.caption_label"),
                                 scale=2,
                             )
                 # Lyrics - wrapped in accordion that can be collapsed in Simple mode
+                # In service mode: auto-expand
+                with gr.Accordion(t("generation.lyrics_title"), open=service_mode) as lyrics_accordion:
                     lyrics = gr.Textbox(
                         label=t("generation.lyrics_label"),
                         placeholder=t("generation.lyrics_placeholder"),
                         )
                 # Optional Parameters
+                # In service mode: auto-expand
+                with gr.Accordion(t("generation.optional_params"), open=service_mode) as optional_params_accordion:
                     with gr.Row():
                         bpm = gr.Number(
                             label=t("generation.bpm_label"),
                             minimum=1,
                             maximum=8,
                             step=1,
+                            info=t("generation.batch_size_info"),
+                            interactive=not service_mode  # Fixed in service mode
                         )
         # Advanced Settings
                     choices=["mp3", "flac"],
                     value="mp3",
                     label=t("generation.audio_format_label"),
+                    info=t("generation.audio_format_info"),
+                    interactive=not service_mode  # Fixed in service mode
                 )
             with gr.Row():
                     value=False,
                     info=t("generation.constrained_debug_info"),
                     scale=1,
+                    interactive=not service_mode  # Fixed in service mode
                 )
             with gr.Row():
                     value=False,
                     info=t("generation.auto_score_info"),
                     scale=1,
+                    interactive=not service_mode  # Fixed in service mode
                 )
                 auto_lrc = gr.Checkbox(
                     label=t("generation.auto_lrc_label"),
                     value=False,
                     info=t("generation.auto_lrc_info"),
                     scale=1,
+                    interactive=not service_mode  # Fixed in service mode
                 )
                 lm_batch_chunk_size = gr.Number(
                     label=t("generation.lm_batch_chunk_label"),
                     step=1,
                     info=t("generation.lm_batch_chunk_info"),
                     scale=1,
+                    interactive=not service_mode  # Fixed in service mode
                 )
             with gr.Row():
                     label=t("generation.score_sensitivity_label"),
                     info=t("generation.score_sensitivity_info"),
                     scale=1,
+                    visible=not service_mode  # Hidden in service mode
                 )
         # Set generate_btn to interactive if service is pre-initialized
             with gr.Column(scale=1, variant="compact"):
                 autogen_checkbox = gr.Checkbox(
                     label=t("generation.autogen_label"),
+                    value=False,  # Default to False for both service and local modes
                     scale=1,
+                    interactive=not service_mode  # Not selectable in service mode
                 )
                 use_cot_caption = gr.Checkbox(
                     label=t("generation.caption_rewrite_label"),
         "infer_method": infer_method,
         "custom_timesteps": custom_timesteps,
         "audio_format": audio_format,
         "think_checkbox": think_checkbox,
         "autogen_checkbox": autogen_checkbox,
         "generate_btn": generate_btn,

acestep/gradio_ui/interfaces/training.py CHANGED Viewed

@@ -9,18 +9,22 @@ import gradio as gr
 from acestep.gradio_ui.i18n import t
-def create_training_section(dit_handler, llm_handler) -> dict:
     """Create the training tab section with dataset builder and training controls.
     Args:
         dit_handler: DiT handler instance
         llm_handler: LLM handler instance
     Returns:
         Dictionary of Gradio components for event handling
     """
-    with gr.Tab("🎓 LoRA Training"):
         gr.HTML("""
         <div style="text-align: center; padding: 10px; margin-bottom: 15px;">
             <h2>🎵 LoRA Training for ACE-Step</h2>

 from acestep.gradio_ui.i18n import t
+def create_training_section(dit_handler, llm_handler, init_params=None) -> dict:
     """Create the training tab section with dataset builder and training controls.
     Args:
         dit_handler: DiT handler instance
         llm_handler: LLM handler instance
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized.
     Returns:
         Dictionary of Gradio components for event handling
     """
+    # Check if running in service mode (hide training tab)
+    service_mode = init_params is not None and init_params.get('service_mode', False)
+    with gr.Tab("🎓 LoRA Training", visible=not service_mode):
         gr.HTML("""
         <div style="text-align: center; padding: 10px; margin-bottom: 15px;">
             <h2>🎵 LoRA Training for ACE-Step</h2>

acestep/inference.py CHANGED Viewed

@@ -399,7 +399,7 @@ def generate_music(
             # Determine infer_type based on whether we need audio codes
             # - "llm_dit": generates both metas and audio codes (two-phase internally)
             # - "dit": generates only metas (single phase)
-            infer_type = "llm_dit" if need_audio_codes else "dit"
             # Use chunk size from config, or default to batch_size if not set
             max_inference_batch_size = int(config.lm_batch_chunk_size) if config.lm_batch_chunk_size > 0 else actual_batch_size

             # Determine infer_type based on whether we need audio codes
             # - "llm_dit": generates both metas and audio codes (two-phase internally)
             # - "dit": generates only metas (single phase)
+            infer_type = "llm_dit" if need_audio_codes and params.thinking else "dit"
             # Use chunk size from config, or default to batch_size if not set
             max_inference_batch_size = int(config.lm_batch_chunk_size) if config.lm_batch_chunk_size > 0 else actual_batch_size

API.md → docs/en/API.md RENAMED Viewed

@@ -1,10 +1,28 @@
 # ACE-Step API Client Documentation
 This service provides an HTTP-based asynchronous music generation API.
 **Basic Workflow**:
 1. Call `POST /v1/music/generate` to submit a task and obtain a `job_id`.
 2. Call `GET /v1/jobs/{job_id}` to poll the task status until `status` is `succeeded` or `failed`.
 ---
@@ -25,10 +43,21 @@ Task status (`status`) includes the following types:
 - **URL**: `/v1/music/generate`
 - **Method**: `POST`
-- **Content-Type**: `application/json` or `multipart/form-data`
 ### 2.2 Request Parameters
 #### Method A: JSON Request (application/json)
 Suitable for passing only text parameters, or referencing audio file paths that already exist on the server.
@@ -43,6 +72,20 @@ Suitable for passing only text parameters, or referencing audio file paths that
 | `vocal_language` | string | `"en"` | Lyrics language (en, zh, ja, etc.) |
 | `audio_format` | string | `"mp3"` | Output format (mp3, wav, flac) |
 **thinking Semantics (Important)**:
 - `thinking=false`:
@@ -50,11 +93,11 @@ Suitable for passing only text parameters, or referencing audio file paths that
   - DiT runs in **text2music** mode and **ignores** any provided `audio_code_string`.
 - `thinking=true`:
   - The server will use 5Hz LM to generate `audio_code_string` (lm-dit behavior).
-  - DiT runs in **cover** mode and uses `audio_code_string`.
-**Metadata Auto-Completion (Always On)**:
-Regardless of `thinking`, if any of the following fields are missing, the server may call 5Hz LM to **fill only the missing fields** based on `caption`/`lyrics`:
 - `bpm`
 - `key_scale`
@@ -67,26 +110,37 @@ User-provided values always win; LM only fills the fields that are empty/missing
 | Parameter Name | Type | Default | Description |
 | :--- | :--- | :--- | :--- |
-| `bpm` | int | null | Specify tempo (BPM) |
-| `key_scale` | string | `""` | Key/scale (e.g., "C Major") |
-| `time_signature` | string | `""` | Time signature (e.g., "4/4") |
-| `audio_duration` | float | null | Generation duration (seconds) |
 **Audio Codes (Optional)**:
 | Parameter Name | Type | Default | Description |
 | :--- | :--- | :--- | :--- |
-| `audio_code_string` | string or string[] | `""` | Audio semantic tokens (5Hz) for `llm_dit`. If provided as an array, it should match `batch_size` (or the server batch size). |
 **Generation Control Parameters**:
 | Parameter Name | Type | Default | Description |
 | :--- | :--- | :--- | :--- |
-| `inference_steps` | int | `8` | Number of inference steps |
-| `guidance_scale` | float | `7.0` | Prompt guidance coefficient |
 | `use_random_seed` | bool | `true` | Whether to use random seed |
 | `seed` | int | `-1` | Specify seed (when use_random_seed=false) |
-| `batch_size` | int | null | Batch generation count |
 **5Hz LM Parameters (Optional, server-side)**:
@@ -94,26 +148,35 @@ These parameters control 5Hz LM sampling, used for metadata auto-completion and
 | Parameter Name | Type | Default | Description |
 | :--- | :--- | :--- | :--- |
-| `lm_model_path` | string | null | 5Hz LM checkpoint dir name (e.g. `acestep-5Hz-lm-0.6B`) |
 | `lm_backend` | string | `"vllm"` | `vllm` or `pt` |
 | `lm_temperature` | float | `0.85` | Sampling temperature |
-| `lm_cfg_scale` | float | `2.0` | CFG scale (>1 enables CFG) |
 | `lm_negative_prompt` | string | `"NO USER INPUT"` | Negative prompt used by CFG |
 | `lm_top_k` | int | null | Top-k (0/null disables) |
 | `lm_top_p` | float | `0.9` | Top-p (>=1 will be treated as disabled) |
 | `lm_repetition_penalty` | float | `1.0` | Repetition penalty |
 **Edit/Reference Audio Parameters** (requires absolute path on server):
 | Parameter Name | Type | Default | Description |
 | :--- | :--- | :--- | :--- |
 | `reference_audio_path` | string | null | Reference audio path (Style Transfer) |
 | `src_audio_path` | string | null | Source audio path (Repainting/Cover) |
-| `task_type` | string | `"text2music"` | Task type (text2music, cover, repaint) |
-| `instruction` | string | `"Fill..."` | Edit instruction |
-| `repainting_start` | float | `0.0` | Repainting start time |
-| `repainting_end` | float | null | Repainting end time |
-| `audio_cover_strength` | float | `1.0` | Cover strength |
 #### Method B: File Upload (multipart/form-data)
@@ -138,7 +201,7 @@ In addition to supporting all the above fields as Form Fields, the following fil
 ### 2.4 Usage Examples (cURL)
-**JSON Method**:
 ```bash
 curl -X POST http://localhost:8001/v1/music/generate \
@@ -146,11 +209,11 @@ curl -X POST http://localhost:8001/v1/music/generate \
   -d '{
     "caption": "upbeat pop song",
     "lyrics": "Hello world",
-    "inference_steps": 16
   }'
 ```
-**JSON Method (thinking=true: generate codes + fill missing metas)**:
 ```bash
 curl -X POST http://localhost:8001/v1/music/generate \
@@ -160,47 +223,69 @@ curl -X POST http://localhost:8001/v1/music/generate \
     "lyrics": "Hello world",
     "thinking": true,
     "lm_temperature": 0.85,
-    "lm_cfg_scale": 2.0,
-    "lm_top_k": null,
-    "lm_top_p": 0.9,
-    "lm_repetition_penalty": 1.0
   }'
 ```
-**JSON Method (thinking=false: do NOT generate codes, but fill missing metas)**:
-Example: user specifies `bpm` but omits `audio_duration`. The server may call LM to infer `duration` from `caption`/`lyrics` and use it only if the user did not set it.
 ```bash
 curl -X POST http://localhost:8001/v1/music/generate \
   -H 'Content-Type: application/json' \
   -d '{
-    "caption": "slow emotional ballad",
-    "lyrics": "...",
-    "thinking": false,
-    "bpm": 72
   }'
 ```
-When the server invokes the 5Hz LM (to fill metas and/or generate codes), the job `result` may include the following optional fields:
-- `bpm`
-- `duration`
-- `genres`
-- `keyscale`
-- `timesignature`
-- `metas` (raw-ish metadata dict)
-> Note: If you use `curl -d` but **forget** to add `-H 'Content-Type: application/json'`, curl will default to sending `application/x-www-form-urlencoded`, and older server versions will return 415.
-**Form Method (no file upload, application/x-www-form-urlencoded)**:
 ```bash
 curl -X POST http://localhost:8001/v1/music/generate \
-  -H 'Content-Type: application/x-www-form-urlencoded' \
-  --data-urlencode 'caption=upbeat pop song' \
-  --data-urlencode 'lyrics=Hello world' \
-  --data-urlencode 'inference_steps=16'
 ```
 **File Upload Method**:
@@ -230,11 +315,22 @@ The response contains basic task information, queue status, and final results.
 - `status`: Current status
 - `queue_position`: Current queue position (0 means running or completed)
 - `eta_seconds`: Estimated remaining wait time (seconds)
 - `result`: Result object when successful
-  - `audio_paths`: List of generated audio file URLs/paths
-  - `first_audio_path`: Preferred audio path
   - `generation_info`: Generation parameter details
   - `status_message`: Brief result description
 - `error`: Error information when failed
 ### 3.3 Response Examples
@@ -243,11 +339,12 @@ The response contains basic task information, queue status, and final results.
 ```json
 {
-  "job_id": "...",
   "status": "queued",
   "created_at": 1700000000.0,
   "queue_position": 5,
   "eta_seconds": 25.0,
   "result": null,
   "error": null
 }
@@ -257,19 +354,217 @@ The response contains basic task information, queue status, and final results.
 ```json
 {
-  "job_id": "...",
   "status": "succeeded",
   "created_at": 1700000000.0,
   "finished_at": 1700000010.0,
   "queue_position": 0,
   "result": {
-    "first_audio_path": "/tmp/generated_1.mp3",
-    "second_audio_path": "/tmp/generated_2.mp3",
-    "audio_paths": ["/tmp/generated_1.mp3", "/tmp/generated_2.mp3"],
-    "generation_info": "Steps: 8, Scale: 7.0 ...",
     "status_message": "✅ Generation completed successfully!",
-    "seed_value": "12345"
   },
   "error": null
 }
 ```

 # ACE-Step API Client Documentation
+**Language / 语言 / 言語:** [English](API.md) | [中文](../zh/API.md) | [日本語](../ja/API.md)
+---
 This service provides an HTTP-based asynchronous music generation API.
 **Basic Workflow**:
 1. Call `POST /v1/music/generate` to submit a task and obtain a `job_id`.
 2. Call `GET /v1/jobs/{job_id}` to poll the task status until `status` is `succeeded` or `failed`.
+3. Download audio files via `GET /v1/audio?path=...` URLs returned in the result.
+---
+## Table of Contents
+- [Task Status Description](#1-task-status-description)
+- [Create Generation Task](#2-create-generation-task)
+- [Query Task Results](#3-query-task-results)
+- [Random Sample Generation](#4-random-sample-generation)
+- [List Available Models](#5-list-available-models)
+- [Download Audio Files](#6-download-audio-files)
+- [Health Check](#7-health-check)
+- [Environment Variables](#8-environment-variables)
 ---
 - **URL**: `/v1/music/generate`
 - **Method**: `POST`
+- **Content-Type**: `application/json`, `multipart/form-data`, or `application/x-www-form-urlencoded`
 ### 2.2 Request Parameters
+#### Parameter Naming Convention
+The API accepts both **snake_case** and **camelCase** names for most parameters, and some parameters also have short aliases. For example:
+- `audio_duration` / `duration` / `audioDuration`
+- `key_scale` / `keyscale` / `keyScale`
+- `time_signature` / `timesignature` / `timeSignature`
+- `sample_query` / `sampleQuery` / `description` / `desc`
+- `use_format` / `useFormat` / `format`
+Additionally, metadata can be passed in a nested object (`metas`, `metadata`, or `user_metadata`).
 #### Method A: JSON Request (application/json)
 Suitable for passing only text parameters, or referencing audio file paths that already exist on the server.
 | `vocal_language` | string | `"en"` | Lyrics language (en, zh, ja, etc.) |
 | `audio_format` | string | `"mp3"` | Output format (mp3, wav, flac) |
+**Sample/Description Mode Parameters**:
+| Parameter Name | Type | Default | Description |
+| :--- | :--- | :--- | :--- |
+| `sample_mode` | bool | `false` | Enable random sample generation mode (auto-generates caption/lyrics/metas via LM). |
+| `sample_query` | string | `""` | Natural language description for sample generation (e.g., "a soft Bengali love song"). Aliases: `description`, `desc`. |
+| `use_format` | bool | `false` | Use LM to enhance/format the provided caption and lyrics. Alias: `format`. |
+**Multi-Model Support**:
+| Parameter Name | Type | Default | Description |
+| :--- | :--- | :--- | :--- |
+| `model` | string | null | Select which DiT model to use (e.g., `"acestep-v15-turbo"`, `"acestep-v15-turbo-rl"`). Use `/v1/models` to list available models. If not specified, uses the default model. |
 **thinking Semantics (Important)**:
 - `thinking=false`:
   - DiT runs in **text2music** mode and **ignores** any provided `audio_code_string`.
 - `thinking=true`:
   - The server will use 5Hz LM to generate `audio_code_string` (lm-dit behavior).
+  - DiT runs with LM-generated codes for enhanced music quality.
+**Metadata Auto-Completion (Conditional)**:
+When `use_cot_caption=true`, `use_cot_language=true`, or any metadata fields are missing, the server may call the 5Hz LM to fill in the missing fields based on `caption`/`lyrics`:
 - `bpm`
 - `key_scale`
 | Parameter Name | Type | Default | Description |
 | :--- | :--- | :--- | :--- |
+| `bpm` | int | null | Specify tempo (BPM), range 30-300 |
+| `key_scale` | string | `""` | Key/scale (e.g., "C Major", "Am"). Aliases: `keyscale`, `keyScale` |
+| `time_signature` | string | `""` | Time signature (2, 3, 4, 6 for 2/4, 3/4, 4/4, 6/8). Aliases: `timesignature`, `timeSignature` |
+| `audio_duration` | float | null | Generation duration (seconds), range 10-600. Aliases: `duration`, `target_duration` |
 **Audio Codes (Optional)**:
 | Parameter Name | Type | Default | Description |
 | :--- | :--- | :--- | :--- |
+| `audio_code_string` | string or string[] | `""` | Audio semantic tokens (5Hz) for `llm_dit`. Alias: `audioCodeString` |
 **Generation Control Parameters**:
 | Parameter Name | Type | Default | Description |
 | :--- | :--- | :--- | :--- |
+| `inference_steps` | int | `8` | Number of inference steps. Turbo model: 1-20 (recommended 8). Base model: 1-200 (recommended 32-64). |
+| `guidance_scale` | float | `7.0` | Prompt guidance coefficient. Only effective for base model. |
 | `use_random_seed` | bool | `true` | Whether to use random seed |
 | `seed` | int | `-1` | Specify seed (when use_random_seed=false) |
+| `batch_size` | int | `2` | Batch generation count (max 8) |
+**Advanced DiT Parameters**:
+| Parameter Name | Type | Default | Description |
+| :--- | :--- | :--- | :--- |
+| `shift` | float | `3.0` | Timestep shift factor (range 1.0-5.0). Only effective for base models, not turbo models. |
+| `infer_method` | string | `"ode"` | Diffusion inference method: `"ode"` (Euler, faster) or `"sde"` (stochastic). |
+| `timesteps` | string | null | Custom timesteps as comma-separated values (e.g., `"0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0"`). Overrides `inference_steps` and `shift`. |
+| `use_adg` | bool | `false` | Use Adaptive Dual Guidance (base model only) |
+| `cfg_interval_start` | float | `0.0` | CFG application start ratio (0.0-1.0) |
+| `cfg_interval_end` | float | `1.0` | CFG application end ratio (0.0-1.0) |
 **5Hz LM Parameters (Optional, server-side)**:
 | Parameter Name | Type | Default | Description |
 | :--- | :--- | :--- | :--- |
+| `lm_model_path` | string | null | 5Hz LM checkpoint dir name (e.g. `acestep-5Hz-lm-0.6B-v3`) |
 | `lm_backend` | string | `"vllm"` | `vllm` or `pt` |
 | `lm_temperature` | float | `0.85` | Sampling temperature |
+| `lm_cfg_scale` | float | `2.5` | CFG scale (>1 enables CFG) |
 | `lm_negative_prompt` | string | `"NO USER INPUT"` | Negative prompt used by CFG |
 | `lm_top_k` | int | null | Top-k (0/null disables) |
 | `lm_top_p` | float | `0.9` | Top-p (>=1 will be treated as disabled) |
 | `lm_repetition_penalty` | float | `1.0` | Repetition penalty |
+**LM CoT (Chain-of-Thought) Parameters**:
+| Parameter Name | Type | Default | Description |
+| :--- | :--- | :--- | :--- |
+| `use_cot_caption` | bool | `true` | Let LM rewrite/enhance the input caption via CoT reasoning. Aliases: `cot_caption`, `cot-caption` |
+| `use_cot_language` | bool | `true` | Let LM detect vocal language via CoT. Aliases: `cot_language`, `cot-language` |
+| `constrained_decoding` | bool | `true` | Enable FSM-based constrained decoding for structured LM output. Aliases: `constrainedDecoding`, `constrained` |
+| `constrained_decoding_debug` | bool | `false` | Enable debug logging for constrained decoding |
 **Edit/Reference Audio Parameters** (requires absolute path on server):
 | Parameter Name | Type | Default | Description |
 | :--- | :--- | :--- | :--- |
 | `reference_audio_path` | string | null | Reference audio path (Style Transfer) |
 | `src_audio_path` | string | null | Source audio path (Repainting/Cover) |
+| `task_type` | string | `"text2music"` | Task type: `text2music`, `cover`, `repaint`, `lego`, `extract`, `complete` |
+| `instruction` | string | auto | Edit instruction (auto-generated based on task_type if not provided) |
+| `repainting_start` | float | `0.0` | Repainting start time (seconds) |
+| `repainting_end` | float | null | Repainting end time (seconds), -1 for end of audio |
+| `audio_cover_strength` | float | `1.0` | Cover strength (0.0-1.0). Use lower values (e.g., 0.2) for style transfer. |
 #### Method B: File Upload (multipart/form-data)
 ### 2.4 Usage Examples (cURL)
+**Basic JSON Method**:
 ```bash
 curl -X POST http://localhost:8001/v1/music/generate \
   -H 'Content-Type: application/json' \
   -d '{
     "caption": "upbeat pop song",
     "lyrics": "Hello world",
+    "inference_steps": 8
   }'
 ```
+**With thinking=true (LM generates codes + fills missing metas)**:
 ```bash
 curl -X POST http://localhost:8001/v1/music/generate \
   -H 'Content-Type: application/json' \
   -d '{
     "caption": "upbeat pop song",
     "lyrics": "Hello world",
     "thinking": true,
     "lm_temperature": 0.85,
+    "lm_cfg_scale": 2.5
   }'
 ```
+**Description-driven generation (sample_query)**:
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "sample_query": "a soft Bengali love song for a quiet evening",
+    "thinking": true
+  }'
+```
+**With format enhancement (use_format=true)**:
 ```bash
 curl -X POST http://localhost:8001/v1/music/generate \
   -H 'Content-Type: application/json' \
   -d '{
+    "caption": "pop rock",
+    "lyrics": "[Verse 1]\nWalking down the street...",
+    "use_format": true,
+    "thinking": true
   }'
 ```
+**Select specific model**:
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "electronic dance music",
+    "model": "acestep-v15-turbo-rl",
+    "thinking": true
+  }'
+```
+**With custom timesteps**:
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "jazz piano trio",
+    "timesteps": "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0",
+    "thinking": true
+  }'
+```
+**With thinking=false (DiT only, but fill missing metas)**:
 ```bash
 curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "slow emotional ballad",
+    "lyrics": "...",
+    "thinking": false,
+    "bpm": 72
+  }'
 ```
 **File Upload Method**:
 - `status`: Current status
 - `queue_position`: Current queue position (0 means running or completed)
 - `eta_seconds`: Estimated remaining wait time (seconds)
+- `avg_job_seconds`: Average job duration (for ETA estimation)
 - `result`: Result object when successful
+  - `audio_paths`: List of generated audio file URLs (use with `/v1/audio` endpoint)
+  - `first_audio_path`: First audio path (URL)
+  - `second_audio_path`: Second audio path (URL, if batch_size >= 2)
   - `generation_info`: Generation parameter details
   - `status_message`: Brief result description
+  - `seed_value`: Comma-separated seed values used
+  - `metas`: Complete metadata dict
+  - `bpm`: Detected/used BPM
+  - `duration`: Detected/used duration
+  - `keyscale`: Detected/used key scale
+  - `timesignature`: Detected/used time signature
+  - `genres`: Detected genres (if available)
+  - `lm_model`: Name of the LM model used
+  - `dit_model`: Name of the DiT model used
 - `error`: Error information when failed
 ### 3.3 Response Examples
 ```json
 {
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
   "status": "queued",
   "created_at": 1700000000.0,
   "queue_position": 5,
   "eta_seconds": 25.0,
+  "avg_job_seconds": 5.0,
   "result": null,
   "error": null
 }
 ```json
 {
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
   "status": "succeeded",
   "created_at": 1700000000.0,
+  "started_at": 1700000001.0,
   "finished_at": 1700000010.0,
   "queue_position": 0,
   "result": {
+    "first_audio_path": "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fabc123.mp3",
+    "second_audio_path": "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fdef456.mp3",
+    "audio_paths": [
+      "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fabc123.mp3",
+      "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fdef456.mp3"
+    ],
+    "generation_info": "🎵 Generated 2 audios\n⏱️ Total: 8.5s\n🎲 Seeds: 12345,67890",
     "status_message": "✅ Generation completed successfully!",
+    "seed_value": "12345,67890",
+    "metas": {
+      "bpm": 120,
+      "duration": 30,
+      "keyscale": "C Major",
+      "timesignature": "4",
+      "caption": "upbeat pop song with catchy melody"
+    },
+    "bpm": 120,
+    "duration": 30,
+    "keyscale": "C Major",
+    "timesignature": "4",
+    "genres": null,
+    "lm_model": "acestep-5Hz-lm-0.6B-v3",
+    "dit_model": "acestep-v15-turbo-rl"
   },
   "error": null
 }
 ```
+---
+## 4. Random Sample Generation
+### 4.1 API Definition
+- **URL**: `/v1/music/random`
+- **Method**: `POST`
+This endpoint creates a sample-mode job that auto-generates caption, lyrics, and metadata via the 5Hz LM.
+### 4.2 Request Parameters
+| Parameter Name | Type | Default | Description |
+| :--- | :--- | :--- | :--- |
+| `thinking` | bool | `true` | Whether to also generate audio codes via LM |
+### 4.3 Response Example
+```json
+{
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
+  "status": "queued",
+  "queue_position": 1
+}
+```
+### 4.4 Usage Example
+```bash
+curl -X POST http://localhost:8001/v1/music/random \
+  -H 'Content-Type: application/json' \
+  -d '{"thinking": true}'
+```
+---
+## 5. List Available Models
+### 5.1 API Definition
+- **URL**: `/v1/models`
+- **Method**: `GET`
+Returns a list of available DiT models loaded on the server.
+### 5.2 Response Example
+```json
+{
+  "models": [
+    {
+      "name": "acestep-v15-turbo-rl",
+      "is_default": true
+    },
+    {
+      "name": "acestep-v15-turbo",
+      "is_default": false
+    }
+  ],
+  "default_model": "acestep-v15-turbo-rl"
+}
+```
+### 5.3 Usage Example
+```bash
+curl http://localhost:8001/v1/models
+```
+---
+## 6. Download Audio Files
+### 6.1 API Definition
+- **URL**: `/v1/audio`
+- **Method**: `GET`
+Download generated audio files by path.
+### 6.2 Request Parameters
+| Parameter Name | Type | Description |
+| :--- | :--- | :--- |
+| `path` | string | URL-encoded path to the audio file |
+### 6.3 Usage Example
+```bash
+# Download using the URL from job result
+curl "http://localhost:8001/v1/audio?path=%2Ftmp%2Fapi_audio%2Fabc123.mp3" -o output.mp3
+```
+---
+## 7. Health Check
+### 7.1 API Definition
+- **URL**: `/health`
+- **Method**: `GET`
+Returns service health status.
+### 7.2 Response Example
+```json
+{
+  "status": "ok",
+  "service": "ACE-Step API",
+  "version": "1.0"
+}
+```
+---
+## 8. Environment Variables
+The API server can be configured using environment variables:
+| Variable | Default | Description |
+| :--- | :--- | :--- |
+| `ACESTEP_API_HOST` | `127.0.0.1` | Server bind host |
+| `ACESTEP_API_PORT` | `8001` | Server bind port |
+| `ACESTEP_CONFIG_PATH` | `acestep-v15-turbo-rl` | Primary DiT model path |
+| `ACESTEP_CONFIG_PATH2` | (empty) | Secondary DiT model path (optional) |
+| `ACESTEP_CONFIG_PATH3` | (empty) | Third DiT model path (optional) |
+| `ACESTEP_DEVICE` | `auto` | Device for model loading |
+| `ACESTEP_USE_FLASH_ATTENTION` | `true` | Enable flash attention |
+| `ACESTEP_OFFLOAD_TO_CPU` | `false` | Offload models to CPU when idle |
+| `ACESTEP_OFFLOAD_DIT_TO_CPU` | `false` | Offload DiT specifically to CPU |
+| `ACESTEP_LM_MODEL_PATH` | `acestep-5Hz-lm-0.6B-v3` | Default 5Hz LM model |
+| `ACESTEP_LM_BACKEND` | `vllm` | LM backend (vllm or pt) |
+| `ACESTEP_LM_DEVICE` | (same as ACESTEP_DEVICE) | Device for LM |
+| `ACESTEP_LM_OFFLOAD_TO_CPU` | `false` | Offload LM to CPU |
+| `ACESTEP_QUEUE_MAXSIZE` | `200` | Maximum queue size |
+| `ACESTEP_QUEUE_WORKERS` | `1` | Number of queue workers |
+| `ACESTEP_AVG_JOB_SECONDS` | `5.0` | Initial average job duration estimate |
+| `ACESTEP_TMPDIR` | `.cache/acestep/tmp` | Temporary directory for files |
+---
+## Error Handling
+**HTTP Status Codes**:
+- `200`: Success
+- `400`: Invalid request (bad JSON, missing fields)
+- `404`: Job not found
+- `415`: Unsupported Content-Type
+- `429`: Server busy (queue is full)
+- `500`: Internal server error
+**Error Response Format**:
+```json
+{
+  "detail": "Error message describing the issue"
+}
+```
+---
+## Best Practices
+1. **Use `thinking=true`** for best quality results with LM-enhanced generation.
+2. **Use `sample_query`/`description`** for quick generation from natural language descriptions.
+3. **Use `use_format=true`** when you have caption/lyrics but want LM to enhance them.
+4. **Poll job status** at reasonable intervals (e.g., every 1-2 seconds) to avoid overloading the server.
+5. **Check `avg_job_seconds`** in the response to estimate wait times.
+6. **Use multi-model support** by setting `ACESTEP_CONFIG_PATH2` and `ACESTEP_CONFIG_PATH3` environment variables, then select with the `model` parameter.
+7. **For production**, always set proper Content-Type headers to avoid 415 errors.

docs/en/GRADIO_GUIDE.md ADDED Viewed

	@@ -0,0 +1,551 @@

+# ACE-Step Gradio Demo User Guide
+**Language / 语言 / 言語:** [English](GRADIO_GUIDE.md) | [中文](../zh/GRADIO_GUIDE.md) | [日本語](../ja/GRADIO_GUIDE.md)
+---
+This guide provides comprehensive documentation for using the ACE-Step Gradio web interface for music generation, including all features and settings.
+## Table of Contents
+- [Getting Started](#getting-started)
+- [Service Configuration](#service-configuration)
+- [Generation Modes](#generation-modes)
+- [Task Types](#task-types)
+- [Input Parameters](#input-parameters)
+- [Advanced Settings](#advanced-settings)
+- [Results Section](#results-section)
+- [LoRA Training](#lora-training)
+- [Tips and Best Practices](#tips-and-best-practices)
+---
+## Getting Started
+### Launching the Demo
+```bash
+# Basic launch
+python app.py
+# With pre-initialization
+python app.py --config acestep-v15-turbo-rl --init-llm
+# With specific port
+python app.py --port 7860
+```
+### Interface Overview
+The Gradio interface consists of several main sections:
+1. **Service Configuration** - Model loading and initialization
+2. **Required Inputs** - Task type, audio uploads, and generation mode
+3. **Music Caption & Lyrics** - Text inputs for generation
+4. **Optional Parameters** - Metadata like BPM, key, duration
+5. **Advanced Settings** - Fine-grained control over generation
+6. **Results** - Generated audio playback and management
+---
+## Service Configuration
+### Model Selection
+| Setting | Description |
+|---------|-------------|
+| **Checkpoint File** | Select a trained model checkpoint (if available) |
+| **Main Model Path** | Choose the DiT model configuration (e.g., `acestep-v15-turbo`, `acestep-v15-turbo-rl`) |
+| **Device** | Processing device: `auto` (recommended), `cuda`, or `cpu` |
+### 5Hz LM Configuration
+| Setting | Description |
+|---------|-------------|
+| **5Hz LM Model Path** | Select the language model (e.g., `acestep-5Hz-lm-0.6B`, `acestep-5Hz-lm-0.6B-v3`) |
+| **5Hz LM Backend** | `vllm` (faster, recommended) or `pt` (PyTorch, more compatible) |
+| **Initialize 5Hz LM** | Check to load the LM during initialization (required for thinking mode) |
+### Performance Options
+| Setting | Description |
+|---------|-------------|
+| **Use Flash Attention** | Enable for faster inference (requires flash_attn package) |
+| **Offload to CPU** | Offload models to CPU when idle to save GPU memory |
+| **Offload DiT to CPU** | Specifically offload the DiT model to CPU |
+### LoRA Adapter
+| Setting | Description |
+|---------|-------------|
+| **LoRA Path** | Path to trained LoRA adapter directory |
+| **Load LoRA** | Load the specified LoRA adapter |
+| **Unload** | Remove the currently loaded LoRA |
+| **Use LoRA** | Enable/disable the loaded LoRA for inference |
+### Initialization
+Click **Initialize Service** to load the models. The status box will show progress and confirmation.
+---
+## Generation Modes
+### Simple Mode
+Simple mode is designed for quick, natural language-based music generation.
+**How to use:**
+1. Select "Simple" in the Generation Mode radio button
+2. Enter a natural language description in the "Song Description" field
+3. Optionally check "Instrumental" if you don't want vocals
+4. Optionally select a preferred vocal language
+5. Click **Create Sample** to generate caption, lyrics, and metadata
+6. Review the generated content in the expanded sections
+7. Click **Generate Music** to create the audio
+**Example descriptions:**
+- "a soft Bengali love song for a quiet evening"
+- "upbeat electronic dance music with heavy bass drops"
+- "melancholic indie folk with acoustic guitar"
+- "jazz trio playing in a smoky bar"
+**Random Sample:** Click the 🎲 button to load a random example description.
+### Custom Mode
+Custom mode provides full control over all generation parameters.
+**How to use:**
+1. Select "Custom" in the Generation Mode radio button
+2. Manually fill in the Caption and Lyrics fields
+3. Set optional metadata (BPM, Key, Duration, etc.)
+4. Optionally click **Format** to enhance your input using the LM
+5. Configure advanced settings as needed
+6. Click **Generate Music** to create the audio
+---
+## Task Types
+### text2music (Default)
+Generate music from text descriptions and/or lyrics.
+**Use case:** Creating new music from scratch based on prompts.
+**Required inputs:** Caption or Lyrics (at least one)
+### cover
+Transform existing audio while maintaining structure but changing style.
+**Use case:** Creating cover versions in different styles.
+**Required inputs:**
+- Source Audio (upload in Audio Uploads section)
+- Caption describing the target style
+**Key parameter:** `Audio Cover Strength` (0.0-1.0)
+- Higher values maintain more of the original structure
+- Lower values allow more creative freedom
+### repaint
+Regenerate a specific time segment of audio.
+**Use case:** Fixing or modifying specific sections of generated music.
+**Required inputs:**
+- Source Audio
+- Repainting Start (seconds)
+- Repainting End (seconds, -1 for end of file)
+- Caption describing the desired content
+### lego (Base Model Only)
+Generate a specific instrument track in the context of existing audio.
+**Use case:** Adding instrument layers to backing tracks.
+**Required inputs:**
+- Source Audio
+- Track Name (select from dropdown)
+- Caption describing the track characteristics
+**Available tracks:** vocals, backing_vocals, drums, bass, guitar, keyboard, percussion, strings, synth, fx, brass, woodwinds
+### extract (Base Model Only)
+Extract/isolate a specific instrument track from mixed audio.
+**Use case:** Stem separation, isolating instruments.
+**Required inputs:**
+- Source Audio
+- Track Name to extract
+### complete (Base Model Only)
+Complete partial tracks with specified instruments.
+**Use case:** Auto-arranging incomplete compositions.
+**Required inputs:**
+- Source Audio
+- Track Names (multiple selection)
+- Caption describing the desired style
+---
+## Input Parameters
+### Required Inputs
+#### Task Type
+Select the generation task from the dropdown. The instruction field updates automatically based on the selected task.
+#### Audio Uploads
+| Field | Description |
+|-------|-------------|
+| **Reference Audio** | Optional audio for style reference |
+| **Source Audio** | Required for cover, repaint, lego, extract, complete tasks |
+| **Convert to Codes** | Extract 5Hz semantic codes from source audio |
+#### LM Codes Hints
+Pre-computed audio semantic codes can be pasted here to guide generation. Use the **Transcribe** button to analyze codes and extract metadata.
+### Music Caption
+The text description of the desired music. Be specific about:
+- Genre and style
+- Instruments
+- Mood and atmosphere
+- Tempo feel (if not specifying BPM)
+**Example:** "upbeat pop rock with electric guitars, driving drums, and catchy synth hooks"
+Click 🎲 to load a random example caption.
+### Lyrics
+Enter lyrics with structure tags:
+```
+[Verse 1]
+Walking down the street today
+Thinking of the words you used to say
+[Chorus]
+I'm moving on, I'm staying strong
+This is where I belong
+[Verse 2]
+...
+```
+**Instrumental checkbox:** Check this to generate instrumental music regardless of lyrics content.
+**Vocal Language:** Select the language for vocals. Use "unknown" for auto-detection or instrumental tracks.
+**Format button:** Click to enhance caption and lyrics using the 5Hz LM.
+### Optional Parameters
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| **BPM** | Auto | Tempo in beats per minute (30-300) |
+| **Key Scale** | Auto | Musical key (e.g., "C Major", "Am", "F# minor") |
+| **Time Signature** | Auto | Time signature: 2 (2/4), 3 (3/4), 4 (4/4), 6 (6/8) |
+| **Audio Duration** | Auto/-1 | Target length in seconds (10-600). -1 for automatic |
+| **Batch Size** | 2 | Number of audio variations to generate (1-8) |
+---
+## Advanced Settings
+### DiT Parameters
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| **Inference Steps** | 8 | Denoising steps. Turbo: 1-20, Base: 1-200 |
+| **Guidance Scale** | 7.0 | CFG strength (base model only). Higher values follow the prompt more closely |
+| **Seed** | -1 | Random seed. Use comma-separated values for batches |
+| **Random Seed** | ✓ | When checked, generates random seeds |
+| **Audio Format** | mp3 | Output format: mp3, flac |
+| **Shift** | 3.0 | Timestep shift factor (1.0-5.0). Recommended 3.0 for turbo |
+| **Inference Method** | ode | ode (Euler, faster) or sde (stochastic) |
+| **Custom Timesteps** | - | Override timesteps (e.g., "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0") |
+### Base Model Only Parameters
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| **Use ADG** | ✗ | Enable Adaptive Dual Guidance for better quality |
+| **CFG Interval Start** | 0.0 | When to start applying CFG (0.0-1.0) |
+| **CFG Interval End** | 1.0 | When to stop applying CFG (0.0-1.0) |
+### LM Parameters
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| **LM Temperature** | 0.85 | Sampling temperature (0.0-2.0). Higher = more creative |
+| **LM CFG Scale** | 2.0 | LM guidance strength (1.0-3.0) |
+| **LM Top-K** | 0 | Top-K sampling. 0 disables |
+| **LM Top-P** | 0.9 | Nucleus sampling (0.0-1.0) |
+| **LM Negative Prompt** | "NO USER INPUT" | Negative prompt for CFG |
+### CoT (Chain-of-Thought) Options
+| Option | Default | Description |
+|--------|---------|-------------|
+| **CoT Metas** | ✓ | Generate metadata via LM reasoning |
+| **CoT Language** | ✓ | Detect vocal language via LM |
+| **Constrained Decoding Debug** | ✗ | Enable debug logging |
+### Generation Options
+| Option | Default | Description |
+|--------|---------|-------------|
+| **LM Codes Strength** | 1.0 | How strongly LM codes influence generation (0.0-1.0) |
+| **Auto Score** | ✗ | Automatically calculate quality scores |
+| **Auto LRC** | ✗ | Automatically generate lyrics timestamps |
+| **LM Batch Chunk Size** | 8 | Max items per LM batch (GPU memory) |
+### Main Generation Controls
+| Control | Description |
+|---------|-------------|
+| **Think** | Enable 5Hz LM for code generation and metadata |
+| **ParallelThinking** | Enable parallel LM batch processing |
+| **CaptionRewrite** | Let LM enhance the input caption |
+| **AutoGen** | Automatically start next batch after completion |
+---
+## Results Section
+### Generated Audio
+Up to 8 audio samples are displayed based on batch size. Each sample includes:
+- **Audio Player** - Play, pause, and download the generated audio
+- **Send To Src** - Send this audio to the Source Audio input for further processing
+- **Save** - Save audio and metadata to a JSON file
+- **Score** - Calculate perplexity-based quality score
+- **LRC** - Generate lyrics timestamps (LRC format)
+### Details Accordion
+Click "Score & LRC & LM Codes" to expand and view:
+- **LM Codes** - The 5Hz semantic codes for this sample
+- **Quality Score** - Perplexity-based quality metric
+- **Lyrics Timestamps** - LRC format timing data
+### Batch Navigation
+| Control | Description |
+|---------|-------------|
+| **◀ Previous** | View the previous batch |
+| **Batch Indicator** | Shows current batch position (e.g., "Batch 1 / 3") |
+| **Next Batch Status** | Shows background generation progress |
+| **Next ▶** | View the next batch (triggers generation if AutoGen is on) |
+### Restore Parameters
+Click **Apply These Settings to UI** to restore all generation parameters from the current batch back to the input fields. Useful for iterating on a good result.
+### Batch Results
+The "Batch Results & Generation Details" accordion contains:
+- **All Generated Files** - Download all files from all batches
+- **Generation Details** - Detailed information about the generation process
+---
+## LoRA Training
+The LoRA Training tab provides tools for creating custom LoRA adapters.
+### Dataset Builder Tab
+#### Step 1: Load or Scan
+**Option A: Load Existing Dataset**
+1. Enter the path to a previously saved dataset JSON
+2. Click **Load**
+**Option B: Scan New Directory**
+1. Enter the path to your audio folder
+2. Click **Scan** to find audio files (wav, mp3, flac, ogg, opus)
+#### Step 2: Configure Dataset
+| Setting | Description |
+|---------|-------------|
+| **Dataset Name** | Name for your dataset |
+| **All Instrumental** | Check if all tracks have no vocals |
+| **Custom Activation Tag** | Unique tag to activate this LoRA's style |
+| **Tag Position** | Where to place the tag: Prepend, Append, or Replace caption |
+#### Step 3: Auto-Label
+Click **Auto-Label All** to generate metadata for all audio files:
+- Caption (music description)
+- BPM
+- Key
+- Time Signature
+The **Skip Metas** option skips LLM labeling and uses N/A values instead.
+#### Step 4: Preview & Edit
+Use the slider to select samples and manually edit:
+- Caption
+- Lyrics
+- BPM, Key, Time Signature
+- Language
+- Instrumental flag
+Click **Save Changes** to update the sample.
+#### Step 5: Save Dataset
+Enter a save path and click **Save Dataset** to export as JSON.
+#### Step 6: Preprocess
+Convert the dataset to pre-computed tensors for fast training:
+1. Optionally load an existing dataset JSON
+2. Set the tensor output directory
+3. Click **Preprocess**
+This encodes audio to VAE latents, text to embeddings, and runs the condition encoder.
+### Train LoRA Tab
+#### Dataset Selection
+Enter the path to the preprocessed tensors directory and click **Load Dataset**.
+#### LoRA Settings
+| Setting | Default | Description |
+|---------|---------|-------------|
+| **LoRA Rank (r)** | 64 | Capacity of LoRA. Higher = more capacity, more memory |
+| **LoRA Alpha** | 128 | Scaling factor (typically 2x rank) |
+| **LoRA Dropout** | 0.1 | Dropout rate for regularization |
+#### Training Parameters
+| Setting | Default | Description |
+|---------|---------|-------------|
+| **Learning Rate** | 1e-4 | Optimization learning rate |
+| **Max Epochs** | 500 | Maximum training epochs |
+| **Batch Size** | 1 | Training batch size |
+| **Gradient Accumulation** | 1 | Effective batch = batch_size × accumulation |
+| **Save Every N Epochs** | 200 | Checkpoint save frequency |
+| **Shift** | 3.0 | Timestep shift for turbo model |
+| **Seed** | 42 | Random seed for reproducibility |
+#### Training Controls
+- **Start Training** - Begin the training process
+- **Stop Training** - Interrupt training
+- **Training Progress** - Shows current epoch and loss
+- **Training Log** - Detailed training output
+- **Training Loss Plot** - Visual loss curve
+#### Export LoRA
+After training, export the final adapter:
+1. Enter the export path
+2. Click **Export LoRA**
+---
+## Tips and Best Practices
+### For Best Quality
+1. **Use thinking mode** - Keep "Think" checkbox enabled for LM-enhanced generation
+2. **Be specific in captions** - Include genre, instruments, mood, and style details
+3. **Let LM detect metadata** - Leave BPM/Key/Duration empty for auto-detection
+4. **Use batch generation** - Generate 2-4 variations and pick the best
+### For Faster Generation
+1. **Use turbo model** - Select `acestep-v15-turbo` or `acestep-v15-turbo-rl`
+2. **Keep inference steps at 8** - Default is optimal for turbo
+3. **Reduce batch size** - Lower batch size if you need quick results
+4. **Disable AutoGen** - Manual control over batch generation
+### For Consistent Results
+1. **Set a specific seed** - Uncheck "Random Seed" and enter a seed value
+2. **Save good results** - Use "Save" to export parameters for reproduction
+3. **Use "Apply These Settings"** - Restore parameters from a good batch
+### For Long-form Music
+1. **Set explicit duration** - Specify duration in seconds
+2. **Use repaint task** - Fix problematic sections after initial generation
+3. **Chain generations** - Use "Send To Src" to build upon previous results
+### For Style Consistency
+1. **Train a LoRA** - Create a custom adapter for your style
+2. **Use reference audio** - Upload style reference in Audio Uploads
+3. **Use consistent captions** - Maintain similar descriptive language
+### Troubleshooting
+**No audio generated:**
+- Check that the model is initialized (green status message)
+- Ensure 5Hz LM is initialized if using thinking mode
+- Check the status output for error messages
+**Poor quality results:**
+- Increase inference steps (for base model)
+- Adjust guidance scale
+- Try different seeds
+- Make caption more specific
+**Out of memory:**
+- Reduce batch size
+- Enable CPU offloading
+- Reduce LM batch chunk size
+**LM not working:**
+- Ensure "Initialize 5Hz LM" was checked during initialization
+- Check that a valid LM model path is selected
+- Verify that the vllm or PyTorch backend is available
+---
+## Keyboard Shortcuts
+The Gradio interface supports standard web shortcuts:
+- **Tab** - Move between input fields
+- **Enter** - Submit text inputs
+- **Space** - Toggle checkboxes
+---
+## Language Support
+The interface supports multiple UI languages:
+- **English** (en)
+- **Chinese** (zh)
+- **Japanese** (ja)
+Select your preferred language in the Service Configuration section.
+---
+For more information, see:
+- Main README: [`../../README.md`](../../README.md)
+- REST API Documentation: [`API.md`](API.md)
+- Python Inference API: [`INFERENCE.md`](INFERENCE.md)

INFERENCE.md → docs/en/INFERENCE.md RENAMED Viewed

@@ -1,5 +1,9 @@
 # ACE-Step Inference API Documentation
 This document provides comprehensive documentation for the ACE-Step inference API, including parameter specifications for all supported task types.
 ## Table of Contents
@@ -9,6 +13,7 @@ This document provides comprehensive documentation for the ACE-Step inference AP
 - [GenerationParams Parameters](#generationparams-parameters)
 - [GenerationConfig Parameters](#generationconfig-parameters)
 - [Task Types](#task-types)
 - [Complete Examples](#complete-examples)
 - [Best Practices](#best-practices)
@@ -71,7 +76,9 @@ else:
 ## API Overview
-### Main Function
 ```python
 def generate_music(
@@ -84,6 +91,63 @@ def generate_music(
 ) -> GenerationResult
 ```
 ### Configuration Objects
 The API uses two configuration dataclasses:
@@ -123,6 +187,9 @@ class GenerationParams:
     use_adg: bool = False
     cfg_interval_start: float = 0.0
     cfg_interval_end: float = 1.0
     repainting_start: float = 0.0
     repainting_end: float = -1
@@ -165,7 +232,9 @@ class GenerationConfig:
     audio_format: str = "flac"
 ```
-### Result Object
 ```python
 @dataclass
@@ -196,6 +265,67 @@ Each item in `audios` list contains:
 }
 ```
 ---
 ## GenerationParams Parameters
@@ -222,7 +352,7 @@ Each item in `audios` list contains:
 | Parameter | Type | Default | Description |
 |-----------|------|---------|-------------|
-| `inference_steps` | `int` | `8` | Number of denoising steps. Turbo model: 1-8 (recommended 8). Base model: 1-100 (recommended 32-64). Higher = better quality but slower. |
 | `guidance_scale` | `float` | `7.0` | Classifier-free guidance scale (1.0-15.0). Higher values increase adherence to text prompt. Only supported for non-turbo model. Typical range: 5.0-9.0. |
 | `seed` | `int` | `-1` | Random seed for reproducibility. Use `-1` for random seed, or any positive integer for fixed seed. |
@@ -233,6 +363,9 @@ Each item in `audios` list contains:
 | `use_adg` | `bool` | `False` | Use Adaptive Dual Guidance (base model only). Improves quality at the cost of speed. |
 | `cfg_interval_start` | `float` | `0.0` | CFG application start ratio (0.0-1.0). Controls when to start applying classifier-free guidance. |
 | `cfg_interval_end` | `float` | `1.0` | CFG application end ratio (0.0-1.0). Controls when to stop applying classifier-free guidance. |
 ### Task-Specific Parameters
@@ -475,6 +608,132 @@ params = GenerationParams(
 ---
 ## Complete Examples
 ### Example 1: Simple Text-to-Music Generation
@@ -528,7 +787,95 @@ config = GenerationConfig(batch_size=1)
 result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
 ```
-### Example 3: Style Cover with LM Reasoning
 ```python
 params = GenerationParams(
@@ -551,24 +898,7 @@ if result.extra_outputs.get("lm_metadata"):
     print(f"LM detected Key: {lm_meta.get('keyscale')}")
 ```
-### Example 4: Repaint Section of Audio
-```python
-params = GenerationParams(
-    task_type="repaint",
-    src_audio="generated_track.mp3",
-    repainting_start=15.0,  # Start at 15 seconds
-    repainting_end=25.0,    # End at 25 seconds
-    caption="dramatic orchestral buildup",
-    inference_steps=32,  # Higher quality for base model
-)
-config = GenerationConfig(batch_size=1)
-result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
-```
-### Example 5: Batch Generation with Specific Seeds
 ```python
 params = GenerationParams(
@@ -591,7 +921,7 @@ if result.success:
         print(f"  Seed {audio['params']['seed']}: {audio['path']}")
 ```
-### Example 6: High-Quality Generation (Base Model)
 ```python
 params = GenerationParams(
@@ -602,6 +932,7 @@ params = GenerationParams(
     use_adg=True,           # Adaptive Dual Guidance
     cfg_interval_start=0.0,
     cfg_interval_end=1.0,
     seed=42,                # Reproducible results
 )
@@ -614,54 +945,25 @@ config = GenerationConfig(
 result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
 ```
-### Example 7: Extract Vocals from Mix
-```python
-params = GenerationParams(
-    task_type="extract",
-    src_audio="full_song_mix.mp3",
-    instruction="Extract the vocals track from the audio:",
-)
-config = GenerationConfig(batch_size=1)
-result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
-if result.success:
-    print(f"Extracted vocals: {result.audios[0]['path']}")
-```
-### Example 8: Add Guitar Track (Lego)
 ```python
-params = GenerationParams(
-    task_type="lego",
-    src_audio="drums_and_bass.mp3",
-    instruction="Generate the guitar track based on the audio context:",
-    caption="funky rhythm guitar with wah-wah effect",
-    repainting_start=0.0,
-    repainting_end=-1,  # Full duration
-)
-config = GenerationConfig(batch_size=1)
-result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
-```
-### Example 9: Instrumental Generation
-```python
-params = GenerationParams(
-    task_type="text2music",
-    caption="upbeat electronic dance music",
-    instrumental=True,  # Force instrumental output
-    duration=120,
-    bpm=128,
 )
-config = GenerationConfig(batch_size=2)
-result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
 ```
 ---
@@ -697,12 +999,13 @@ caption="fast slow music"  # Conflicting tempos
 - Use base model with `inference_steps=64` or higher
 - Enable `use_adg=True`
 - Set `guidance_scale=7.0-9.0`
 - Use lossless audio format (`audio_format="wav"`)
 **For Speed**:
 - Use turbo model with `inference_steps=8`
 - Disable ADG (`use_adg=False`)
-- Lower `guidance_scale=5.0-7.0`
 - Use compressed format (`audio_format="mp3"`) or default FLAC
 **For Consistency**:
@@ -808,6 +1111,9 @@ if result.success:
 **Issue**: Seeds not being respected
 - **Solution**: Set `use_random_seed=False` in config and provide `seeds` list or `seed` in params
 ---
 ## API Reference Summary
@@ -852,7 +1158,16 @@ class GenerationResult:
 ## Version History
-- **v1.5.1**: Current version with refactored inference API
   - Split `GenerationConfig` into `GenerationParams` and `GenerationConfig`
   - Renamed parameters for consistency (`key_scale` → `keyscale`, `time_signature` → `timesignature`, `audio_duration` → `duration`, `use_llm_thinking` → `thinking`, `audio_code_string` → `audio_codes`)
   - Added `instrumental` parameter
@@ -864,7 +1179,7 @@ class GenerationResult:
   - Simplified `GenerationResult` structure with unified `audios` list
   - Added unified `time_costs` in `extra_outputs`
-- **v1.5**: Previous version
   - Introduced `GenerationConfig` and `GenerationResult` dataclasses
   - Simplified parameter passing
   - Added comprehensive documentation
@@ -872,6 +1187,7 @@ class GenerationResult:
 ---
 For more information, see:
-- Main README: [`README.md`](README.md)
 - REST API Documentation: [`API.md`](API.md)
 - Project repository: [ACE-Step-1.5](https://github.com/yourusername/ACE-Step-1.5)

 # ACE-Step Inference API Documentation
+**Language / 语言 / 言語:** [English](INFERENCE.md) | [中文](../zh/INFERENCE.md) | [日本語](../ja/INFERENCE.md)
+---
 This document provides comprehensive documentation for the ACE-Step inference API, including parameter specifications for all supported task types.
 ## Table of Contents
 - [GenerationParams Parameters](#generationparams-parameters)
 - [GenerationConfig Parameters](#generationconfig-parameters)
 - [Task Types](#task-types)
+- [Helper Functions](#helper-functions)
 - [Complete Examples](#complete-examples)
 - [Best Practices](#best-practices)
 ## API Overview
+### Main Functions
+#### generate_music
 ```python
 def generate_music(
 ) -> GenerationResult
 ```
+Main function for generating music using the ACE-Step model.
+#### understand_music
+```python
+def understand_music(
+    llm_handler,
+    audio_codes: str,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> UnderstandResult
+```
+Analyze audio semantic codes and extract metadata (caption, lyrics, BPM, key, etc.).
+#### create_sample
+```python
+def create_sample(
+    llm_handler,
+    query: str,
+    instrumental: bool = False,
+    vocal_language: Optional[str] = None,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> CreateSampleResult
+```
+Generate a complete music sample (caption, lyrics, metadata) from a natural language description.
+#### format_sample
+```python
+def format_sample(
+    llm_handler,
+    caption: str,
+    lyrics: str,
+    user_metadata: Optional[Dict[str, Any]] = None,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> FormatSampleResult
+```
+Format and enhance user-provided caption and lyrics, generating structured metadata.
 ### Configuration Objects
 The API uses two configuration dataclasses:
     use_adg: bool = False
     cfg_interval_start: float = 0.0
     cfg_interval_end: float = 1.0
+    shift: float = 1.0                    # NEW: Timestep shift factor
+    infer_method: str = "ode"             # NEW: Diffusion inference method
+    timesteps: Optional[List[float]] = None  # NEW: Custom timesteps
     repainting_start: float = 0.0
     repainting_end: float = -1
     audio_format: str = "flac"
 ```
+### Result Objects
+**GenerationResult** - Result of music generation:
 ```python
 @dataclass
 }
 ```
+**UnderstandResult** - Result of music understanding:
+```python
+@dataclass
+class UnderstandResult:
+    # Metadata Fields
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    # Status
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+```
+**CreateSampleResult** - Result of sample creation:
+```python
+@dataclass
+class CreateSampleResult:
+    # Metadata Fields
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    instrumental: bool = False
+    # Status
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+```
+**FormatSampleResult** - Result of sample formatting:
+```python
+@dataclass
+class FormatSampleResult:
+    # Metadata Fields
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    # Status
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+```
 ---
 ## GenerationParams Parameters
 | Parameter | Type | Default | Description |
 |-----------|------|---------|-------------|
+| `inference_steps` | `int` | `8` | Number of denoising steps. Turbo model: 1-20 (recommended 8). Base model: 1-200 (recommended 32-64). Higher = better quality but slower. |
 | `guidance_scale` | `float` | `7.0` | Classifier-free guidance scale (1.0-15.0). Higher values increase adherence to text prompt. Only supported for non-turbo model. Typical range: 5.0-9.0. |
 | `seed` | `int` | `-1` | Random seed for reproducibility. Use `-1` for random seed, or any positive integer for fixed seed. |
 | `use_adg` | `bool` | `False` | Use Adaptive Dual Guidance (base model only). Improves quality at the cost of speed. |
 | `cfg_interval_start` | `float` | `0.0` | CFG application start ratio (0.0-1.0). Controls when to start applying classifier-free guidance. |
 | `cfg_interval_end` | `float` | `1.0` | CFG application end ratio (0.0-1.0). Controls when to stop applying classifier-free guidance. |
+| `shift` | `float` | `1.0` | Timestep shift factor (range 1.0-5.0, default 1.0). When != 1.0, applies `t = shift * t / (1 + (shift - 1) * t)` to timesteps. Recommended 3.0 for turbo models. |
+| `infer_method` | `str` | `"ode"` | Diffusion inference method. `"ode"` (Euler) is faster and deterministic. `"sde"` is stochastic, so its results may vary. |
+| `timesteps` | `Optional[List[float]]` | `None` | Custom timesteps as a list of floats from 1.0 to 0.0 (e.g., `[0.97, 0.76, 0.615, 0.5, 0.395, 0.28, 0.18, 0.085, 0]`). If provided, overrides `inference_steps` and `shift`. |
 ### Task-Specific Parameters
 ---
+## Helper Functions
+### understand_music
+Analyze audio codes to extract metadata about the music.
+```python
+from acestep.inference import understand_music
+result = understand_music(
+    llm_handler=llm_handler,
+    audio_codes="<|audio_code_123|><|audio_code_456|>...",
+    temperature=0.85,
+    use_constrained_decoding=True,
+)
+if result.success:
+    print(f"Caption: {result.caption}")
+    print(f"Lyrics: {result.lyrics}")
+    print(f"BPM: {result.bpm}")
+    print(f"Key: {result.keyscale}")
+    print(f"Duration: {result.duration}s")
+    print(f"Language: {result.language}")
+else:
+    print(f"Error: {result.error}")
+```
+**Use Cases**:
+- Analyze existing music
+- Extract metadata from audio codes
+- Reverse-engineer generation parameters
+---
+### create_sample
+Generate a complete music sample from a natural language description. This is the "Simple Mode" / "Inspiration Mode" feature.
+```python
+from acestep.inference import create_sample
+result = create_sample(
+    llm_handler=llm_handler,
+    query="a soft Bengali love song for a quiet evening",
+    instrumental=False,
+    vocal_language="bn",  # Optional: constrain to Bengali
+    temperature=0.85,
+)
+if result.success:
+    print(f"Caption: {result.caption}")
+    print(f"Lyrics: {result.lyrics}")
+    print(f"BPM: {result.bpm}")
+    print(f"Duration: {result.duration}s")
+    print(f"Key: {result.keyscale}")
+    print(f"Is Instrumental: {result.instrumental}")
+    # Use with generate_music
+    params = GenerationParams(
+        caption=result.caption,
+        lyrics=result.lyrics,
+        bpm=result.bpm,
+        duration=result.duration,
+        keyscale=result.keyscale,
+        vocal_language=result.language,
+    )
+else:
+    print(f"Error: {result.error}")
+```
+**Parameters**:
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `query` | `str` | required | Natural language description of desired music |
+| `instrumental` | `bool` | `False` | Whether to generate instrumental music |
+| `vocal_language` | `Optional[str]` | `None` | Constrain lyrics to specific language (e.g., "en", "zh", "bn") |
+| `temperature` | `float` | `0.85` | Sampling temperature |
+| `top_k` | `Optional[int]` | `None` | Top-k sampling (None disables) |
+| `top_p` | `Optional[float]` | `None` | Top-p sampling (None disables) |
+| `repetition_penalty` | `float` | `1.0` | Repetition penalty |
+| `use_constrained_decoding` | `bool` | `True` | Use FSM-based constrained decoding |
+---
+### format_sample
+Format and enhance user-provided caption and lyrics, generating structured metadata.
+```python
+from acestep.inference import format_sample
+result = format_sample(
+    llm_handler=llm_handler,
+    caption="Latin pop, reggaeton",
+    lyrics="[Verse 1]\nBailando en la noche...",
+    user_metadata={"bpm": 95},  # Optional: constrain specific values
+    temperature=0.85,
+)
+if result.success:
+    print(f"Enhanced Caption: {result.caption}")
+    print(f"Formatted Lyrics: {result.lyrics}")
+    print(f"BPM: {result.bpm}")
+    print(f"Duration: {result.duration}s")
+    print(f"Key: {result.keyscale}")
+    print(f"Detected Language: {result.language}")
+else:
+    print(f"Error: {result.error}")
+```
+**Parameters**:
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `caption` | `str` | required | User's caption/description |
+| `lyrics` | `str` | required | User's lyrics with structure tags |
+| `user_metadata` | `Optional[Dict]` | `None` | Constrain specific metadata values (bpm, duration, keyscale, timesignature, language) |
+| `temperature` | `float` | `0.85` | Sampling temperature |
+| `top_k` | `Optional[int]` | `None` | Top-k sampling (None disables) |
+| `top_p` | `Optional[float]` | `None` | Top-p sampling (None disables) |
+| `repetition_penalty` | `float` | `1.0` | Repetition penalty |
+| `use_constrained_decoding` | `bool` | `True` | Use FSM-based constrained decoding |
+---
 ## Complete Examples
 ### Example 1: Simple Text-to-Music Generation
 result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
 ```
+### Example 3: Using Custom Timesteps
+```python
+params = GenerationParams(
+    task_type="text2music",
+    caption="jazz fusion with complex harmonies",
+    # Custom 9-step schedule
+    timesteps=[0.97, 0.76, 0.615, 0.5, 0.395, 0.28, 0.18, 0.085, 0],
+    thinking=True,
+)
+config = GenerationConfig(batch_size=1)
+result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+```
+### Example 4: Using Shift Parameter (Turbo Model)
+```python
+params = GenerationParams(
+    task_type="text2music",
+    caption="upbeat electronic dance music",
+    inference_steps=8,
+    shift=3.0,  # Recommended for turbo models
+    infer_method="ode",
+)
+config = GenerationConfig(batch_size=2)
+result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+```
+### Example 5: Simple Mode with create_sample
+```python
+from acestep.inference import create_sample, GenerationParams, GenerationConfig, generate_music
+# Step 1: Create sample from description
+sample = create_sample(
+    llm_handler=llm_handler,
+    query="energetic K-pop dance track with catchy hooks",
+    vocal_language="ko",
+)
+if sample.success:
+    # Step 2: Generate music using the sample
+    params = GenerationParams(
+        caption=sample.caption,
+        lyrics=sample.lyrics,
+        bpm=sample.bpm,
+        duration=sample.duration,
+        keyscale=sample.keyscale,
+        vocal_language=sample.language,
+        thinking=True,
+    )
+    config = GenerationConfig(batch_size=2)
+    result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+```
+### Example 6: Format and Enhance User Input
+```python
+from acestep.inference import format_sample, GenerationParams, GenerationConfig, generate_music
+# Step 1: Format user input
+formatted = format_sample(
+    llm_handler=llm_handler,
+    caption="rock ballad",
+    lyrics="[Verse]\nIn the darkness I find my way...",
+)
+if formatted.success:
+    # Step 2: Generate with enhanced input
+    params = GenerationParams(
+        caption=formatted.caption,
+        lyrics=formatted.lyrics,
+        bpm=formatted.bpm,
+        duration=formatted.duration,
+        keyscale=formatted.keyscale,
+        thinking=True,
+        use_cot_metas=False,  # Already formatted, skip metas CoT
+    )
+    config = GenerationConfig(batch_size=2)
+    result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+```
+### Example 7: Style Cover with LM Reasoning
 ```python
 params = GenerationParams(
     print(f"LM detected Key: {lm_meta.get('keyscale')}")
 ```
+### Example 8: Batch Generation with Specific Seeds
 ```python
 params = GenerationParams(
         print(f"  Seed {audio['params']['seed']}: {audio['path']}")
 ```
+### Example 9: High-Quality Generation (Base Model)
 ```python
 params = GenerationParams(
     use_adg=True,           # Adaptive Dual Guidance
     cfg_interval_start=0.0,
     cfg_interval_end=1.0,
+    shift=3.0,              # Timestep shift
     seed=42,                # Reproducible results
 )
 result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
 ```
+### Example 10: Understand Audio from Codes
 ```python
+from acestep.inference import understand_music
+# Analyze audio codes (e.g., from a previous generation)
+result = understand_music(
+    llm_handler=llm_handler,
+    audio_codes="<|audio_code_10695|><|audio_code_54246|>...",
+    temperature=0.85,
 )
+if result.success:
+    print(f"Detected Caption: {result.caption}")
+    print(f"Detected Lyrics: {result.lyrics}")
+    print(f"Detected BPM: {result.bpm}")
+    print(f"Detected Key: {result.keyscale}")
+    print(f"Detected Duration: {result.duration}s")
+    print(f"Detected Language: {result.language}")
 ```
 ---
 - Use base model with `inference_steps=64` or higher
 - Enable `use_adg=True`
 - Set `guidance_scale=7.0-9.0`
+- Set `shift=3.0` for better timestep distribution
 - Use lossless audio format (`audio_format="wav"`)
 **For Speed**:
 - Use turbo model with `inference_steps=8`
 - Disable ADG (`use_adg=False`)
+- Use `infer_method="ode"` (default)
 - Use compressed format (`audio_format="mp3"`) or default FLAC
 **For Consistency**:
 **Issue**: Seeds not being respected
 - **Solution**: Set `use_random_seed=False` in config and provide `seeds` list or `seed` in params
+**Issue**: Custom timesteps not working
+- **Solution**: Ensure `timesteps` is a list of floats between 1.0 and 0.0 in descending order (e.g., `[0.97, 0.76, 0.615, 0.5, 0.395, 0.28, 0.18, 0.085, 0]`)
 ---
 ## API Reference Summary
 ## Version History
+- **v1.5.2**: Current version
+  - Added `shift` parameter for timestep shifting
+  - Added `infer_method` parameter for ODE/SDE selection
+  - Added `timesteps` parameter for custom timestep schedules
+  - Added `understand_music()` function for audio analysis
+  - Added `create_sample()` function for simple mode generation
+  - Added `format_sample()` function for input enhancement
+  - Added `UnderstandResult`, `CreateSampleResult`, `FormatSampleResult` dataclasses
+- **v1.5.1**: Previous version
   - Split `GenerationConfig` into `GenerationParams` and `GenerationConfig`
   - Renamed parameters for consistency (`key_scale` → `keyscale`, `time_signature` → `timesignature`, `audio_duration` → `duration`, `use_llm_thinking` → `thinking`, `audio_code_string` → `audio_codes`)
   - Added `instrumental` parameter
   - Simplified `GenerationResult` structure with unified `audios` list
   - Added unified `time_costs` in `extra_outputs`
+- **v1.5**: Initial version
   - Introduced `GenerationConfig` and `GenerationResult` dataclasses
   - Simplified parameter passing
   - Added comprehensive documentation
 ---
 For more information, see:
+- Main README: [`../../README.md`](../../README.md)
 - REST API Documentation: [`API.md`](API.md)
+- Gradio Demo Guide: [`GRADIO_GUIDE.md`](GRADIO_GUIDE.md)
 - Project repository: [ACE-Step-1.5](https://github.com/yourusername/ACE-Step-1.5)

docs/ja/API.md ADDED Viewed

	@@ -0,0 +1,570 @@

+# ACE-Step API クライアントドキュメント
+**Language / 语言 / 言語:** [English](../en/API.md) | [中文](../zh/API.md) | [日本語](API.md)
+---
+本サービスはHTTPベースの非同期音楽生成APIを提供します。
+**基本的なワークフロー**：
+1. `POST /v1/music/generate` を呼び出してタスクを送信し、`job_id` を取得します。
+2. `GET /v1/jobs/{job_id}` を呼び出してタスクステータスをポーリングし、`status` が `succeeded` または `failed` になるまで待ちます。
+3. 結果で返された `GET /v1/audio?path=...` URL から音声ファイルをダウンロードします。
+---
+## 目次
+- [タスクステータスの説明](#1-タスクステータスの説明)
+- [生成タスクの作成](#2-生成タスクの作成)
+- [タスク結果の照会](#3-タスク結果の照会)
+- [ランダムサンプル生成](#4-ランダムサンプル生成)
+- [利用可能なモデルの一覧](#5-利用可能なモデルの一覧)
+- [音声ファイルのダウンロード](#6-音声ファイルのダウンロード)
+- [ヘルスチェック](#7-ヘルスチェック)
+- [環境変数](#8-環境変数)
+---
+## 1. タスクステータスの説明
+タスクステータス（`status`）には以下の種類があります：
+- `queued`：タスクがキューに入り、実行待ちです。この時点で `queue_position` と `eta_seconds` を確認できます。
+- `running`：生成が進行中です。
+- `succeeded`：生成が成功し、結果は `result` フィールドにあります。
+- `failed`：生成が失敗し、エラー情報は `error` フィールドにあります。
+---
+## 2. 生成タスクの作成
+### 2.1 API 定義
+- **URL**：`/v1/music/generate`
+- **メソッド**：`POST`
+- **Content-Type**：`application/json`、`multipart/form-data`、または `application/x-www-form-urlencoded`
+### 2.2 リクエストパラメータ
+#### パラメータ命名規則
+APIはほとんどのパラメータで **snake_case** と **camelCase** の両方の命名をサポートしています。例：
+- `audio_duration` / `duration` / `audioDuration`
+- `key_scale` / `keyscale` / `keyScale`
+- `time_signature` / `timesignature` / `timeSignature`
+- `sample_query` / `sampleQuery` / `description` / `desc`
+- `use_format` / `useFormat` / `format`
+また、メタデータはネストされたオブジェクト（`metas`、`metadata`、または `user_metadata`）で渡すことができます。
+#### 方法 A：JSONリクエスト（application/json）
+テキストパラメータのみを渡す場合、またはサーバー上に既に存在する音声ファイルパスを参照する場合に適しています。
+**基本パラメータ**：
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `caption` | string | `""` | 音楽の説明プロンプト |
+| `lyrics` | string | `""` | 歌詞の内容 |
+| `thinking` | bool | `false` | 5Hz LMを使用してオーディオコードを生成するかどうか（lm-dit動作）|
+| `vocal_language` | string | `"en"` | 歌詞の言語（en、zh、jaなど）|
+| `audio_format` | string | `"mp3"` | 出力形式（mp3、wav、flac）|
+**サンプル/説明モードパラメータ**：
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `sample_mode` | bool | `false` | ランダムサンプル生成モードを有効にする（LM経由でcaption/lyrics/metasを自動生成）|
+| `sample_query` | string | `""` | サンプル生成のための自然言語の説明（例：「静かな夜のための柔らかいベンガルのラブソング」）。別名：`description`、`desc` |
+| `use_format` | bool | `false` | LMを使用して提供されたcaptionとlyricsを強化/フォーマットする。別名：`format` |
+**マルチモデルサポート**：
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `model` | string | null | 使用するDiTモデルを選択（例：`"acestep-v15-turbo"`、`"acestep-v15-turbo-rl"`）。`/v1/models` で利用可能なモデルを一覧表示。指定しない場合はデフォルトモデルを使用。|
+**thinkingのセマンティクス（重要）**：
+- `thinking=false`：
+  - サーバーは5Hz LMを使用して `audio_code_string` を生成**しません**。
+  - DiTは **text2music** モードで実行され、提供された `audio_code_string` を**無視**します。
+- `thinking=true`：
+  - サーバーは5Hz LMを使用して `audio_code_string` を生成します（lm-dit動作）。
+  - DiTはLM生成のコードで実行され、音楽品質が向上します。
+**メタデータの自動補完（条件付き）**：
+`use_cot_caption=true` または `use_cot_language=true` またはメタデータフィールドが欠落している場合、サーバーは `caption`/`lyrics` に基づいて5Hz LMを呼び出し、欠落しているフィールドを補完することがあります：
+- `bpm`
+- `key_scale`
+- `time_signature`
+- `audio_duration`
+ユーザー提供の値が常に優先されます。LMは空/欠落しているフィールドのみを補完します。
+**音楽属性パラメータ**：
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `bpm` | int | null | テンポ（BPM）を指定、範囲30-300 |
+| `key_scale` | string | `""` | キー/スケール（例：「C Major」、「Am」）。別名：`keyscale`、`keyScale` |
+| `time_signature` | string | `""` | 拍子記号（2、3、4、6はそれぞれ2/4、3/4、4/4、6/8）。別名：`timesignature`、`timeSignature` |
+| `audio_duration` | float | null | 生成時間（秒）、範囲10-600。別名：`duration`、`target_duration` |
+**オーディオコード（オプション）**：
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `audio_code_string` | string または string[] | `""` | `llm_dit` 用のオーディオセマンティックトークン（5Hz）。別名：`audioCodeString` |
+**生成制御パラメータ**：
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `inference_steps` | int | `8` | 推論ステップ数。Turboモデル：1-20（推奨8）。Baseモデル：1-200（推奨32-64）|
+| `guidance_scale` | float | `7.0` | プロンプトガイダンス係数。baseモデルのみ有効 |
+| `use_random_seed` | bool | `true` | ランダムシードを使用するかどうか |
+| `seed` | int | `-1` | シードを指定（use_random_seed=falseの場合）|
+| `batch_size` | int | `2` | バッチ生成数（最大8）|
+**高度なDiTパラメータ**：
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `shift` | float | `3.0` | タイムステップシフト係数（範囲1.0-5.0）。baseモデルのみ有効、turboモデルには無効 |
+| `infer_method` | string | `"ode"` | 拡散推論方法：`"ode"`（Euler、より高速）または `"sde"`（確率的）|
+| `timesteps` | string | null | カンマ区切りのカスタムタイムステップ（例：`"0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0"`）。`inference_steps` と `shift` をオーバーライド |
+| `use_adg` | bool | `false` | 適応デュアルガイダンスを使用（baseモデルのみ）|
+| `cfg_interval_start` | float | `0.0` | CFG適用開始比率（0.0-1.0）|
+| `cfg_interval_end` | float | `1.0` | CFG適用終了比率（0.0-1.0）|
+**5Hz LMパラメータ（オプション、サーバー側）**：
+これらのパラメータは5Hz LMサンプリングを制御し、メタデータの自動補完と（`thinking=true` の場合）コード生成に使用されます。
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `lm_model_path` | string | null | 5Hz LMチェックポイントディレクトリ名（例：`acestep-5Hz-lm-0.6B-v3`）|
+| `lm_backend` | string | `"vllm"` | `vllm` または `pt` |
+| `lm_temperature` | float | `0.85` | サンプリング温度 |
+| `lm_cfg_scale` | float | `2.5` | CFGスケール（>1でCFGを有効化）|
+| `lm_negative_prompt` | string | `"NO USER INPUT"` | CFGで使用するネガティブプロンプト |
+| `lm_top_k` | int | null | Top-k（0/nullで無効）|
+| `lm_top_p` | float | `0.9` | Top-p（>=1は無効として扱われる）|
+| `lm_repetition_penalty` | float | `1.0` | 繰り返しペナルティ |
+**LM CoT（思考の連鎖）パラメータ**：
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `use_cot_caption` | bool | `true` | LMにCoT推論で入力captionを書き換え/強化させる。別名：`cot_caption`、`cot-caption` |
+| `use_cot_language` | bool | `true` | LMにCoTでボーカル言語を検出させる。別名：`cot_language`、`cot-language` |
+| `constrained_decoding` | bool | `true` | 構造化されたLM出力のためのFSMベースの制約付きデコーディングを有効にする。別名：`constrainedDecoding`、`constrained` |
+| `constrained_decoding_debug` | bool | `false` | 制約付きデコーディングのデバッグログを有効にする |
+**編集/参照オーディオパラメータ**（サーバー上の絶対パスが必要）：
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `reference_audio_path` | string | null | 参照オーディオパス（スタイル転送）|
+| `src_audio_path` | string | null | ソースオーディオパス（リペイント/カバー）|
+| `task_type` | string | `"text2music"` | タスクタイプ：`text2music`、`cover`、`repaint`、`lego`、`extract`、`complete` |
+| `instruction` | string | auto | 編集指示（提供されない場合はtask_typeに基づいて自動生成）|
+| `repainting_start` | float | `0.0` | リペイント開始時間（秒）|
+| `repainting_end` | float | null | リペイント終了時間（秒）、-1でオーディオの終端 |
+| `audio_cover_strength` | float | `1.0` | カバー強度（0.0-1.0）。スタイル転送には小さい値（0.2）を使用 |
+#### 方法 B：ファイルアップロード（multipart/form-data）
+参照またはソースオーディオとしてローカルオーディオファイルをアップロードする必要がある場合に使用します。
+上記のすべてのフィールドをフォームフィールドとしてサポートすることに加えて、以下のファイルフィールドもサポートしています：
+- `reference_audio`：（ファイル）参照オーディオファイルをアップロード
+- `src_audio`：（ファイル）ソースオーディオファイルをアップロード
+> **注意**：ファイルをアップロードすると、対応する `_path` パラメータは自動的に無視され、システムはアップロード後の一時ファイルパスを使用します。
+### 2.3 レスポンス例
+```json
+{
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
+  "status": "queued",
+  "queue_position": 1
+}
+```
+### 2.4 使用例（cURL）
+**基本的なJSONメソッド**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "アップビートなポップソング",
+    "lyrics": "Hello world",
+    "inference_steps": 8
+  }'
+```
+**thinking=trueの場合（LMがコードを生成 + 欠落メタを補完）**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "アップビートなポップソング",
+    "lyrics": "Hello world",
+    "thinking": true,
+    "lm_temperature": 0.85,
+    "lm_cfg_scale": 2.5
+  }'
+```
+**説明駆動型生成（sample_query）**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "sample_query": "静かな夜のための柔らかいベンガルのラブソング",
+    "thinking": true
+  }'
+```
+**フォーマット強化（use_format=true）**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "ポップロック",
+    "lyrics": "[Verse 1]\n街を歩いて...",
+    "use_format": true,
+    "thinking": true
+  }'
+```
+**特定のモデルを選択**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "エレクトロニックダンスミュージック",
+    "model": "acestep-v15-turbo-rl",
+    "thinking": true
+  }'
+```
+**カスタムタイムステップを使用**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "ジャズピアノトリオ",
+    "timesteps": "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0",
+    "thinking": true
+  }'
+```
+**thinking=falseの場合（DiTのみ、ただし欠落メタを補完）**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "ゆっくりとした感情的なバラード",
+    "lyrics": "...",
+    "thinking": false,
+    "bpm": 72
+  }'
+```
+**ファイルアップロードメソッド**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -F "caption=この曲をリミックス" \
+  -F "src_audio=@/path/to/local/song.mp3" \
+  -F "task_type=repaint"
+```
+---
+## 3. タスク結果の照会
+### 3.1 API 定義
+- **URL**：`/v1/jobs/{job_id}`
+- **メソッド**：`GET`
+### 3.2 レスポンスパラメータ
+レスポンスには基本的なタスク情報、キューステータス、最終結果が含まれます。
+**主要フィールド**：
+- `status`：現在のステータス
+- `queue_position`：現在のキュー位置（0は実行中または完了を意味）
+- `eta_seconds`：推定残り待ち時間（秒）
+- `avg_job_seconds`：平均ジョブ時間（ETA推定用）
+- `result`：成功時の結果オブジェクト
+  - `audio_paths`：生成されたオーディオファイルURLのリスト（`/v1/audio` エンドポイントと併用）
+  - `first_audio_path`：最初のオーディオパス（URL）
+  - `second_audio_path`：2番目のオーディオパス（URL、batch_size >= 2の場合）
+  - `generation_info`：生成パラメータの詳細
+  - `status_message`：簡潔な結果説明
+  - `seed_value`：使用されたシード値（カンマ区切り）
+  - `metas`：完全なメタデータ辞書
+  - `bpm`：検出/使用されたBPM
+  - `duration`：検出/使用された長さ
+  - `keyscale`：検出/使用されたキー
+  - `timesignature`：検出/使用された拍子
+  - `genres`：検出されたジャンル（利用可能な場合）
+  - `lm_model`：使用されたLMモデルの名前
+  - `dit_model`：使用されたDiTモデルの名前
+- `error`：失敗時のエラー情報
+### 3.3 レスポンス例
+**キュー中**：
+```json
+{
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
+  "status": "queued",
+  "created_at": 1700000000.0,
+  "queue_position": 5,
+  "eta_seconds": 25.0,
+  "avg_job_seconds": 5.0,
+  "result": null,
+  "error": null
+}
+```
+**実行成功**：
+```json
+{
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
+  "status": "succeeded",
+  "created_at": 1700000000.0,
+  "started_at": 1700000001.0,
+  "finished_at": 1700000010.0,
+  "queue_position": 0,
+  "result": {
+    "first_audio_path": "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fabc123.mp3",
+    "second_audio_path": "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fdef456.mp3",
+    "audio_paths": [
+      "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fabc123.mp3",
+      "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fdef456.mp3"
+    ],
+    "generation_info": "🎵 2つのオーディオを生成\n⏱️ 合計：8.5s\n🎲 シード：12345,67890",
+    "status_message": "✅ 生成が正常に完了しました！",
+    "seed_value": "12345,67890",
+    "metas": {
+      "bpm": 120,
+      "duration": 30,
+      "keyscale": "C Major",
+      "timesignature": "4",
+      "caption": "キャッチーなメロディのアップビートなポップソング"
+    },
+    "bpm": 120,
+    "duration": 30,
+    "keyscale": "C Major",
+    "timesignature": "4",
+    "genres": null,
+    "lm_model": "acestep-5Hz-lm-0.6B-v3",
+    "dit_model": "acestep-v15-turbo-rl"
+  },
+  "error": null
+}
+```
+---
+## 4. ランダムサンプル生成
+### 4.1 API 定義
+- **URL**：`/v1/music/random`
+- **メソッド**：`POST`
+このエンドポイントは5Hz LM経由でcaption、lyrics、メタデータを自動生成するサンプルモードジョブを作成します。
+### 4.2 リクエストパラメータ
+| パラメータ名 | 型 | デフォルト | 説明 |
+| :--- | :--- | :--- | :--- |
+| `thinking` | bool | `true` | LM経由でオーディオコードも生成するかどうか |
+### 4.3 レスポンス例
+```json
+{
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
+  "status": "queued",
+  "queue_position": 1
+}
+```
+### 4.4 使用例
+```bash
+curl -X POST http://localhost:8001/v1/music/random \
+  -H 'Content-Type: application/json' \
+  -d '{"thinking": true}'
+```
+---
+## 5. 利用可能なモデルの一覧
+### 5.1 API 定義
+- **URL**：`/v1/models`
+- **メソッド**：`GET`
+サーバーにロードされている利用可能なDiTモデルのリストを返します。
+### 5.2 レスポンス例
+```json
+{
+  "models": [
+    {
+      "name": "acestep-v15-turbo-rl",
+      "is_default": true
+    },
+    {
+      "name": "acestep-v15-turbo",
+      "is_default": false
+    }
+  ],
+  "default_model": "acestep-v15-turbo-rl"
+}
+```
+### 5.3 使用例
+```bash
+curl http://localhost:8001/v1/models
+```
+---
+## 6. 音声ファイルのダウンロード
+### 6.1 API 定義
+- **URL**：`/v1/audio`
+- **メソッド**：`GET`
+パスで生成されたオーディオファイルをダウンロードします。
+### 6.2 リクエストパラメータ
+| パラメータ名 | 型 | 説明 |
+| :--- | :--- | :--- |
+| `path` | string | URLエンコードされたオーディオファイルパス |
+### 6.3 使用例
+```bash
+# ジョブ結果のURLを使用してダウンロード
+curl "http://localhost:8001/v1/audio?path=%2Ftmp%2Fapi_audio%2Fabc123.mp3" -o output.mp3
+```
+---
+## 7. ヘルスチェック
+### 7.1 API 定義
+- **URL**：`/health`
+- **メソッド**：`GET`
+サービスのヘルスステータスを返します。
+### 7.2 レスポンス例
+```json
+{
+  "status": "ok",
+  "service": "ACE-Step API",
+  "version": "1.0"
+}
+```
+---
+## 8. 環境変数
+APIサーバーは環境変数で設定できます：
+| 変数 | デフォルト | 説明 |
+| :--- | :--- | :--- |
+| `ACESTEP_API_HOST` | `127.0.0.1` | サーバーバインドホスト |
+| `ACESTEP_API_PORT` | `8001` | サーバーバインドポート |
+| `ACESTEP_CONFIG_PATH` | `acestep-v15-turbo-rl` | プライマリDiTモデルパス |
+| `ACESTEP_CONFIG_PATH2` | （空）| セカンダリDiTモデルパス（オプション）|
+| `ACESTEP_CONFIG_PATH3` | （空）| 3番目のDiTモデルパス（オプション）|
+| `ACESTEP_DEVICE` | `auto` | モデルロードデバイス |
+| `ACESTEP_USE_FLASH_ATTENTION` | `true` | flash attentionを有効化 |
+| `ACESTEP_OFFLOAD_TO_CPU` | `false` | アイドル時にモデルをCPUにオフロード |
+| `ACESTEP_OFFLOAD_DIT_TO_CPU` | `false` | DiTを特にCPUにオフロード |
+| `ACESTEP_LM_MODEL_PATH` | `acestep-5Hz-lm-0.6B-v3` | デフォルト5Hz LMモデル |
+| `ACESTEP_LM_BACKEND` | `vllm` | LMバックエンド（vllmまたはpt）|
+| `ACESTEP_LM_DEVICE` | （ACESTEP_DEVICEと同じ）| LMデバイス |
+| `ACESTEP_LM_OFFLOAD_TO_CPU` | `false` | LMをCPUにオフロード |
+| `ACESTEP_QUEUE_MAXSIZE` | `200` | 最大キューサイズ |
+| `ACESTEP_QUEUE_WORKERS` | `1` | キューワーカー数 |
+| `ACESTEP_AVG_JOB_SECONDS` | `5.0` | 初期平均ジョブ時間推定 |
+| `ACESTEP_TMPDIR` | `.cache/acestep/tmp` | 一時ファイルディレクトリ |
+---
+## エラー処理
+**HTTPステータスコード**：
+- `200`：成功
+- `400`：無効なリクエスト（不正なJSON、フィールドの欠落）
+- `404`：ジョブが見つからない
+- `415`：サポートされていないContent-Type
+- `429`：サーバービジー（キューが満杯）
+- `500`：内部サーバーエラー
+**エラーレスポンス形式**：
+```json
+{
+  "detail": "問題を説明するエラーメッセージ"
+}
+```
+---
+## ベストプラクティス
+1. **`thinking=true` を使用** してLM強化生成で最高品質の結果を得る。
+2. **`sample_query`/`description` を使用** して自然言語の説明から素早く生成。
+3. **`use_format=true` を使用** してcaption/lyricsがあるがLMに強化してもらいたい場合。
+4. **ジョブステータスのポーリング** は適切な間隔（例：1-2秒ごと）で行い、サーバーの過負荷を避ける。
+5. **レスポンスの `avg_job_seconds` を確認** して待ち時間を推定。
+6. **マルチモデルサポートを使用** するには `ACESTEP_CONFIG_PATH2` と `ACESTEP_CONFIG_PATH3` 環境変数を設定し、`model` パラメータで選択。
+7. **本番環境** では常に適切なContent-Typeヘッダーを設定して415エラーを回避。

docs/ja/GRADIO_GUIDE.md ADDED Viewed

	@@ -0,0 +1,551 @@

+# ACE-Step Gradio デモユーザーガイド
+**Language / 语言 / 言語:** [English](../en/GRADIO_GUIDE.md) | [中文](../zh/GRADIO_GUIDE.md) | [日本語](GRADIO_GUIDE.md)
+---
+本ガイドはACE-Step Gradio Webインターフェースを使用した音楽生成の包括的なドキュメントを提供し、すべての機能と設定を含みます。
+## 目次
+- [はじめに](#はじめに)
+- [サービス設定](#サービス設定)
+- [生成モード](#生成モード)
+- [タスクタイプ](#タスクタイプ)
+- [入力パラメータ](#入力パラメータ)
+- [高度な設定](#高度な設定)
+- [結果セクション](#結果セクション)
+- [LoRAトレーニング](#loraトレーニング)
+- [ヒントとベストプラクティス](#ヒントとベストプラクティス)
+---
+## はじめに
+### デモの起動
+```bash
+# 基本的な起動
+python app.py
+# 事前初期化付き
+python app.py --config acestep-v15-turbo-rl --init-llm
+# 特定のポートで
+python app.py --port 7860
+```
+### インターフェース概要
+Gradioインターフェースは以下の主要セクションで構成されています：
+1. **サービス設定** - モデルの読み込みと初期化
+2. **必須入力** - タスクタイプ、オーディオアップロード、生成モード
+3. **音楽キャプションと歌詞** - 生成用のテキスト入力
+4. **オプションパラメータ** - BPM、キー、durationなどのメタデータ
+5. **高度な設定** - 生成の細かい制御
+6. **結果** - 生成されたオーディオの再生と管理
+---
+## サービス設定
+### モデル選択
+| 設定 | 説明 |
+|---------|-------------|
+| **チェックポイントファイル** | トレーニング済みモデルチェックポイントを選択（利用可能な場合）|
+| **メインモデルパス** | DiTモデル設定を選択（例：`acestep-v15-turbo`、`acestep-v15-turbo-rl`）|
+| **デバイス** | 処理デバイス：`auto`（推奨）、`cuda`、または `cpu` |
+### 5Hz LM設定
+| 設定 | 説明 |
+|---------|-------------|
+| **5Hz LMモデルパス** | 言語モデルを選択（例：`acestep-5Hz-lm-0.6B`、`acestep-5Hz-lm-0.6B-v3`）|
+| **5Hz LMバックエンド** | `vllm`（より高速、推奨）または `pt`（PyTorch、互換性が高い）|
+| **5Hz LMを初期化** | 初期化時にLMを読み込むためにチェック（thinkingモードに必要）|
+### パフォーマンスオプション
+| 設定 | 説明 |
+|---------|-------------|
+| **Flash Attentionを使用** | より高速な推論のために有効化（flash_attnパッケージが必要）|
+| **CPUにオフロード** | アイドル時にモデルをCPUにオフロードしてGPUメモリを節約 |
+| **DiTをCPUにオフロード** | DiTモデルを特にCPUにオフロード |
+### LoRAアダプター
+| 設定 | 説明 |
+|---------|-------------|
+| **LoRAパス** | トレーニング済みLoRAアダプターディレクトリへのパス |
+| **LoRAを読み込み** | 指定されたLoRAアダプターを読み込み |
+| **アンロード** | 現在読み込まれているLoRAを削除 |
+| **LoRAを使用** | 推論用の読み込まれたLoRAを有効化/無効化 |
+### 初期化
+**サービスを初期化** をクリックしてモデルを読み込みます。ステータスボックスに進捗と確認が表示されます。
+---
+## 生成モード
+### シンプルモード
+シンプルモードは、迅速な自然言語ベースの音楽生成用に設計されています。
+**使用方法：**
+1. 生成モードラジオボタンで「シンプル」を選択
+2. 「曲の説明」フィールドに自然言語の説明を入力
+3. ボーカルが不要な場合は「インストゥルメンタル」をオプションでチェック
+4. オプションで希望するボーカル言語を選択
+5. **サンプルを作成** をクリックしてcaption、歌詞、メタデータを生成
+6. 展開されたセクションで生成されたコンテンツを確認
+7. **音楽を生成** をクリックしてオーディオを作成
+**説明の例：**
+- 「静かな夜のための柔らかいベンガルのラブソング」
+- 「重いベースドロップのアップビートなエレクトロニックダンスミュージック」
+- 「アコースティックギターのメランコリックなインディーフォーク」
+- 「煙たいバーで演奏するジャズトリオ」
+**ランダムサンプル：** 🎲 ボタンをクリックしてランダムな例の説明を読み込みます。
+### カスタムモード
+カスタムモードはすべての生成パラメータの完全な制御を提供します。
+**使用方法：**
+1. 生成モードラジオボタンで「カスタム」を選択
+2. Captionと歌詞フィールドを手動で入力
+3. オプションのメタデータを設定（BPM、キー、Durationなど）
+4. オプションで **フォーマット** をクリックしてLMを使用して入力を強化
+5. 必要に応じて高度な設定を構成
+6. **音楽を生成** をクリックしてオーディオを作成
+---
+## タスクタイプ
+### text2music（デフォルト）
+テキスト説明および/または歌詞から音楽を生成。
+**ユースケース：** プロンプトに基づいて新しい音楽をゼロから作成。
+**必須入力：** Captionまたは歌詞（少なくとも1つ）
+### cover
+既存のオーディオを構造を維持しながらスタイルを変更して変換。
+**ユースケース：** 異なるスタイルのカバーバージョンを作成。
+**必須入力：**
+- ソースオーディオ（オーディオアップロードセクションでアップロード）
+- ターゲットスタイルを説明するCaption
+**重要なパラメータ：** `オーディオカバー強度`（0.0-1.0）
+- 高い値は元の構造をより多く維持
+- 低い値はより創造的な自由を許可
+### repaint
+オーディオの特定の時間セグメントを再生成。
+**ユースケース：** 生成された音楽の特定のセクションを修正または変更。
+**必須入力：**
+- ソースオーディオ
+- リペイント開始（秒）
+- リペイント終了（秒、ファイル終端には-1）
+- 希望するコンテンツを説明するCaption
+### lego（Baseモデルのみ）
+既存のオーディオのコンテキストで特定の楽器トラックを生成。
+**ユースケース：** バッキングトラックに楽器レイヤーを追加。
+**必須入力：**
+- ソースオーディオ
+- トラック名（ドロップダウンから選択）
+- トラック特性を説明するCaption
+**利用可能なトラック：** vocals、backing_vocals、drums、bass、guitar、keyboard、percussion、strings、synth、fx、brass、woodwinds
+### extract（Baseモデルのみ）
+ミックスオーディオから特定の楽器トラックを抽出/分離。
+**ユースケース：** ステム分離、楽器の分離。
+**必須入力：**
+- ソースオーディオ
+- 抽出するトラック名
+### complete（Baseモデルのみ）
+指定された楽器で部分的なトラックを完成。
+**ユースケース：** 不完全な作品の自動アレンジ。
+**必須入力：**
+- ソースオーディオ
+- トラック名（複数選択）
+- 希望するスタイルを説明するCaption
+---
+## 入力パラメータ
+### 必須入力
+#### タスクタイプ
+ドロップダウンから生成タスクを選択。選択されたタスクに基づいて指示フィールドが自動的に更新されます。
+#### オーディオアップロード
+| フィールド | 説明 |
+|-------|-------------|
+| **参照オーディオ** | スタイル参照用のオプションオーディオ |
+| **ソースオーディオ** | cover、repaint、lego、extract、completeタスクに必須 |
+| **コードに変換** | ソースオーディオから5Hzセマンティックコードを抽出 |
+#### LMコードヒント
+事前計算されたオーディオセマンティックコードをここに貼り付けて生成をガイドできます。**トランスクライブ** ボタンを使用してコードを分析しメタデータを抽出します。
+### 音楽キャプション
+希望する音楽のテキスト説明。以下について具体的に：
+- ジャンルとスタイル
+- 楽器
+- ムードと雰囲気
+- テンポ感（BPMを指定しない場合）
+**例：** 「エレキギター、力強いドラム、キャッチーなシンセフックのアップビートなポップロック」
+🎲 をクリックしてランダムな例のcaptionを読み込みます。
+### 歌詞
+構造タグ付きの歌詞を入力：
+```
+[Verse 1]
+今日街を歩いていて
+君が言っていた言葉を思い出していた
+[Chorus]
+前に進んでいく、強くいる
+ここが僕の居場所
+[Verse 2]
+...
+```
+**インストゥルメンタルチェックボックス：** これをチェックすると、歌詞の内容に関係なくインストゥルメンタル音楽を生成します。
+**ボーカル言語：** ボーカルの言語を選択。自動検出またはインストゥルメンタルトラックには「unknown」を使用。
+**フォーマットボタン：** クリックして5Hz LMを使用してcaptionと歌詞を強化。
+### オプションパラメータ
+| パラメータ | デフォルト | 説明 |
+|-----------|---------|-------------|
+| **BPM** | 自動 | 1分あたりのビート数（30-300）|
+| **キースケール** | 自動 | 音楽キー（例：「C Major」、「Am」、「F# minor」）|
+| **拍子記号** | 自動 | 拍子記号：2（2/4）、3（3/4）、4（4/4）、6（6/8）|
+| **オーディオ長** | 自動/-1 | 目標長（秒）（10-600）。-1で自動 |
+| **バッチサイズ** | 2 | 生成するオーディオバリエーションの数（1-8）|
+---
+## 高度な設定
+### DiTパラメータ
+| パラメータ | デフォルト | 説明 |
+|-----------|---------|-------------|
+| **推論ステップ** | 8 | デノイズステップ。Turbo：1-20、Base：1-200 |
+| **ガイダンススケール** | 7.0 | CFG強度（baseモデルのみ）。高い = プロンプトにより従う |
+| **シード** | -1 | ランダムシード。バッチにはカンマ区切りの値を使用 |
+| **ランダムシード** | ✓ | チェック時にランダムシードを生成 |
+| **オーディオ形式** | mp3 | 出力形式：mp3、flac |
+| **シフト** | 3.0 | タイムステップシフト係数（1.0-5.0）。turboには3.0推奨 |
+| **推論方法** | ode | ode（Euler、より高速）またはsde（確率的）|
+| **カスタムタイムステップ** | - | タイムステップをオーバーライド（例：「0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0」）|
+### Baseモデルのみのパラメータ
+| パラメータ | デフォルト | 説明 |
+|-----------|---------|-------------|
+| **ADGを使用** | ✗ | より良い品質のために適応デュアルガイダンスを有効化 |
+| **CFG区間開始** | 0.0 | CFGを適用し始めるタイミング（0.0-1.0）|
+| **CFG区間終了** | 1.0 | CFGの適用を停止するタイミング（0.0-1.0）|
+### LMパラメータ
+| パラメータ | デフォルト | 説明 |
+|-----------|---------|-------------|
+| **LM温度** | 0.85 | サンプリング温度（0.0-2.0）。高い = より創造的 |
+| **LM CFGスケール** | 2.0 | LMガイダンス強度（1.0-3.0）|
+| **LM Top-K** | 0 | Top-Kサンプリング。0で無効 |
+| **LM Top-P** | 0.9 | 核サンプリング（0.0-1.0）|
+| **LMネガティブプロンプト** | "NO USER INPUT" | CFG用のネガティブプロンプト |
+### CoT（思考の連鎖）オプション
+| オプション | デフォルト | 説明 |
+|--------|---------|-------------|
+| **CoT Metas** | ✓ | LM推論でメタデータを生成 |
+| **CoT Language** | ✓ | LMでボーカル言語を検出 |
+| **制約付きデコーディングデバッグ** | ✗ | デバッグログを有効化 |
+### 生成オプション
+| オプション | デフォルト | 説明 |
+|--------|---------|-------------|
+| **LMコード強度** | 1.0 | LMコードが生成に与える影響の強さ（0.0-1.0）|
+| **自動スコア** | ✗ | 品質スコアを自動計算 |
+| **自動LRC** | ✗ | 歌詞タイムスタンプを自動生成 |
+| **LMバッチチャンクサイズ** | 8 | LMバッチあたりの最大アイテム数（GPUメモリ）|
+### メイン生成コントロール
+| コントロール | 説明 |
+|---------|-------------|
+| **Think** | コード生成とメタデータ用の5Hz LMを有効化 |
+| **ParallelThinking** | 並列LMバッチ処理を有効化 |
+| **CaptionRewrite** | LMに入力captionを強化させる |
+| **AutoGen** | 完了後に次のバッチを自動開始 |
+---
+## 結果セクション
+### 生成されたオーディオ
+バッチサイズに基づいて最大8つのオーディオサンプルが表示されます。各サンプルには以下が含まれます：
+- **オーディオプレーヤー** - 生成されたオーディオの再生、一時停止、ダウンロード
+- **ソースに送信** - このオーディオをソースオーディオ入力に送信してさらに処理
+- **保存** - オーディオとメタデータをJSONファイルに保存
+- **スコア** - パープレキシティベースの品質スコアを計算
+- **LRC** - 歌詞タイムスタンプを生成（LRC形式）
+### 詳細アコーディオン
+「スコア & LRC & LMコード」をクリックして展開し、以下を表示：
+- **LMコード** - このサンプルの5Hzセマンティックコード
+- **品質スコア** - パープレキシティベースの品質メトリック
+- **歌詞タイムスタンプ** - LRC形式のタイミングデータ
+### バッチナビゲーション
+| コントロール | 説明 |
+|---------|-------------|
+| **◀ 前へ** | 前のバッチを表示 |
+| **バッチインジケーター** | 現在のバッチ位置を表示（例：「バッチ 1 / 3」）|
+| **次バッチステータス** | バックグラウンド生成の進捗を表示 |
+| **次へ ▶** | 次のバッチを表示（AutoGenがオンの場合は生成をトリガー）|
+### パラメータの復元
+**これらの設定をUIに適用** をクリックして、現在のバッチからすべての生成パラメータを入力フィールドに復元。良い結果を反復するのに便利。
+### バッチ結果
+「バッチ結果と生成詳細」アコーディオンには以下が含まれます：
+- **すべての生成ファイル** - すべてのバッチからすべてのファイルをダウンロード
+- **生成詳細** - 生成プロセスに関する詳細情報
+---
+## LoRAトレーニング
+LoRAトレーニングタブはカスタムLoRAアダプターを作成するためのツールを提供します。
+### データセットビルダータブ
+#### ステップ1：読み込みまたはスキャン
+**オプションA：既存のデータセットを読み込み**
+1. 以前保存したデータセットJSONへのパスを入力
+2. **読み込み** をクリック
+**オプションB：新しいディレクトリをスキャン**
+1. オーディオフォルダへのパスを入力
+2. **スキャン** をクリックしてオーディオファイルを検索（wav、mp3、flac、ogg、opus）
+#### ステップ2：データセットの設定
+| 設定 | 説明 |
+|---------|-------------|
+| **データセット名** | データセットの名前 |
+| **すべてインストゥルメンタル** | すべてのトラックにボーカルがない場合にチェック |
+| **カスタムアクティベーションタグ** | このLoRAのスタイルをアクティブにするユニークなタグ |
+| **タグ位置** | タグを配置する場所：前に追加、後に追加、またはcaptionを置換 |
+#### ステップ3：自動ラベル
+**すべて自動ラベル** をクリックしてすべてのオーディオファイルのメタデータを生成：
+- Caption（音楽の説明）
+- BPM
+- キー
+- 拍子記号
+**Metasをスキップ** オプションはLLMラベリングをスキップしてN/A値を使用します。
+#### ステップ4：プレビューと編集
+スライダーを使用してサンプルを選択し、手動で編集：
+- Caption
+- 歌詞
+- BPM、キー、拍子記号
+- 言語
+- インストゥルメンタルフラグ
+**変更を保存** をクリックしてサンプルを更新。
+#### ステップ5：データセットを保存
+保存パスを入力し、**データセットを保存** をクリックしてJSONとしてエクスポート。
+#### ステップ6：前処理
+高速トレーニングのためにデータセットを事前計算テンソルに変換：
+1. オプションで既存のデータセットJSONを読み込み
+2. テンソル出力ディレクトリを設定
+3. **前処理** をクリック
+これによりオーディオがVAE潜在変数にエンコードされ、テキストが埋め込みにエンコードされ、条件エンコーダーが実行されます。
+### LoRAトレーニングタブ
+#### データセット選択
+前処理されたテンソルディレクトリへのパスを入力し、**データセットを読み込み** をクリック。
+#### LoRA設定
+| 設定 | デフォルト | 説明 |
+|---------|---------|-------------|
+| **LoRAランク (r)** | 64 | LoRAの容量。高い = より多くの容量、より多くのメモリ |
+| **LoRA Alpha** | 128 | スケーリング係数（通常はランクの2倍）|
+| **LoRA Dropout** | 0.1 | 正則化のためのドロップアウト率 |
+#### トレーニングパラメータ
+| 設定 | デフォルト | 説明 |
+|---------|---------|-------------|
+| **学習率** | 1e-4 | 最適化学習率 |
+| **最大エポック** | 500 | 最大トレーニングエポック |
+| **バッチサイズ** | 1 | トレーニングバッチサイズ |
+| **勾配累積** | 1 | 有効バッチ = batch_size × accumulation |
+| **Nエポックごとに保存** | 200 | チェックポイント保存頻度 |
+| **シフト** | 3.0 | turboモデルのタイムステップシフト |
+| **シード** | 42 | 再現性のためのランダムシード |
+#### トレーニングコントロール
+- **トレーニング開始** - トレーニングプロセスを開始
+- **トレーニング停止** - トレーニングを中断
+- **トレーニング進捗** - 現在のエポックとロスを表示
+- **トレーニングログ** - 詳細なトレーニング出力
+- **トレーニングロスプロット** - 視覚的なロス曲線
+#### LoRAのエクスポート
+トレーニング後、最終アダプターをエクスポート：
+1. エクスポートパスを入力
+2. **LoRAをエクスポート** をクリック
+---
+## ヒントとベストプラクティス
+### 最高品質のために
+1. **thinkingモードを使用** - LM強化生成のために「Think」チェックボックスを有効に保つ
+2. **captionを具体的に** - ジャンル、楽器、ムード、スタイルの詳細を含める
+3. **LMにメタデータを検出させる** - 自動検出のためにBPM/キー/Durationを空のままにする
+4. **バッチ生成を使用** - 2-4のバリエーションを生成し、最良のものを選ぶ
+### より高速な生成のために
+1. **turboモデルを使用** - `acestep-v15-turbo` または `acestep-v15-turbo-rl` を選択
+2. **推論ステップを8に保つ** - turboに最適なデフォルト
+3. **バッチサイズを減らす** - 迅速な結果が必要な場合はバッチサイズを下げる
+4. **AutoGenを無効化** - バッチ生成の手動制御
+### 一貫した結果のために
+1. **特定のシードを設定** - 「ランダムシード」のチェックを外してシード値を入力
+2. **良い結果を保存** - 再現のためにパラメータをエクスポートするために「保存」を使用
+3. **「これらの設定を適用」を使用** - 良いバッチからパラメータを復元
+### 長尺音楽のために
+1. **明示的なdurationを設定** - 秒単位でdurationを指定
+2. **repaintタスクを使用** - 初期生成後に問題のあるセクションを修正
+3. **生成をチェーン** - 以前の結果の上に構築するために「ソースに送信」を使用
+### スタイルの一貫性のために
+1. **LoRAをトレーニング** - あなたのスタイル用のカスタムアダプターを作成
+2. **参照オーディオを使用** - オーディオアップロードでスタイル参照をアップロード
+3. **一貫したcaptionを使用** - 類似の説明的な言語を維持
+### トラブルシューティング
+**オーディオが生成されない：**
+- モデルが初期化されていることを確認（緑のステータスメッセージ）
+- thinkingモードを使用している場合は5Hz LMが初期化されていることを確認
+- エラーメッセージのステータス出力を確認
+**結果の品質が悪い：**
+- 推論ステップを増やす（baseモデルの場合）
+- ガイダンススケールを調整
+- 異なるシードを試す
+- captionをより具体的にする
+**メモリ不足：**
+- バッチサイズを減らす
+- CPUオフロードを有効化
+- LMバッチチャンクサイズを減らす
+**LMが機能しない：**
+- 初期化時に「5Hz LMを初期化」がチェックされていたことを確認
+- 有効なLMモデルパスが選択されていることを確認
+- vllmまたはPyTorchバックエンドが利用可能であることを確認
+---
+## キーボードショートカット
+Gradioインターフェースは標準的なWebショートカットをサポート：
+- **Tab** - 入力フィールド間を移動
+- **Enter** - テキスト入力を送信
+- **Space** - チェックボックスを切り替え
+---
+## 言語サポート
+インターフェースは複数のUI言語をサポート：
+- **英語** (en)
+- **中国語** (zh)
+- **日本語** (ja)
+サービス設定セクションで好みの言語を選択してください。
+---
+詳細については以下を参照：
+- メインREADME：[`../../README.md`](../../README.md)
+- REST APIドキュメント：[`API.md`](API.md)
+- Python推論API：[`INFERENCE.md`](INFERENCE.md)

docs/ja/INFERENCE.md ADDED Viewed

	@@ -0,0 +1,739 @@

+# ACE-Step 推論 API ドキュメント
+**Language / 语言 / 言語:** [English](../en/INFERENCE.md) | [中文](../zh/INFERENCE.md) | [日本語](INFERENCE.md)
+---
+本ドキュメントはACE-Step推論APIの包括的なドキュメントを提供し、サポートされているすべてのタスクタイプのパラメータ仕様を含みます。
+## 目次
+- [クイックスタート](#クイックスタート)
+- [API概要](#api概要)
+- [GenerationParamsパラメータ](#generationparamsパラメータ)
+- [GenerationConfigパラメータ](#generationconfigパラメータ)
+- [タスクタイプ](#タスクタイプ)
+- [ヘルパー関数](#ヘルパー関数)
+- [完全な例](#完全な例)
+- [ベストプラクティス](#ベストプラクティス)
+---
+## クイックスタート
+### 基本的な使用法
+```python
+from acestep.handler import AceStepHandler
+from acestep.llm_inference import LLMHandler
+from acestep.inference import GenerationParams, GenerationConfig, generate_music
+# ハンドラーの初期化
+dit_handler = AceStepHandler()
+llm_handler = LLMHandler()
+# サービスの初期化
+dit_handler.initialize_service(
+    project_root="/path/to/project",
+    config_path="acestep-v15-turbo-rl",
+    device="cuda"
+)
+llm_handler.initialize(
+    checkpoint_dir="/path/to/checkpoints",
+    lm_model_path="acestep-5Hz-lm-0.6B-v3",
+    backend="vllm",
+    device="cuda"
+)
+# 生成パラメータの設定
+params = GenerationParams(
+    caption="重低音のアップビートなエレクトロニックダンスミュージック",
+    bpm=128,
+    duration=30,
+)
+# 生成設定の構成
+config = GenerationConfig(
+    batch_size=2,
+    audio_format="flac",
+)
+# 音楽を生成
+result = generate_music(dit_handler, llm_handler, params, config, save_dir="/path/to/output")
+# 結果にアクセス
+if result.success:
+    for audio in result.audios:
+        print(f"生成完了：{audio['path']}")
+        print(f"Key：{audio['key']}")
+        print(f"Seed：{audio['params']['seed']}")
+else:
+    print(f"エラー：{result.error}")
+```
+---
+## API概要
+### メイン関数
+#### generate_music
+```python
+def generate_music(
+    dit_handler,
+    llm_handler,
+    params: GenerationParams,
+    config: GenerationConfig,
+    save_dir: Optional[str] = None,
+    progress=None,
+) -> GenerationResult
+```
+ACE-Stepモデルを使用して音楽を生成するメイン関数。
+#### understand_music
+```python
+def understand_music(
+    llm_handler,
+    audio_codes: str,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> UnderstandResult
+```
+オーディオセマンティックコードを分析し、メタデータ（caption、lyrics、BPM、キーなど）を抽出します。
+#### create_sample
+```python
+def create_sample(
+    llm_handler,
+    query: str,
+    instrumental: bool = False,
+    vocal_language: Optional[str] = None,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> CreateSampleResult
+```
+自然言語の説明から完全な音楽サンプル（caption、lyrics、メタデータ）を生成します。
+#### format_sample
+```python
+def format_sample(
+    llm_handler,
+    caption: str,
+    lyrics: str,
+    user_metadata: Optional[Dict[str, Any]] = None,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> FormatSampleResult
+```
+ユーザー提供のcaptionとlyricsをフォーマット・強化し、構造化されたメタデータを生成します。
+### 設定オブジェクト
+APIは2つの設定データクラスを使用します：
+**GenerationParams** - すべての音楽生成パラメータを含む：
+```python
+@dataclass
+class GenerationParams:
+    # タスクと指示
+    task_type: str = "text2music"
+    instruction: str = "Fill the audio semantic mask based on the given conditions:"
+    # オーディオアップロード
+    reference_audio: Optional[str] = None
+    src_audio: Optional[str] = None
+    # LMコードヒント
+    audio_codes: str = ""
+    # テキスト入力
+    caption: str = ""
+    lyrics: str = ""
+    instrumental: bool = False
+    # メタデータ
+    vocal_language: str = "unknown"
+    bpm: Optional[int] = None
+    keyscale: str = ""
+    timesignature: str = ""
+    duration: float = -1.0
+    # 高度な設定
+    inference_steps: int = 8
+    seed: int = -1
+    guidance_scale: float = 7.0
+    use_adg: bool = False
+    cfg_interval_start: float = 0.0
+    cfg_interval_end: float = 1.0
+    shift: float = 1.0                    # 新規：タイムステップシフト係数
+    infer_method: str = "ode"             # 新規：拡散推論方法
+    timesteps: Optional[List[float]] = None  # 新規：カスタムタイムステップ
+    repainting_start: float = 0.0
+    repainting_end: float = -1
+    audio_cover_strength: float = 1.0
+    # 5Hz言語モデルパラメータ
+    thinking: bool = True
+    lm_temperature: float = 0.85
+    lm_cfg_scale: float = 2.0
+    lm_top_k: int = 0
+    lm_top_p: float = 0.9
+    lm_negative_prompt: str = "NO USER INPUT"
+    use_cot_metas: bool = True
+    use_cot_caption: bool = True
+    use_cot_lyrics: bool = False
+    use_cot_language: bool = True
+    use_constrained_decoding: bool = True
+    # CoT生成値（LMによって自動入力）
+    cot_bpm: Optional[int] = None
+    cot_keyscale: str = ""
+    cot_timesignature: str = ""
+    cot_duration: Optional[float] = None
+    cot_vocal_language: str = "unknown"
+    cot_caption: str = ""
+    cot_lyrics: str = ""
+```
+**GenerationConfig** - バッチと出力設定を含む：
+```python
+@dataclass
+class GenerationConfig:
+    batch_size: int = 2
+    allow_lm_batch: bool = False
+    use_random_seed: bool = True
+    seeds: Optional[List[int]] = None
+    lm_batch_chunk_size: int = 8
+    constrained_decoding_debug: bool = False
+    audio_format: str = "flac"
+```
+### 結果オブジェクト
+**GenerationResult** - 音楽生成の結果：
+```python
+@dataclass
+class GenerationResult:
+    # オーディオ出力
+    audios: List[Dict[str, Any]]  # オーディオ辞書のリスト
+    # 生成情報
+    status_message: str           # 生成からのステータスメッセージ
+    extra_outputs: Dict[str, Any] # 追加出力（latents、masks、lm_metadata、time_costs）
+    # 成功ステータス
+    success: bool                 # 生成が成功したかどうか
+    error: Optional[str]          # 失敗した場合のエラーメッセージ
+```
+**オーディオ辞書構造：**
+`audios` リストの各アイテムには以下が含まれます：
+```python
+{
+    "path": str,           # 保存されたオーディオへのファイルパス
+    "tensor": Tensor,      # オーディオテンソル [channels, samples]、CPU、float32
+    "key": str,            # ユニークなオーディオキー（パラメータに基づくUUID）
+    "sample_rate": int,    # サンプルレート（デフォルト：48000）
+    "params": Dict,        # このオーディオの生成パラメータ（seed、audio_codesなどを含む）
+}
+```
+---
+## GenerationParamsパラメータ
+### テキスト入力
+| パラメータ | 型 | デフォルト | 説明 |
+|-----------|------|---------|-------------|
+| `caption` | `str` | `""` | 希望する音楽のテキスト説明。「リラックスしたピアノ音楽」のような単純なプロンプトや、ジャンル、ムード、楽器などを含む詳細な説明が可能。最大512文字。|
+| `lyrics` | `str` | `""` | ボーカル音楽の歌詞テキスト。インストゥルメンタルトラックには `"[Instrumental]"` を使用。複数言語をサポート。最大4096文字。|
+| `instrumental` | `bool` | `False` | Trueの場合、歌詞に関係なくインストゥルメンタル音楽を生成。|
+### 音楽メタデータ
+| パラメータ | 型 | デフォルト | 説明 |
+|-----------|------|---------|-------------|
+| `bpm` | `Optional[int]` | `None` | 1分あたりのビート数（30-300）。`None` でLM経由の自動検出を有効化。|
+| `keyscale` | `str` | `""` | 音楽キー（例：「C Major」、「Am」、「F# minor」）。空文字列で自動検出を有効化。|
+| `timesignature` | `str` | `""` | 拍子記号（2は'2/4'、3は'3/4'、4は'4/4'、6は'6/8'）。空文字列で自動検出を有効化。|
+| `vocal_language` | `str` | `"unknown"` | ボーカルの言語コード（ISO 639-1）。サポート：`"en"`、`"zh"`、`"ja"`、`"es"`、`"fr"` など。自動検出には `"unknown"` を使用。|
+| `duration` | `float` | `-1.0` | 目標オーディオ長（秒）（10-600）。<= 0またはNoneの場合、モデルが歌詞の長さに基づいて自動選択。|
+### 生成パラメータ
+| パラメータ | 型 | デフォルト | 説明 |
+|-----------|------|---------|-------------|
+| `inference_steps` | `int` | `8` | デノイズステップ数。Turboモデル：1-20（推奨8）。Baseモデル：1-200（推奨32-64）。高い = 品質向上だが遅い。|
+| `guidance_scale` | `float` | `7.0` | 分類器フリーガイダンススケール（1.0-15.0）。高い値はテキストプロンプトへの忠実性を増加。非turboモデルのみサポート。典型的な範囲：5.0-9.0。|
+| `seed` | `int` | `-1` | 再現性のためのランダムシード。ランダムシードには `-1`、固定シードには任意の正の整数を使用。|
+### 高度なDiTパラメータ
+| パラメータ | 型 | デフォルト | 説明 |
+|-----------|------|---------|-------------|
+| `use_adg` | `bool` | `False` | 適応デュアルガイダンスを使用（baseモデルのみ）。速度を犠牲にして品質を向上。|
+| `cfg_interval_start` | `float` | `0.0` | CFG適用開始比率（0.0-1.0）。分類器フリーガイダンスの適用開始タイミングを制御。|
+| `cfg_interval_end` | `float` | `1.0` | CFG適用終了比率（0.0-1.0）。分類器フリーガイダンスの適用終了タイミングを制御。|
+| `shift` | `float` | `1.0` | タイムステップシフト係数（範囲1.0-5.0、デフォルト1.0）。!= 1.0の場合、タイムステップに `t = shift * t / (1 + (shift - 1) * t)` を適用。turboモデルには3.0推奨。|
+| `infer_method` | `str` | `"ode"` | 拡散推論方法。`"ode"`（Euler）はより高速で決定的。`"sde"`（確率的）は分散のある異なる結果を生成する可能性あり。|
+| `timesteps` | `Optional[List[float]]` | `None` | カスタムタイムステップ、1.0から0.0の浮動小数点リスト（例：`[0.97, 0.76, 0.615, 0.5, 0.395, 0.28, 0.18, 0.085, 0]`）。提供された場合、`inference_steps` と `shift` をオーバーライド。|
+### タスク固有パラメータ
+| パラメータ | 型 | デフォルト | 説明 |
+|-----------|------|---------|-------------|
+| `task_type` | `str` | `"text2music"` | 生成タスクタイプ。詳細は[タスクタイプ](#タスクタイプ)セクションを参照。|
+| `instruction` | `str` | `"Fill the audio semantic mask based on the given conditions:"` | タスク固有の指示プロンプト。|
+| `reference_audio` | `Optional[str]` | `None` | スタイル転送または継続タスク用の参照オーディオファイルパス。|
+| `src_audio` | `Optional[str]` | `None` | オーディオ間タスク（cover、repaintなど）用のソースオーディオファイルパス。|
+| `audio_codes` | `str` | `""` | 事前抽出された5Hzオーディオセマンティックコード文字列。高度な使用のみ。|
+| `repainting_start` | `float` | `0.0` | リペイント開始時間（秒）（repaint/legoタスク用）。|
+| `repainting_end` | `float` | `-1` | リペイント終了時間（秒）。オーディオの終端には `-1` を使用。|
+| `audio_cover_strength` | `float` | `1.0` | オーディオカバー/コードの影響強度（0.0-1.0）。スタイル転送タスクには小さい値（0.2）を設定。|
+### 5Hz言語モデルパラメータ
+| パラメータ | 型 | デフォルト | 説明 |
+|-----------|------|---------|-------------|
+| `thinking` | `bool` | `True` | セマンティック/音楽メタデータとコード用の5Hz言語モデル「思考の連鎖」推論を有効化。|
+| `lm_temperature` | `float` | `0.85` | LMサンプリング温度（0.0-2.0）。高い = より創造的/多様、低い = より保守的。|
+| `lm_cfg_scale` | `float` | `2.0` | LM分類器フリーガイダンススケール。高い = プロンプトへのより強い忠実性。|
+| `lm_top_k` | `int` | `0` | LM top-kサンプリング。`0` でtop-kフィルタリングを無効化。典型的な値：40-100。|
+| `lm_top_p` | `float` | `0.9` | LM核サンプリング（0.0-1.0）。`1.0` で核サンプリングを無効化。典型的な値：0.9-0.95。|
+| `lm_negative_prompt` | `str` | `"NO USER INPUT"` | LMガイダンス用のネガティブプロンプト。不要な特性を避けるのに役立つ。|
+| `use_cot_metas` | `bool` | `True` | LM CoT推論を使用してメタデータを生成（BPM、キー、duration など）。|
+| `use_cot_caption` | `bool` | `True` | LM CoT推論を使用してユーザーcaptionを改良。|
+| `use_cot_language` | `bool` | `True` | LM CoT推論を使用してボーカル言語を検出。|
+| `use_cot_lyrics` | `bool` | `False` | （将来の使用のために予約）LM CoTを使用して歌詞を生成/改良。|
+| `use_constrained_decoding` | `bool` | `True` | 構造化されたLM出力のための制約付きデコーディングを有効化。|
+---
+## GenerationConfigパラメータ
+| パラメータ | 型 | デフォルト | 説明 |
+|-----------|------|---------|-------------|
+| `batch_size` | `int` | `2` | 並列生成するサンプル数（1-8）。高い値はより多くのGPUメモリを必要とする。|
+| `allow_lm_batch` | `bool` | `False` | LMでのバッチ処理を許可。`batch_size >= 2` かつ `thinking=True` の場合により高速。|
+| `use_random_seed` | `bool` | `True` | ランダムシードを使用するかどうか。`True` で毎回異なる結果、`False` で再現可能な結果。|
+| `seeds` | `Optional[List[int]]` | `None` | バッチ生成用のシードリスト。提供された場合、batch_sizeより少なければランダムシードでパディング。単一のintも可。|
+| `lm_batch_chunk_size` | `int` | `8` | LM推論チャンクあたりの最大バッチサイズ（GPUメモリ制約）。|
+| `constrained_decoding_debug` | `bool` | `False` | 制約付きデコーディングのデバッグログを有効化。|
+| `audio_format` | `str` | `"flac"` | 出力オーディオ形式。オプション：`"mp3"`、`"wav"`、`"flac"`。高速保存のためデフォルトはFLAC。|
+---
+## タスクタイプ
+ACE-Stepは6種類の生成タスクタイプをサポートし、それぞれ特定のユースケースに最適化されています。
+### 1. Text2Music（デフォルト）
+**目的**：テキスト説明とオプションのメタデータから音楽を生成。
+**主要パラメータ**：
+```python
+params = GenerationParams(
+    task_type="text2music",
+    caption="エレキギターのエネルギッシュなロック音楽",
+    lyrics="[Instrumental]",  # または実際の歌詞
+    bpm=140,
+    duration=30,
+)
+```
+**必須**：
+- `caption` または `lyrics`（少なくとも1つ）
+**オプションだが推奨**：
+- `bpm`：テンポを制御
+- `keyscale`：音楽キーを制御
+- `timesignature`：リズム構造を制御
+- `duration`：長さを制御
+- `vocal_language`：ボーカル特性を制御
+**ユースケース**：
+- テキスト説明から音楽を生成
+- プロンプトからバッキングトラックを作成
+- 歌詞付きの曲を生成
+---
+### 2. Cover
+**目的**：既存のオーディオを構造を維持しながらスタイル/音色を変更して変換。
+**主要パラメータ**：
+```python
+params = GenerationParams(
+    task_type="cover",
+    src_audio="original_song.mp3",
+    caption="ジャズピアノバージョン",
+    audio_cover_strength=0.8,  # 0.0-1.0
+)
+```
+**必須**：
+- `src_audio`：ソースオーディオファイルパス
+- `caption`：希望するスタイル/変換の説明
+**オプション**：
+- `audio_cover_strength`：元のオーディオの影響を制御
+  - `1.0`：元の構造を強く維持
+  - `0.5`：バランスの取れた変換
+  - `0.1`：緩やかな解釈
+- `lyrics`：新しい歌詞（ボーカルを変更する場合）
+**ユースケース**：
+- 異なるスタイルのカバーを作成
+- メロディを維持しながら楽器編成を変更
+- ジャンル変換
+---
+### 3. Repaint
+**目的**：オーディオの特定の時間セグメントを再生成し、残りは変更しない。
+**主要パラメータ**：
+```python
+params = GenerationParams(
+    task_type="repaint",
+    src_audio="original.mp3",
+    repainting_start=10.0,  # 秒
+    repainting_end=20.0,    # 秒
+    caption="ピアノソロでスムーズなトランジション",
+)
+```
+**必須**：
+- `src_audio`：ソースオーディオファイルパス
+- `repainting_start`：開始時間（秒）
+- `repainting_end`：終了時間（秒）（ファイル終端には `-1` を使用）
+- `caption`：リペイントセクションの希望するコンテンツの説明
+**ユースケース**：
+- 生成された音楽の特定セクションを修正
+- 曲の一部にバリエーションを追加
+- スムーズなトランジションを作成
+- 問題のあるセグメントを置き換え
+---
+### 4. Lego（Baseモデルのみ）
+**目的**：既存のオーディオのコンテキストで特定の楽器トラックを生成。
+**主要パラメータ**：
+```python
+params = GenerationParams(
+    task_type="lego",
+    src_audio="backing_track.mp3",
+    instruction="Generate the guitar track based on the audio context:",
+    caption="ブルージーな感じのリードギターメロディ",
+    repainting_start=0.0,
+    repainting_end=-1,
+)
+```
+**必須**：
+- `src_audio`：ソース/バッキングオーディオパス
+- `instruction`：トラックタイプを指定する必要あり（例：「Generate the {TRACK_NAME} track...」）
+- `caption`：希望するトラック特性の説明
+**利用可能なトラック**：
+- `"vocals"`、`"backing_vocals"`、`"drums"`、`"bass"`、`"guitar"`、`"keyboard"`、
+- `"percussion"`、`"strings"`、`"synth"`、`"fx"`、`"brass"`、`"woodwinds"`
+**ユースケース**：
+- 特定の楽器トラックを追加
+- バッキングトラック上に追加の楽器をレイヤー
+- マルチトラック作品を反復的に作成
+---
+### 5. Extract（Baseモデルのみ）
+**目的**：ミックスオーディオから特定の楽器トラックを抽出/分離。
+**主要パラメータ**：
+```python
+params = GenerationParams(
+    task_type="extract",
+    src_audio="full_mix.mp3",
+    instruction="Extract the vocals track from the audio:",
+)
+```
+**必須**：
+- `src_audio`：ミックスオーディオファイルパス
+- `instruction`：抽出するトラックを指定する必要あり
+**利用可能なトラック**：Legoタスクと同じ
+**ユースケース**：
+- ステム分離
+- 特定の楽器を分離
+- リミックスを作成
+- 個別トラックを分析
+---
+### 6. Complete（Baseモデルのみ）
+**目的**：指定された楽器で部分的なトラックを完成/拡張。
+**主要パラメータ**：
+```python
+params = GenerationParams(
+    task_type="complete",
+    src_audio="incomplete_track.mp3",
+    instruction="Complete the input track with drums, bass, guitar:",
+    caption="ロックスタイルの完成",
+)
+```
+**必須**：
+- `src_audio`：不完全/部分的なトラックのパス
+- `instruction`：追加するトラックを指定する必要あり
+- `caption`：希望するスタイルの説明
+**ユースケース**：
+- 不完全な作品をアレンジ
+- バッキングトラックを追加
+- 音楽アイデアを自動完成
+---
+## ヘルパー関数
+### understand_music
+オーディオコードを分析して音楽についてのメタデータを抽出。
+```python
+from acestep.inference import understand_music
+result = understand_music(
+    llm_handler=llm_handler,
+    audio_codes="<|audio_code_123|><|audio_code_456|>...",
+    temperature=0.85,
+    use_constrained_decoding=True,
+)
+if result.success:
+    print(f"Caption：{result.caption}")
+    print(f"歌詞：{result.lyrics}")
+    print(f"BPM：{result.bpm}")
+    print(f"キー：{result.keyscale}")
+    print(f"長さ：{result.duration}秒")
+    print(f"言語：{result.language}")
+else:
+    print(f"エラー：{result.error}")
+```
+**ユースケース**：
+- 既存の音楽を分析
+- オーディオコードからメタデータを抽出
+- 生成パラメータをリバースエンジニアリング
+---
+### create_sample
+自然言語の説明から完全な音楽サンプルを生成。これは「シンプルモード」/「インスピレーションモード」機能です。
+```python
+from acestep.inference import create_sample
+result = create_sample(
+    llm_handler=llm_handler,
+    query="静かな夜のための柔らかいベンガルのラブソング",
+    instrumental=False,
+    vocal_language="bn",  # オプション：ベンガル語に制限
+    temperature=0.85,
+)
+if result.success:
+    print(f"Caption：{result.caption}")
+    print(f"歌詞：{result.lyrics}")
+    print(f"BPM：{result.bpm}")
+    print(f"長さ：{result.duration}秒")
+    print(f"キー：{result.keyscale}")
+    print(f"インストゥルメンタルか：{result.instrumental}")
+    # generate_musicと一緒に使用
+    params = GenerationParams(
+        caption=result.caption,
+        lyrics=result.lyrics,
+        bpm=result.bpm,
+        duration=result.duration,
+        keyscale=result.keyscale,
+        vocal_language=result.language,
+    )
+else:
+    print(f"エラー：{result.error}")
+```
+---
+### format_sample
+ユーザー提供のcaptionとlyricsをフォーマット・強化し、構造化されたメタデータを生成。
+```python
+from acestep.inference import format_sample
+result = format_sample(
+    llm_handler=llm_handler,
+    caption="ラテンポップ、レゲトン",
+    lyrics="[Verse 1]\nBailando en la noche...",
+    user_metadata={"bpm": 95},  # オプション：特定の値を制約
+    temperature=0.85,
+)
+if result.success:
+    print(f"強化されたCaption：{result.caption}")
+    print(f"フォーマットされた歌詞：{result.lyrics}")
+    print(f"BPM：{result.bpm}")
+    print(f"長さ：{result.duration}秒")
+    print(f"キー：{result.keyscale}")
+    print(f"検出された言語：{result.language}")
+else:
+    print(f"エラー：{result.error}")
+```
+---
+## ベストプラクティス
+### 1. Captionの書き方
+**良いCaption**：
+```python
+# 具体的で説明的
+caption="重低音とシンセサイザーリードのアップビートなエレクトロニックダンスミュージック"
+# ムードとジャンルを含む
+caption="アコースティックギターと柔らかいボーカルのメランコリックなインディーフォーク"
+# 楽器を指定
+caption="ピアノ、アップライトベース、ブラシドラムのジャズトリオ"
+```
+**避けるべき**：
+```python
+# 曖昧すぎる
+caption="良い音楽"
+# 矛盾
+caption="速い遅い音楽"  # テンポの矛盾
+```
+### 2. パラメータチューニング
+**最高品質のために**：
+- baseモデルを使用し、`inference_steps=64` 以上
+- `use_adg=True` を有効化
+- `guidance_scale=7.0-9.0` を設定
+- より良いタイムステップ分布のために `shift=3.0` を設定
+- ロスレスオーディオ形式を使用（`audio_format="wav"`）
+**速度のために**：
+- turboモデルを使用し、`inference_steps=8`
+- ADGを無効化（`use_adg=False`）
+- `infer_method="ode"`（デフォルト）を使用
+- 圧縮形式を使用（`audio_format="mp3"`）またはデフォルトのFLAC
+**一貫性のために**：
+- configで `use_random_seed=False` を設定
+- 固定 `seeds` リストまたはparamsで単一 `seed` を使用
+- `lm_temperature` を低く保つ（0.7-0.85）
+**多様性のために**：
+- configで `use_random_seed=True` を設定
+- `lm_temperature` を増加（0.9-1.1）
+- バリエーションのために `batch_size > 1` を使用
+### 3. Durationガイドライン
+- **インストゥルメンタル**：30-180秒が適切
+- **歌詞付き**：自動検出を推奨（`duration=-1` を設定またはデフォルトのまま）
+- **短いクリップ**：最小10-20秒
+- **長尺**：最大600秒（10分）
+### 4. LMの使用
+**LMを有効にする場合（`thinking=True`）**：
+- 自動メタデータ検出が必要
+- caption改良が欲しい
+- 最小限の入力から生成
+- 多様な出力が必要
+**LMを無効にする場合（`thinking=False`）**：
+- すでに正確なメタデータがある
+- より高速な生成が必要
+- パラメータの完全な制御が欲しい
+---
+## トラブルシューティング
+### よくある問題
+**問題**：メモリ不足エラー
+- **解決策**：`batch_size`、`inference_steps` を減らすか、CPUオフロードを有効化
+**問題**：結果の品質が悪い
+- **解決策**：`inference_steps` を増やす、`guidance_scale` を調整、baseモデルを使用
+**問題**：結果がプロンプトと一致しない
+- **解決策**：captionをより具体的に、`guidance_scale` を増やす、LM改良を有効化（`thinking=True`）
+**問題**：生成が遅い
+- **解決策**：turboモデルを使用、`inference_steps` を減らす、ADGを無効化
+**問題**：LMがコードを生成しない
+- **解決策**：`llm_handler` が初期化されていることを確認、`thinking=True` と `use_cot_metas=True` を確認
+**問題**：シードが尊重されない
+- **解決策**：configで `use_random_seed=False` を設定し、`seeds` リストまたはparamsで `seed` を提供
+**問題**：カスタムタイムステップが機能しない
+- **解決策**：タイムステップが1.0から0.0の浮動小数点リストで、適切に順序付けられていることを確認
+---
+詳細については以下を参照：
+- メインREADME：[`../../README.md`](../../README.md)
+- REST APIドキュメント：[`API.md`](API.md)
+- Gradioデモガイド：[`GRADIO_GUIDE.md`](GRADIO_GUIDE.md)
+- プロジェクトリポジトリ：[ACE-Step-1.5](https://github.com/ace-step/ACE-Step-1.5)

docs/zh/API.md ADDED Viewed

	@@ -0,0 +1,570 @@

+# ACE-Step API 客户端文档
+**Language / 语言 / 言語:** [English](../en/API.md) | [中文](API.md) | [日本語](../ja/API.md)
+---
+本服务提供基于 HTTP 的异步音乐生成 API。
+**基本工作流程**：
+1. 调用 `POST /v1/music/generate` 提交任务并获取 `job_id`。
+2. 调用 `GET /v1/jobs/{job_id}` 轮询任务状态，直到 `status` 为 `succeeded` 或 `failed`。
+3. 通过结果中返回的 `GET /v1/audio?path=...` URL 下载音频文件。
+---
+## 目录
+- [任务状态说明](#1-任务状态说明)
+- [创建生成任务](#2-创建生成任务)
+- [查询任务结果](#3-查询任务结果)
+- [随机样本生成](#4-随机样本生成)
+- [列出可用模型](#5-列出可用模型)
+- [下载音频文件](#6-下载音频文件)
+- [健康检查](#7-健康检查)
+- [环境变量](#8-环境变量)
+---
+## 1. 任务状态说明
+任务状态（`status`）包括以下类型：
+- `queued`：任务已进入队列，等待执行。此时可以查看 `queue_position` 和 `eta_seconds`。
+- `running`：生成正在进行中。
+- `succeeded`：生成成功，结果在 `result` 字段中。
+- `failed`：生成失败，错误信息在 `error` 字段中。
+---
+## 2. 创建生成任务
+### 2.1 API 定义
+- **URL**：`/v1/music/generate`
+- **方法**：`POST`
+- **Content-Type**：`application/json`、`multipart/form-data` 或 `application/x-www-form-urlencoded`
+### 2.2 请求参数
+#### 参数命名约定
+API 支持大多数参数的 **snake_case** 和 **camelCase** 命名。例如：
+- `audio_duration` / `duration` / `audioDuration`
+- `key_scale` / `keyscale` / `keyScale`
+- `time_signature` / `timesignature` / `timeSignature`
+- `sample_query` / `sampleQuery` / `description` / `desc`
+- `use_format` / `useFormat` / `format`
+此外，元数据可以通过嵌套对象传递（`metas`、`metadata` 或 `user_metadata`）。
+#### 方法 A：JSON 请求（application/json）
+适用于仅传递文本参数，或引用服务器上已存在的音频文件路径。
+**基本参数**：
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `caption` | string | `""` | 音乐描述提示词 |
+| `lyrics` | string | `""` | 歌词内容 |
+| `thinking` | bool | `false` | 是否使用 5Hz LM 生成音频代码（lm-dit 行为）|
+| `vocal_language` | string | `"en"` | 歌词语言（en、zh、ja 等）|
+| `audio_format` | string | `"mp3"` | 输出格式（mp3、wav、flac）|
+**样本/描述模式参数**：
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `sample_mode` | bool | `false` | 启用随机样本生成模式（通过 LM 自动生成 caption/lyrics/metas）|
+| `sample_query` | string | `""` | 用于样本生成的自然语言描述（例如"一首柔和的孟加拉情歌"）。别名：`description`、`desc` |
+| `use_format` | bool | `false` | 使用 LM 增强/格式化提供的 caption 和 lyrics。别名：`format` |
+**多模型支持**：
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `model` | string | null | 选择使用哪个 DiT 模型（例如 `"acestep-v15-turbo"`、`"acestep-v15-turbo-rl"`）。使用 `/v1/models` 列出可用模型。如果未指定，使用默认模型。|
+**thinking 语义（重要）**：
+- `thinking=false`：
+  - 服务器**不会**使用 5Hz LM 生成 `audio_code_string`。
+  - DiT 以 **text2music** 模式运行，**忽略**任何提供的 `audio_code_string`。
+- `thinking=true`：
+  - 服务器将使用 5Hz LM 生成 `audio_code_string`（lm-dit 行为）。
+  - DiT 使用 LM 生成的代码运行，以增强音乐质量。
+**元数据自动补全（条件性）**：
+当 `use_cot_caption=true` 或 `use_cot_language=true` 或元数据字段缺失时，服务器可能会调用 5Hz LM 根据 `caption`/`lyrics` 填充缺失的字段：
+- `bpm`
+- `key_scale`
+- `time_signature`
+- `audio_duration`
+用户提供的值始终优先；LM 只填充空/缺失的字段。
+**音乐属性参数**：
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `bpm` | int | null | 指定节奏（BPM），范围 30-300 |
+| `key_scale` | string | `""` | 调性（例如"C Major"、"Am"）。别名：`keyscale`、`keyScale` |
+| `time_signature` | string | `""` | 拍号（2、3、4、6 分别表示 2/4、3/4、4/4、6/8）。别名：`timesignature`、`timeSignature` |
+| `audio_duration` | float | null | 生成时长（秒），范围 10-600。别名：`duration`、`target_duration` |
+**音频代码（可选）**：
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `audio_code_string` | string 或 string[] | `""` | 用于 `llm_dit` 的音频语义令牌（5Hz）。别名：`audioCodeString` |
+**生成控制参数**：
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `inference_steps` | int | `8` | 推理步数。Turbo 模型：1-20（推荐 8）。Base 模型：1-200（推荐 32-64）|
+| `guidance_scale` | float | `7.0` | 提示引导系数。仅对 base 模型有效 |
+| `use_random_seed` | bool | `true` | 是否使用随机种子 |
+| `seed` | int | `-1` | 指定种子（当 use_random_seed=false 时）|
+| `batch_size` | int | `2` | 批量生成数量（最多 8）|
+**高级 DiT 参数**：
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `shift` | float | `3.0` | 时间步偏移因子（范围 1.0-5.0）。仅对 base 模型有效，对 turbo 模型无效 |
+| `infer_method` | string | `"ode"` | 扩散推理方法：`"ode"`（Euler，更快）或 `"sde"`（随机）|
+| `timesteps` | string | null | 自定义时间步，逗号分隔值（例如 `"0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0"`）。覆盖 `inference_steps` 和 `shift` |
+| `use_adg` | bool | `false` | 使用自适应双引导（仅 base 模型）|
+| `cfg_interval_start` | float | `0.0` | CFG 应用起始比例（0.0-1.0）|
+| `cfg_interval_end` | float | `1.0` | CFG 应用结束比例（0.0-1.0）|
+**5Hz LM 参数（可选，服务器端）**：
+这些参数控制 5Hz LM 采样，用于元数据自动补全和（当 `thinking=true` 时）代码生成。
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `lm_model_path` | string | null | 5Hz LM 检查点目录名（例如 `acestep-5Hz-lm-0.6B-v3`）|
+| `lm_backend` | string | `"vllm"` | `vllm` 或 `pt` |
+| `lm_temperature` | float | `0.85` | 采样温度 |
+| `lm_cfg_scale` | float | `2.5` | CFG 比例（>1 启用 CFG）|
+| `lm_negative_prompt` | string | `"NO USER INPUT"` | CFG 使用的负面提示 |
+| `lm_top_k` | int | null | Top-k（0/null 禁用）|
+| `lm_top_p` | float | `0.9` | Top-p（>=1 将被视为禁用）|
+| `lm_repetition_penalty` | float | `1.0` | 重复惩罚 |
+**LM CoT（思维链）参数**：
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `use_cot_caption` | bool | `true` | 让 LM 通过 CoT 推理重写/增强输入 caption。别名：`cot_caption`、`cot-caption` |
+| `use_cot_language` | bool | `true` | 让 LM 通过 CoT 检测人声语言。别名：`cot_language`、`cot-language` |
+| `constrained_decoding` | bool | `true` | 启用基于 FSM 的约束解码以获得结构化 LM 输出。别名：`constrainedDecoding`、`constrained` |
+| `constrained_decoding_debug` | bool | `false` | 启用约束解码的调试日志 |
+**编辑/参考音频参数**（需要服务器上的绝对路径）：
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `reference_audio_path` | string | null | 参考音频路径（风格迁移）|
+| `src_audio_path` | string | null | 源音频路径（重绘/翻唱）|
+| `task_type` | string | `"text2music"` | 任务类型：`text2music`、`cover`、`repaint`、`lego`、`extract`、`complete` |
+| `instruction` | string | auto | 编辑指令（如未提供则根据 task_type 自动生成）|
+| `repainting_start` | float | `0.0` | 重绘开始时间（秒）|
+| `repainting_end` | float | null | 重绘结束时间（秒），-1 表示音频末尾 |
+| `audio_cover_strength` | float | `1.0` | 翻唱强度（0.0-1.0）。风格迁移使用较小值（0.2）|
+#### 方法 B：文件上传（multipart/form-data）
+当需要上传本地音频文件作为参考或源音频时使用。
+除了支持上述所有字段作为表单字段外，还支持以下文件字段：
+- `reference_audio`：（文件）上传参考音频文件
+- `src_audio`：（文件）上传源音频文件
+> **注意**：上传文件后，相应的 `_path` 参数将被自动忽略，系统将使用上传后的临时文件路径。
+### 2.3 响应示例
+```json
+{
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
+  "status": "queued",
+  "queue_position": 1
+}
+```
+### 2.4 使用示例（cURL）
+**基本 JSON 方法**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "欢快的流行歌曲",
+    "lyrics": "你好世界",
+    "inference_steps": 8
+  }'
+```
+**使用 thinking=true（LM 生成代码 + 填充缺失元数据）**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "欢快的流行歌曲",
+    "lyrics": "你好世界",
+    "thinking": true,
+    "lm_temperature": 0.85,
+    "lm_cfg_scale": 2.5
+  }'
+```
+**描述驱动生成（sample_query）**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "sample_query": "一首适合安静夜晚的柔和孟加拉情歌",
+    "thinking": true
+  }'
+```
+**使用格式增强（use_format=true）**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "流行摇滚",
+    "lyrics": "[Verse 1]\n走在街上...",
+    "use_format": true,
+    "thinking": true
+  }'
+```
+**选择特定模型**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "电子舞曲",
+    "model": "acestep-v15-turbo-rl",
+    "thinking": true
+  }'
+```
+**使用自定义时间步**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "爵士钢琴三重奏",
+    "timesteps": "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0",
+    "thinking": true
+  }'
+```
+**使用 thinking=false（仅 DiT，但填充缺失元数据）**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "caption": "缓慢的情感民谣",
+    "lyrics": "...",
+    "thinking": false,
+    "bpm": 72
+  }'
+```
+**文件上传方法**：
+```bash
+curl -X POST http://localhost:8001/v1/music/generate \
+  -F "caption=重新混音这首歌" \
+  -F "src_audio=@/path/to/local/song.mp3" \
+  -F "task_type=repaint"
+```
+---
+## 3. 查询任务结果
+### 3.1 API 定义
+- **URL**：`/v1/jobs/{job_id}`
+- **方法**：`GET`
+### 3.2 响应参数
+响应包含基本任务信息、队列状态和最终结果。
+**主要字段**：
+- `status`：当前状态
+- `queue_position`：当前队列位置（0 表示正在运行或已完成）
+- `eta_seconds`：预计剩余等待时间（秒）
+- `avg_job_seconds`：平均任务持续时间（用于 ETA 估算）
+- `result`：成功时的结果对象
+  - `audio_paths`：生成的音频文件 URL 列表（配合 `/v1/audio` 端点使用）
+  - `first_audio_path`：第一个音频路径（URL）
+  - `second_audio_path`：第二个音频路径（URL，如果 batch_size >= 2）
+  - `generation_info`：生成参数详情
+  - `status_message`：简短结果描述
+  - `seed_value`：使用的种子值，逗号分隔
+  - `metas`：完整元数据字典
+  - `bpm`：检测到/使用的 BPM
+  - `duration`：检测到/使用的时长
+  - `keyscale`：检测到/使用的调性
+  - `timesignature`：检测到/使用的拍号
+  - `genres`：检测到的风格（如果可用）
+  - `lm_model`：使用的 LM 模型名称
+  - `dit_model`：使用的 DiT 模型名称
+- `error`：失败时的错误信息
+### 3.3 响应示例
+**排队中**：
+```json
+{
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
+  "status": "queued",
+  "created_at": 1700000000.0,
+  "queue_position": 5,
+  "eta_seconds": 25.0,
+  "avg_job_seconds": 5.0,
+  "result": null,
+  "error": null
+}
+```
+**执行成功**：
+```json
+{
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
+  "status": "succeeded",
+  "created_at": 1700000000.0,
+  "started_at": 1700000001.0,
+  "finished_at": 1700000010.0,
+  "queue_position": 0,
+  "result": {
+    "first_audio_path": "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fabc123.mp3",
+    "second_audio_path": "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fdef456.mp3",
+    "audio_paths": [
+      "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fabc123.mp3",
+      "/v1/audio?path=%2Ftmp%2Fapi_audio%2Fdef456.mp3"
+    ],
+    "generation_info": "🎵 生成了 2 个音频\n⏱️ 总计：8.5s\n🎲 种子：12345,67890",
+    "status_message": "✅ 生成成功完成！",
+    "seed_value": "12345,67890",
+    "metas": {
+      "bpm": 120,
+      "duration": 30,
+      "keyscale": "C Major",
+      "timesignature": "4",
+      "caption": "欢快的流行歌曲，旋律动听"
+    },
+    "bpm": 120,
+    "duration": 30,
+    "keyscale": "C Major",
+    "timesignature": "4",
+    "genres": null,
+    "lm_model": "acestep-5Hz-lm-0.6B-v3",
+    "dit_model": "acestep-v15-turbo-rl"
+  },
+  "error": null
+}
+```
+---
+## 4. 随机样本生成
+### 4.1 API 定义
+- **URL**：`/v1/music/random`
+- **方法**：`POST`
+此端点创建一个样本模式任务，通过 5Hz LM 自动生成 caption、lyrics 和元数据。
+### 4.2 请求参数
+| 参数名 | 类型 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- |
+| `thinking` | bool | `true` | 是否同时通过 LM 生成音频代码 |
+### 4.3 响应示例
+```json
+{
+  "job_id": "550e8400-e29b-41d4-a716-446655440000",
+  "status": "queued",
+  "queue_position": 1
+}
+```
+### 4.4 使用示例
+```bash
+curl -X POST http://localhost:8001/v1/music/random \
+  -H 'Content-Type: application/json' \
+  -d '{"thinking": true}'
+```
+---
+## 5. 列出可用模型
+### 5.1 API 定义
+- **URL**：`/v1/models`
+- **方法**：`GET`
+返回服务器上加载的可用 DiT 模型列表。
+### 5.2 响应示例
+```json
+{
+  "models": [
+    {
+      "name": "acestep-v15-turbo-rl",
+      "is_default": true
+    },
+    {
+      "name": "acestep-v15-turbo",
+      "is_default": false
+    }
+  ],
+  "default_model": "acestep-v15-turbo-rl"
+}
+```
+### 5.3 使用示例
+```bash
+curl http://localhost:8001/v1/models
+```
+---
+## 6. 下载音频文件
+### 6.1 API 定义
+- **URL**：`/v1/audio`
+- **方法**：`GET`
+通过路径下载生成的音频文件。
+### 6.2 请求参数
+| 参数名 | 类型 | 说明 |
+| :--- | :--- | :--- |
+| `path` | string | URL 编码的音频文件路径 |
+### 6.3 使用示例
+```bash
+# 使用任务结果中的 URL 下载
+curl "http://localhost:8001/v1/audio?path=%2Ftmp%2Fapi_audio%2Fabc123.mp3" -o output.mp3
+```
+---
+## 7. 健康检查
+### 7.1 API 定义
+- **URL**：`/health`
+- **方法**：`GET`
+返回服务健康状态。
+### 7.2 响应示例
+```json
+{
+  "status": "ok",
+  "service": "ACE-Step API",
+  "version": "1.0"
+}
+```
+---
+## 8. 环境变量
+API 服务器可以通过环境变量进行配置：
+| 变量 | 默认值 | 说明 |
+| :--- | :--- | :--- |
+| `ACESTEP_API_HOST` | `127.0.0.1` | 服务器绑定主机 |
+| `ACESTEP_API_PORT` | `8001` | 服务器绑定端口 |
+| `ACESTEP_CONFIG_PATH` | `acestep-v15-turbo-rl` | 主 DiT 模型路径 |
+| `ACESTEP_CONFIG_PATH2` | （空）| 辅助 DiT 模型路径（可选）|
+| `ACESTEP_CONFIG_PATH3` | （空）| 第三个 DiT 模型路径（可选）|
+| `ACESTEP_DEVICE` | `auto` | 模型加载设备 |
+| `ACESTEP_USE_FLASH_ATTENTION` | `true` | 启用 flash attention |
+| `ACESTEP_OFFLOAD_TO_CPU` | `false` | 空闲时将模型卸载到 CPU |
+| `ACESTEP_OFFLOAD_DIT_TO_CPU` | `false` | 专门将 DiT 卸载到 CPU |
+| `ACESTEP_LM_MODEL_PATH` | `acestep-5Hz-lm-0.6B-v3` | 默认 5Hz LM 模型 |
+| `ACESTEP_LM_BACKEND` | `vllm` | LM 后端（vllm 或 pt）|
+| `ACESTEP_LM_DEVICE` | （与 ACESTEP_DEVICE 相同）| LM 设备 |
+| `ACESTEP_LM_OFFLOAD_TO_CPU` | `false` | 将 LM 卸载到 CPU |
+| `ACESTEP_QUEUE_MAXSIZE` | `200` | 最大队列大小 |
+| `ACESTEP_QUEUE_WORKERS` | `1` | 队列工作者数量 |
+| `ACESTEP_AVG_JOB_SECONDS` | `5.0` | 初始平均任务持续时间估算 |
+| `ACESTEP_TMPDIR` | `.cache/acestep/tmp` | 临时文件目录 |
+---
+## 错误处理
+**HTTP 状态码**：
+- `200`：成功
+- `400`：无效请求（错误的 JSON、缺少字段）
+- `404`：找不到任务
+- `415`：不支持的 Content-Type
+- `429`：服务器繁忙（队列已满）
+- `500`：内部服务器错误
+**错误响应格式**：
+```json
+{
+  "detail": "描述问题的错误消息"
+}
+```
+---
+## 最佳实践
+1. **使用 `thinking=true`** 以获得 LM 增强生成的最佳质量结果。
+2. **使用 `sample_query`/`description`** 从自然语言描述快速生成。
+3. **使用 `use_format=true`** 当你有 caption/lyrics 但希望 LM 增强它们时。
+4. **轮询任务状态** 时使用合理的间隔（例如每 1-2 秒），以避免服务器过载。
+5. **检查响应中的 `avg_job_seconds`** 来估算等待时间。
+6. **使用多模型支持** 通过设置 `ACESTEP_CONFIG_PATH2` 和 `ACESTEP_CONFIG_PATH3` 环境变量，然后通过 `model` 参数选择。
+7. **生产环境** 中，始终设置正确的 Content-Type 头以避免 415 错误。

docs/zh/GRADIO_GUIDE.md ADDED Viewed

	@@ -0,0 +1,551 @@

+# ACE-Step Gradio 演示用户指南
+**Language / 语言 / 言語:** [English](../en/GRADIO_GUIDE.md) | [中文](GRADIO_GUIDE.md) | [日本語](../ja/GRADIO_GUIDE.md)
+---
+本指南提供使用 ACE-Step Gradio Web 界面进行音乐生成的综合文档，包括所有功能和设置。
+## 目录
+- [快速开始](#快速开始)
+- [服务配置](#服务配置)
+- [生成模式](#生成模式)
+- [任务类型](#任务类型)
+- [输入参数](#输入参数)
+- [高级设置](#高级设置)
+- [结果区域](#结果区域)
+- [LoRA 训练](#lora-训练)
+- [技巧与最佳实践](#技巧与最佳实践)
+---
+## 快速开始
+### 启动演示
+```bash
+# 基本启动
+python app.py
+# 预初始化
+python app.py --config acestep-v15-turbo-rl --init-llm
+# 指定端口
+python app.py --port 7860
+```
+### 界面概述
+Gradio 界面包含以下主要部分：
+1. **服务配置** - 模型加载和初始化
+2. **必需输入** - 任务类型、音频上传和生成模式
+3. **音乐描述和歌词** - 生成的文本输入
+4. **可选参数** - BPM、调性、时长等元数据
+5. **高级设置** - 细粒度的生成控制
+6. **结果** - 生成的音频播放和管理
+---
+## 服务配置
+### 模型选择
+| 设置 | 说明 |
+|---------|-------------|
+| **检查点文件** | 选择已训练的模型检查点（如果可用）|
+| **主模型路径** | 选择 DiT 模型配置（例如 `acestep-v15-turbo`、`acestep-v15-turbo-rl`）|
+| **设备** | 处理设备：`auto`（推荐）、`cuda` 或 `cpu` |
+### 5Hz LM 配置
+| 设置 | 说明 |
+|---------|-------------|
+| **5Hz LM 模型路径** | 选择语言模型（例如 `acestep-5Hz-lm-0.6B`、`acestep-5Hz-lm-0.6B-v3`）|
+| **5Hz LM 后端** | `vllm`（更快，推荐）或 `pt`（PyTorch，兼容性更好）|
+| **初始化 5Hz LM** | 勾选以在初始化期间加载 LM（thinking 模式必需）|
+### 性能选项
+| 设置 | 说明 |
+|---------|-------------|
+| **使用 Flash Attention** | 启用以加速推理（需要 flash_attn 包）|
+| **卸载到 CPU** | 空闲时将模型卸载到 CPU 以节省 GPU 内存 |
+| **将 DiT 卸载到 CPU** | 专门将 DiT 模型卸载到 CPU |
+### LoRA 适配器
+| 设置 | 说明 |
+|---------|-------------|
+| **LoRA 路径** | 已训练的 LoRA 适配器目录路径 |
+| **加载 LoRA** | 加载指定的 LoRA 适配器 |
+| **卸载** | 移除当前加载的 LoRA |
+| **使用 LoRA** | 启用/禁用已加载的 LoRA 进行推理 |
+### 初始化
+点击 **初始化服务** 加载模型。状态框将显示进度和确认信息。
+---
+## 生成模式
+### 简单模式
+简单模式专为快速、基于自然语言的音乐生成设计。
+**使用方法：**
+1. 在生成模式单选按钮中选择"简单"
+2. 在"歌曲描述"字段中输入自然语言描述
+3. 如果不想要人声，可选择勾选"纯音乐"
+4. 可选择首选人声语言
+5. 点击 **创建样本** 生成 caption、歌词和元数据
+6. 在展开的部分中查看生成的内容
+7. 点击 **生成音乐** 创建音频
+**示例描述：**
+- "一首适合安静夜晚的柔和孟加拉情歌"
+- "欢快的电子舞曲，重低音"
+- "忧郁的独立民谣，原声吉他"
+- "在烟雾弥漫的酒吧里演奏的爵士三重奏"
+**随机样本：** 点击 🎲 按钮加载随机示例描述。
+### 自定义模式
+自定义模式提供对所有生成参数的完全控制。
+**使用方法：**
+1. 在生成模式单选按钮中选择"自定义"
+2. 手动填写 Caption 和歌词字段
+3. 设置可选元数据（BPM、调性、时长等）
+4. 可选点击 **格式化** 使用 LM 增强您的输入
+5. 根据需要配置高级设置
+6. 点击 **生成音乐** 创建音频
+---
+## 任务类型
+### text2music（默认）
+从文本描述和/或歌词生成音乐。
+**用例：** 基于提示从头创建新音乐。
+**必需输入：** Caption 或歌词（至少一个）
+### cover
+转换现有音频，保持结构但改变风格。
+**用例：** 创建不同风格的翻唱版本。
+**必需输入：**
+- 源音频（在音频上传区域上传）
+- 描述目标风格的 Caption
+**关键参数：** `音频翻唱强度`（0.0-1.0）
+- 较高的值保持更多原始结构
+- 较低的值允许更多创意自由
+### repaint
+重新生成音频的特定时间段。
+**用例：** 修复或修改生成音乐的特定部分。
+**必需输入：**
+- 源音频
+- 重绘开始（秒）
+- 重绘结束（秒，-1 表示文件末尾）
+- 描述期望内容的 Caption
+### lego（仅 Base 模型）
+在现有音频的上下文中生成特定乐器轨道。
+**用例：** 为伴奏添加乐器层。
+**必需输入：**
+- 源音频
+- 轨道名称（从下拉菜单选择）
+- 描述轨道特征的 Caption
+**可用轨道：** vocals、backing_vocals、drums、bass、guitar、keyboard、percussion、strings、synth、fx、brass、woodwinds
+### extract（仅 Base 模型）
+从混音音频中提取/分离特定乐器轨道。
+**用例：** 音轨分离、分离乐器。
+**必需输入：**
+- 源音频
+- 要提取的轨道名称
+### complete（仅 Base 模型）
+用指定的乐器完成部分轨道。
+**用例：** 自动编排不完整的作品。
+**必需输入：**
+- 源音频
+- 轨道名称（多选）
+- 描述期望风格的 Caption
+---
+## 输入参数
+### 必需输入
+#### 任务类型
+从下拉菜单选择生成任务。指令字段会根据选择的任务自动更新。
+#### 音频上传
+| 字段 | 说明 |
+|-------|-------------|
+| **参考音频** | 用于风格参考的可选音频 |
+| **源音频** | cover、repaint、lego、extract、complete 任务必需 |
+| **转换为代码** | 从源音频提取 5Hz 语义代码 |
+#### LM 代码提示
+可以在此粘贴预计算的音频语义代码来引导生成。使用 **转录** 按钮分析代码并提取元数据。
+### 音乐描述
+期望音乐的文本描述。请具体说明：
+- 风格和类型
+- 乐器
+- 情绪和氛围
+- 节奏感（如果不指定 BPM）
+**示例：** "欢快的流行摇滚，电吉他、有力的鼓点和朗朗上口的合成器钩子"
+点击 🎲 加载随机示例 caption。
+### 歌词
+输入带结构标签的歌词：
+```
+[Verse 1]
+今天走在街上
+想着你曾说过的话
+[Chorus]
+我在前进，我很坚强
+这就是我属于的地方
+[Verse 2]
+...
+```
+**纯音乐复选框：** 勾选此项以生成纯音乐，无论歌词内容如何。
+**人声语言：** 选择人声语言。对于自动检测或纯音乐，使用"unknown"。
+**格式化按钮：** 点击使用 5Hz LM 增强 caption 和歌词。
+### 可选参数
+| 参数 | 默认值 | 说明 |
+|-----------|---------|-------------|
+| **BPM** | 自动 | 每分钟节拍数（30-300）|
+| **调性** | 自动 | 音乐调性（例如"C Major"、"Am"、"F# minor"）|
+| **拍号** | 自动 | 拍号：2（2/4）、3（3/4）、4（4/4）、6（6/8）|
+| **音频时长** | 自动/-1 | 目标长度（秒）（10-600）。-1 为自动 |
+| **批量大小** | 2 | 要生成的音频变体数量（1-8）|
+---
+## 高级设置
+### DiT 参数
+| 参数 | 默认值 | 说明 |
+|-----------|---------|-------------|
+| **推理步数** | 8 | 去噪步数。Turbo：1-20，Base：1-200 |
+| **引导比例** | 7.0 | CFG 强度（仅 base 模型）。越高 = 越遵循提示 |
+| **种子** | -1 | 随机种子。批量使用逗号分隔的值 |
+| **随机种子** | ✓ | 勾选时生成随机种子 |
+| **音频格式** | mp3 | 输出格式：mp3、flac |
+| **偏移** | 3.0 | 时间步偏移因子（1.0-5.0）。turbo 推荐 3.0 |
+| **推理方法** | ode | ode（Euler，更快）或 sde（随机）|
+| **自定义时间步** | - | 覆盖时间步（例如"0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0"）|
+### 仅 Base 模型参数
+| 参数 | 默认值 | 说明 |
+|-----------|---------|-------------|
+| **使用 ADG** | ✗ | 启用自适应双引导以获得更好的质量 |
+| **CFG 区间开始** | 0.0 | 何时开始应用 CFG（0.0-1.0）|
+| **CFG 区间结束** | 1.0 | 何时停止应用 CFG（0.0-1.0）|
+### LM 参数
+| 参数 | 默认值 | 说明 |
+|-----------|---------|-------------|
+| **LM 温度** | 0.85 | 采样温度（0.0-2.0）。越高 = 越有创意 |
+| **LM CFG 比例** | 2.0 | LM 引导强度（1.0-3.0）|
+| **LM Top-K** | 0 | Top-K 采样。0 禁用 |
+| **LM Top-P** | 0.9 | 核采样（0.0-1.0）|
+| **LM 负面提示** | "NO USER INPUT" | CFG 的负面提示 |
+### CoT（思维链）选项
+| 选项 | 默认值 | 说明 |
+|--------|---------|-------------|
+| **CoT Metas** | ✓ | 通过 LM 推理生成元数据 |
+| **CoT Language** | ✓ | 通过 LM 检测人声语言 |
+| **约束解码调试** | ✗ | 启用调试日志 |
+### 生成选项
+| 选项 | 默认值 | 说明 |
+|--------|---------|-------------|
+| **LM 代码强度** | 1.0 | LM 代码对生成的影响程度（0.0-1.0）|
+| **自动评分** | ✗ | 自动计算质量分数 |
+| **自动 LRC** | ✗ | 自动生成歌词时间戳 |
+| **LM 批处理块大小** | 8 | 每个 LM 批次的最大项目数（GPU 内存）|
+### 主要生成控制
+| 控制 | 说明 |
+|---------|-------------|
+| **Think** | 启用 5Hz LM 进行代码生成和元数据 |
+| **ParallelThinking** | 启用并行 LM 批处理 |
+| **CaptionRewrite** | 让 LM 增强输入 caption |
+| **AutoGen** | 完成后自动开始下一批次 |
+---
+## 结果区域
+### 生成的音频
+根据批量大小最多显示 8 个音频样本。每个样本包括：
+- **音频播放器** - 播放、暂停和下载生成的音频
+- **发送到源** - 将此音频发送到源音频输入以进行进一步处理
+- **保存** - 将音频和元数据保存到 JSON 文件
+- **评分** - 计算基于困惑度的质量分数
+- **LRC** - 生成歌词时间戳（LRC 格式）
+### 详情折叠面板
+点击"评分 & LRC & LM 代码"展开并查看：
+- **LM 代码** - 此样本的 5Hz 语义代码
+- **质量分数** - 基于困惑度的质量指标
+- **歌词时间戳** - LRC 格式的时间数据
+### 批次导航
+| 控制 | 说明 |
+|---------|-------------|
+| **◀ 上一批** | 查看上一批 |
+| **批次指示器** | 显示当前批次位置（例如"批次 1 / 3"）|
+| **下一批状态** | 显示后台生成进度 |
+| **下一批 ▶** | 查看下一批（如果 AutoGen 开启则触发生成）|
+### 恢复参数
+点击 **应用这些设置到 UI** 将当前批次的所有生成参数恢复到输入字段。适用于迭代优化好的结果。
+### 批次结果
+"批次结果和生成详情"折叠面板包含：
+- **所有生成的文件** - 下载所有批次的所有文件
+- **生成详情** - 关于生成过程的详细信息
+---
+## LoRA 训练
+LoRA 训练选项卡提供创建自定义 LoRA 适配器的工具。
+### 数据集构建器选项卡
+#### 步骤 1：加载或扫描
+**选项 A：加载现有数据集**
+1. 输入之前保存的数据集 JSON 路径
+2. 点击 **加载**
+**选项 B：扫描新目录**
+1. 输入音频文件夹路径
+2. 点击 **扫描** 查找音频文件（wav、mp3、flac、ogg、opus）
+#### 步骤 2：配置数据集
+| 设置 | 说明 |
+|---------|-------------|
+| **数据集名称** | 您的数据集名称 |
+| **全部纯音乐** | 如果所有曲目都没有人声，请勾选 |
+| **自定义激活标签** | 激活此 LoRA 风格的唯一标签 |
+| **标签位置** | 放置标签的位置：前置、追加或替换 caption |
+#### 步骤 3：自动标注
+点击 **自动标注全部** 为所有音频文件生成元数据：
+- Caption（音乐描述）
+- BPM
+- 调性
+- 拍号
+**跳过 Metas** 选项将跳过 LLM 标注并使用 N/A 值。
+#### 步骤 4：预览和编辑
+使用滑块选择样本并手动编辑：
+- Caption
+- 歌词
+- BPM、调性、拍号
+- 语言
+- 纯音乐标志
+点击 **保存更改** 更新样本。
+#### 步骤 5：保存数据集
+输入保存路径并点击 **保存数据集** 导出为 JSON。
+#### 步骤 6：预处理
+将数据集转换为预计算张量以加快训练：
+1. 可选加载现有数据集 JSON
+2. 设置张量输出目录
+3. 点击 **预处理**
+这会将音频编码为 VAE 潜变量，将文本编码为嵌入，并运行条件编码器。
+### 训练 LoRA 选项卡
+#### 数据集选择
+输入预处理张量目录路径并点击 **加载数据集**。
+#### LoRA 设置
+| 设置 | 默认值 | 说明 |
+|---------|---------|-------------|
+| **LoRA 秩 (r)** | 64 | LoRA 容量。越高 = 容量越大，内存越多 |
+| **LoRA Alpha** | 128 | 缩放因子（通常是秩的 2 倍）|
+| **LoRA Dropout** | 0.1 | 用于正则化的 dropout 率 |
+#### 训练参数
+| 设置 | 默认值 | 说明 |
+|---------|---------|-------------|
+| **学习率** | 1e-4 | 优化学习率 |
+| **最大 Epochs** | 500 | 最大训练 epochs |
+| **批量大小** | 1 | 训练批量大小 |
+| **梯度累积** | 1 | 有效批次 = batch_size × accumulation |
+| **每 N Epochs 保存** | 200 | 检查点保存频率 |
+| **偏移** | 3.0 | turbo 模型的时间步偏移 |
+| **种子** | 42 | 用于可重复性的随机种子 |
+#### 训练控制
+- **开始训练** - 开始训练过程
+- **停止训练** - 中断训练
+- **训练进度** - 显示当前 epoch 和损失
+- **训练日志** - 详细训练输出
+- **训练损失图** - 可视化损失曲线
+#### 导出 LoRA
+训练后，导出最终适配器：
+1. 输入导出路径
+2. 点击 **导出 LoRA**
+---
+## 技巧与最佳实践
+### 获得最佳质量
+1. **使用 thinking 模式** - 保持"Think"复选框启用以获得 LM 增强的生成
+2. **具体描述 caption** - 包含流派、乐器、情绪和风格细节
+3. **让 LM 检测元数据** - 将 BPM/调性/时长留空以自动检测
+4. **使用批量生成** - 生成 2-4 个变体并选择最好的
+### 加快生成速度
+1. **使用 turbo 模型** - 选择 `acestep-v15-turbo` 或 `acestep-v15-turbo-rl`
+2. **保持推理步数为 8** - 这是 turbo 的最佳默认值
+3. **减少批量大小** - 如果需要快速结果，降低批量大小
+4. **禁用 AutoGen** - 手动控制批次生成
+### 获得一致结果
+1. **设置特定种子** - 取消勾选"随机种子"并输入种子值
+2. **保存好的结果** - 使用"保存"导出参数以便重现
+3. **使用"应用这些设置"** - 从好的批次恢复参数
+### 长时长音乐
+1. **设置明确的时长** - 以秒为单位指定时长
+2. **使用 repaint 任务** - 初始生成后修复有问题的部分
+3. **链式生成** - 使用"发送到源"在之前的结果上构建
+### 风格一致性
+1. **训练 LoRA** - 为您的风格创建自定义适配器
+2. **使用参考音频** - 在音频上传中上传风格参考
+3. **使用一致的 caption** - 保持相似的描述性语言
+### 故障排除
+**没有生成音频：**
+- 检查模型是否已初始化（绿色状态消息）
+- 如果使用 thinking 模式，确保 5Hz LM 已初始化
+- 检查状态输出中的错误消息
+**结果质量差：**
+- 增加推理步数（对于 base 模型）
+- 调整引导比例
+- 尝试不同的种子
+- 使 caption 更具体
+**内存不足：**
+- 减少批量大小
+- 启用 CPU 卸载
+- 减少 LM 批处理块大小
+**LM 不工作：**
+- 确保初始化期间勾选了"初始化 5Hz LM"
+- 检查是否选择了有效的 LM 模型路径
+- 验证 vllm 或 PyTorch 后端可用
+---
+## 键盘快捷键
+Gradio 界面支持标准 Web 快捷键：
+- **Tab** - 在输入字段之间移动
+- **Enter** - 提交文本输入
+- **Space** - 切换复选框
+---
+## 语言支持
+界面支持多种 UI 语言：
+- **英文** (en)
+- **中文** (zh)
+- **日文** (ja)
+在服务配置区域选择您的首选语言。
+---
+更多信息，请参阅：
+- 主 README：[`../../README.md`](../../README.md)
+- REST API 文档：[`API.md`](API.md)
+- Python 推理 API：[`INFERENCE.md`](INFERENCE.md)

docs/zh/INFERENCE.md ADDED Viewed

	@@ -0,0 +1,1049 @@

+# ACE-Step 推理 API 文档
+**Language / 语言 / 言語:** [English](../en/INFERENCE.md) | [中文](INFERENCE.md) | [日本語](../ja/INFERENCE.md)
+---
+本文档提供 ACE-Step 推理 API 的综合文档，包括所有支持任务类型的参数规范。
+## 目录
+- [快速开始](#快速开始)
+- [API 概述](#api-概述)
+- [GenerationParams 参数](#generationparams-参数)
+- [GenerationConfig 参数](#generationconfig-参数)
+- [任务类型](#任务类型)
+- [辅助函数](#辅助函数)
+- [完整示例](#完整示例)
+- [最佳实践](#最佳实践)
+---
+## 快速开始
+### 基本用法
+```python
+from acestep.handler import AceStepHandler
+from acestep.llm_inference import LLMHandler
+from acestep.inference import GenerationParams, GenerationConfig, generate_music
+# 初始化处理器
+dit_handler = AceStepHandler()
+llm_handler = LLMHandler()
+# 初始化服务
+dit_handler.initialize_service(
+    project_root="/path/to/project",
+    config_path="acestep-v15-turbo-rl",
+    device="cuda"
+)
+llm_handler.initialize(
+    checkpoint_dir="/path/to/checkpoints",
+    lm_model_path="acestep-5Hz-lm-0.6B-v3",
+    backend="vllm",
+    device="cuda"
+)
+# 配置生成参数
+params = GenerationParams(
+    caption="欢快的电子舞曲，重低音",
+    bpm=128,
+    duration=30,
+)
+# 配置生成设置
+config = GenerationConfig(
+    batch_size=2,
+    audio_format="flac",
+)
+# 生成音乐
+result = generate_music(dit_handler, llm_handler, params, config, save_dir="/path/to/output")
+# 访问结果
+if result.success:
+    for audio in result.audios:
+        print(f"已生成：{audio['path']}")
+        print(f"Key：{audio['key']}")
+        print(f"Seed：{audio['params']['seed']}")
+else:
+    print(f"错误：{result.error}")
+```
+---
+## API 概述
+### 主要函数
+#### generate_music
+```python
+def generate_music(
+    dit_handler,
+    llm_handler,
+    params: GenerationParams,
+    config: GenerationConfig,
+    save_dir: Optional[str] = None,
+    progress=None,
+) -> GenerationResult
+```
+使用 ACE-Step 模型生成音乐的主函数。
+#### understand_music
+```python
+def understand_music(
+    llm_handler,
+    audio_codes: str,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> UnderstandResult
+```
+分析音频语义代码并提取元数据（caption、lyrics、BPM、调性等）。
+#### create_sample
+```python
+def create_sample(
+    llm_handler,
+    query: str,
+    instrumental: bool = False,
+    vocal_language: Optional[str] = None,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> CreateSampleResult
+```
+从自然语言描述生成完整的音乐样本（caption、lyrics、元数据）。
+#### format_sample
+```python
+def format_sample(
+    llm_handler,
+    caption: str,
+    lyrics: str,
+    user_metadata: Optional[Dict[str, Any]] = None,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> FormatSampleResult
+```
+格式化和增强用户提供的 caption 和 lyrics，生成结构化元数据。
+### 配置对象
+API 使用两个配置数据类：
+**GenerationParams** - 包含所有音乐生成参数：
+```python
+@dataclass
+class GenerationParams:
+    # 任务和指令
+    task_type: str = "text2music"
+    instruction: str = "Fill the audio semantic mask based on the given conditions:"
+    # 音频上传
+    reference_audio: Optional[str] = None
+    src_audio: Optional[str] = None
+    # LM 代码提示
+    audio_codes: str = ""
+    # 文本输入
+    caption: str = ""
+    lyrics: str = ""
+    instrumental: bool = False
+    # 元数据
+    vocal_language: str = "unknown"
+    bpm: Optional[int] = None
+    keyscale: str = ""
+    timesignature: str = ""
+    duration: float = -1.0
+    # 高级设置
+    inference_steps: int = 8
+    seed: int = -1
+    guidance_scale: float = 7.0
+    use_adg: bool = False
+    cfg_interval_start: float = 0.0
+    cfg_interval_end: float = 1.0
+    shift: float = 1.0                    # 新增：时间步偏移因子
+    infer_method: str = "ode"             # 新增：扩散推理方法
+    timesteps: Optional[List[float]] = None  # 新增：自定义时间步
+    repainting_start: float = 0.0
+    repainting_end: float = -1
+    audio_cover_strength: float = 1.0
+    # 5Hz 语言模型参数
+    thinking: bool = True
+    lm_temperature: float = 0.85
+    lm_cfg_scale: float = 2.0
+    lm_top_k: int = 0
+    lm_top_p: float = 0.9
+    lm_negative_prompt: str = "NO USER INPUT"
+    use_cot_metas: bool = True
+    use_cot_caption: bool = True
+    use_cot_lyrics: bool = False
+    use_cot_language: bool = True
+    use_constrained_decoding: bool = True
+    # CoT 生成的值（由 LM 自动填充）
+    cot_bpm: Optional[int] = None
+    cot_keyscale: str = ""
+    cot_timesignature: str = ""
+    cot_duration: Optional[float] = None
+    cot_vocal_language: str = "unknown"
+    cot_caption: str = ""
+    cot_lyrics: str = ""
+```
+**GenerationConfig** - 包含批处理和输出配置：
+```python
+@dataclass
+class GenerationConfig:
+    batch_size: int = 2
+    allow_lm_batch: bool = False
+    use_random_seed: bool = True
+    seeds: Optional[List[int]] = None
+    lm_batch_chunk_size: int = 8
+    constrained_decoding_debug: bool = False
+    audio_format: str = "flac"
+```
+### 结果对象
+**GenerationResult** - 音乐生成结果：
+```python
+@dataclass
+class GenerationResult:
+    # 音频输出
+    audios: List[Dict[str, Any]]  # 音频字典列表
+    # 生成信息
+    status_message: str           # 生成状态消息
+    extra_outputs: Dict[str, Any] # 额外输出（latents、masks、lm_metadata、time_costs）
+    # 成功状态
+    success: bool                 # 生成是否成功
+    error: Optional[str]          # 失败时的错误消息
+```
+**音频字典结构：**
+`audios` 列表中的每个项目包含：
+```python
+{
+    "path": str,           # 保存的音频文件路径
+    "tensor": Tensor,      # 音频张量 [channels, samples]，CPU，float32
+    "key": str,            # 唯一音频键（基于参数的 UUID）
+    "sample_rate": int,    # 采样率（默认：48000）
+    "params": Dict,        # 此音频的生成参数（包括 seed、audio_codes 等）
+}
+```
+**UnderstandResult** - 音乐理解结果：
+```python
+@dataclass
+class UnderstandResult:
+    # 元数据字段
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    # 状态
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+```
+**CreateSampleResult** - 样本创建结果：
+```python
+@dataclass
+class CreateSampleResult:
+    # 元数据字段
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    instrumental: bool = False
+    # 状态
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+```
+**FormatSampleResult** - 样本格式化结果：
+```python
+@dataclass
+class FormatSampleResult:
+    # 元数据字段
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    # 状态
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+```
+---
+## GenerationParams 参数
+### 文本输入
+| 参数 | 类型 | 默认值 | 说明 |
+|-----------|------|---------|-------------|
+| `caption` | `str` | `""` | 期望音乐的文本描述。可以是简单提示如"放松的钢琴音乐"，或包含风格、情绪、乐器等的详细描述。最多 512 字符。|
+| `lyrics` | `str` | `""` | 人声音乐的歌词文本。纯音乐使用 `"[Instrumental]"`。支持多种语言。最多 4096 字符。|
+| `instrumental` | `bool` | `False` | 如果为 True，无论歌词如何都生成纯音乐。|
+### 音乐元数据
+| 参数 | 类型 | 默认值 | 说明 |
+|-----------|------|---------|-------------|
+| `bpm` | `Optional[int]` | `None` | 每分钟节拍数（30-300）。`None` 启用通过 LM 自动检测。|
+| `keyscale` | `str` | `""` | 音乐调性（例如"C Major"、"Am"、"F# minor"）。空字符串启用自动检测。|
+| `timesignature` | `str` | `""` | 拍号（2 表示 '2/4'，3 表示 '3/4'，4 表示 '4/4'，6 表示 '6/8'）。空字符串启用自动检测。|
+| `vocal_language` | `str` | `"unknown"` | 人声语言代码（ISO 639-1）。支持：`"en"`、`"zh"`、`"ja"`、`"es"`、`"fr"` 等。使用 `"unknown"` 自动检测。|
+| `duration` | `float` | `-1.0` | 目标音频长度（秒）（10-600）。如果 <= 0 或 None，模型根据歌词长度自动选择。|
+### 生成参数
+| 参数 | 类型 | 默认值 | 说明 |
+|-----------|------|---------|-------------|
+| `inference_steps` | `int` | `8` | 去噪步数。Turbo 模型：1-20（推荐 8）。Base 模型：1-200（推荐 32-64）。越高 = 质量越好但更慢。|
+| `guidance_scale` | `float` | `7.0` | 无分类器引导比例（1.0-15.0）。较高的值增加对文本提示的遵循度。仅支持非 turbo 模型。典型范围：5.0-9.0。|
+| `seed` | `int` | `-1` | 用于可重复性的随机种子。使用 `-1` 表示随机种子，或任何正整数表示固定种子。|
+### 高级 DiT 参数
+| 参数 | 类型 | 默认值 | 说明 |
+|-----------|------|---------|-------------|
+| `use_adg` | `bool` | `False` | 使用自适应双引导（仅 base 模型）。以速度为代价提高质量。|
+| `cfg_interval_start` | `float` | `0.0` | CFG 应用起始比例（0.0-1.0）。控制何时开始应用无分类器引导。|
+| `cfg_interval_end` | `float` | `1.0` | CFG 应用结束比例（0.0-1.0）。控制何时停止应用无分类器引导。|
+| `shift` | `float` | `1.0` | 时间步偏移因子（范围 1.0-5.0，默认 1.0）。当 != 1.0 时，对时间步应用 `t = shift * t / (1 + (shift - 1) * t)`。turbo 模型推荐 3.0。|
+| `infer_method` | `str` | `"ode"` | 扩散推理方法。`"ode"`（Euler）速度更快且结果确定。`"sde"`（随机）会在采样过程中引入随机性，结果可能带有一定差异。|
+| `timesteps` | `Optional[List[float]]` | `None` | 自定义时间步，从 1.0 到 0.0 的浮点数列表（例如 `[0.97, 0.76, 0.615, 0.5, 0.395, 0.28, 0.18, 0.085, 0]`）。如果提供，覆盖 `inference_steps` 和 `shift`。|
+### 任务特定参数
+| 参数 | 类型 | 默认值 | 说明 |
+|-----------|------|---------|-------------|
+| `task_type` | `str` | `"text2music"` | 生成任务类型。详见[任务类型](#任务类型)部分。|
+| `instruction` | `str` | `"Fill the audio semantic mask based on the given conditions:"` | 任务特定指令提示。|
+| `reference_audio` | `Optional[str]` | `None` | 用于风格迁移或续写任务的参考音频文件路径。|
+| `src_audio` | `Optional[str]` | `None` | 用于音频到音频任务（cover、repaint 等）的源音频文件路径。|
+| `audio_codes` | `str` | `""` | 预提取的 5Hz 音频语义代码字符串。仅供高级使用。|
+| `repainting_start` | `float` | `0.0` | 重绘开始时间（秒）（用于 repaint/lego 任务）。|
+| `repainting_end` | `float` | `-1` | 重绘结束时间（秒）。使用 `-1` 表示音频末尾。|
+| `audio_cover_strength` | `float` | `1.0` | 音频 cover/代码影响强度（0.0-1.0）。风格迁移任务设置较小值（0.2）。|
+### 5Hz 语言模型参数
+| 参数 | 类型 | 默认值 | 说明 |
+|-----------|------|---------|-------------|
+| `thinking` | `bool` | `True` | 启用 5Hz 语言模型"思维链"推理用于语义/音乐元数据和代码。|
+| `lm_temperature` | `float` | `0.85` | LM 采样温度（0.0-2.0）。越高 = 更有创意/多样，越低 = 更保守。|
+| `lm_cfg_scale` | `float` | `2.0` | LM 无分类器引导比例。越高 = 更强的提示遵循度。|
+| `lm_top_k` | `int` | `0` | LM top-k 采样。`0` 禁用 top-k 过滤。典型值：40-100。|
+| `lm_top_p` | `float` | `0.9` | LM 核采样（0.0-1.0）。`1.0` 禁用核采样。典型值：0.9-0.95。|
+| `lm_negative_prompt` | `str` | `"NO USER INPUT"` | LM 引导的负面提示。帮助避免不想要的特征。|
+| `use_cot_metas` | `bool` | `True` | 使用 LM CoT 推理生成元数据（BPM、调性、时长等）。|
+| `use_cot_caption` | `bool` | `True` | 使用 LM CoT 推理优化用户 caption。|
+| `use_cot_language` | `bool` | `True` | 使用 LM CoT 推理检测人声语言。|
+| `use_cot_lyrics` | `bool` | `False` | （保留供将来使用）使用 LM CoT 生成/优化歌词。|
+| `use_constrained_decoding` | `bool` | `True` | 启用结构化 LM 输出的约束解码。|
+### CoT 生成的值
+这些字段在启用 CoT 推理时由 LM 自动填充：
+| 参数 | 类型 | 默认值 | 说明 |
+|-----------|------|---------|-------------|
+| `cot_bpm` | `Optional[int]` | `None` | LM 生成的 BPM 值。|
+| `cot_keyscale` | `str` | `""` | LM 生成的调性。|
+| `cot_timesignature` | `str` | `""` | LM 生成的拍号。|
+| `cot_duration` | `Optional[float]` | `None` | LM 生成的时长。|
+| `cot_vocal_language` | `str` | `"unknown"` | LM 检测的人声语言。|
+| `cot_caption` | `str` | `""` | LM 优化的 caption。|
+| `cot_lyrics` | `str` | `""` | LM 生成/优化的歌词。|
+---
+## GenerationConfig 参数
+| 参数 | 类型 | 默认值 | 说明 |
+|-----------|------|---------|-------------|
+| `batch_size` | `int` | `2` | 并行生成的样本数量（1-8）。较高的值需要更多 GPU 内存。|
+| `allow_lm_batch` | `bool` | `False` | 允许 LM 批处理。当 `batch_size >= 2` 且 `thinking=True` 时更快。|
+| `use_random_seed` | `bool` | `True` | 是否使用随机种子。`True` 每次不同结果，`False` 可重复结果。|
+| `seeds` | `Optional[List[int]]` | `None` | 批量生成的种子列表。如果提供的种子少于 batch_size，将用随机种子填充。也可以是单个 int。|
+| `lm_batch_chunk_size` | `int` | `8` | 每个 LM 推理块的最大批处理大小（GPU 内存限制）。|
+| `constrained_decoding_debug` | `bool` | `False` | 启用约束解码的调试日志。|
+| `audio_format` | `str` | `"flac"` | 输出音频格式。选项：`"mp3"`、`"wav"`、`"flac"`。默认 FLAC 以快速保存。|
+---
+## 任务类型
+ACE-Step 支持 6 种不同的生成任务类型，每种都针对特定用例进行了优化。
+### 1. Text2Music（默认）
+**目的**：从文本描述和可选元数据生成音乐。
+**关键参数**：
+```python
+params = GenerationParams(
+    task_type="text2music",
+    caption="充满活力的摇滚音乐，电吉他",
+    lyrics="[Instrumental]",  # 或实际歌词
+    bpm=140,
+    duration=30,
+)
+```
+**必需**：
+- `caption` 或 `lyrics`（至少一个）
+**可选但推荐**：
+- `bpm`：控制节奏
+- `keyscale`：控制音乐调性
+- `timesignature`：控制节拍结构
+- `duration`：控制长度
+- `vocal_language`：控制人声特征
+**用例**：
+- 从文本描述生成音乐
+- 从提示创建伴奏
+- 生成带歌词的歌曲
+---
+### 2. Cover
+**目的**：转换现有音频，保持结构但改变风格/音色。
+**关键参数**：
+```python
+params = GenerationParams(
+    task_type="cover",
+    src_audio="original_song.mp3",
+    caption="爵士钢琴版本",
+    audio_cover_strength=0.8,  # 0.0-1.0
+)
+```
+**必需**：
+- `src_audio`：源音频文件路径
+- `caption`：期望风格/转换的描述
+**可选**：
+- `audio_cover_strength`：控制原始音频的影响
+  - `1.0`：强烈保持原始结构
+  - `0.5`：平衡转换
+  - `0.1`：宽松解读
+- `lyrics`：新歌词（如果要更改人声）
+**用例**：
+- 创建不同风格的翻唱
+- 在保持旋律的同时更改乐器
+- 风格转换
+---
+### 3. Repaint
+**目的**：重新生成音频的特定时间段，保持其余部分不变。
+**关键参数**：
+```python
+params = GenerationParams(
+    task_type="repaint",
+    src_audio="original.mp3",
+    repainting_start=10.0,  # 秒
+    repainting_end=20.0,    # 秒
+    caption="带钢琴独奏的平滑过渡",
+)
+```
+**必需**：
+- `src_audio`：源音频文件路径
+- `repainting_start`：开始时间（秒）
+- `repainting_end`：结束时间（秒）（使用 `-1` 表示文件末尾）
+- `caption`：重绘部分期望内容的描述
+**用例**：
+- 修复生成音乐的特定部分
+- 为歌曲的某些部分添加变化
+- 创建平滑过渡
+- 替换有问题的片段
+---
+### 4. Lego（仅 Base 模型）
+**目的**：在现有音频的上下文中生成特定乐器轨道。
+**关键参数**：
+```python
+params = GenerationParams(
+    task_type="lego",
+    src_audio="backing_track.mp3",
+    instruction="Generate the guitar track based on the audio context:",
+    caption="带有蓝调感觉的主音吉他旋律",
+    repainting_start=0.0,
+    repainting_end=-1,
+)
+```
+**必需**：
+- `src_audio`：源/伴奏音频路径
+- `instruction`：必须指定轨道类型（例如"Generate the {TRACK_NAME} track..."）
+- `caption`：期望轨道特征的描述
+**可用轨道**：
+- `"vocals"`、`"backing_vocals"`、`"drums"`、`"bass"`、`"guitar"`、`"keyboard"`
+- `"percussion"`、`"strings"`、`"synth"`、`"fx"`、`"brass"`、`"woodwinds"`
+**用例**：
+- 添加特定乐器轨道
+- 在伴奏轨道上叠加额外乐器
+- 迭代创建多轨作品
+---
+### 5. Extract（仅 Base 模型）
+**目的**：从混音音频中提取/分离特定乐器轨道。
+**关键参数**：
+```python
+params = GenerationParams(
+    task_type="extract",
+    src_audio="full_mix.mp3",
+    instruction="Extract the vocals track from the audio:",
+)
+```
+**必需**：
+- `src_audio`：混音音频文件路径
+- `instruction`：必须指定要提取的轨道
+**可用轨道**：与 Lego 任务相同
+**用例**：
+- 音轨分离
+- 分离特定乐器
+- 创建混音
+- 分析单独轨道
+---
+### 6. Complete（仅 Base 模型）
+**目的**：用指定的乐器完成/扩展部分轨道。
+**关键参数**：
+```python
+params = GenerationParams(
+    task_type="complete",
+    src_audio="incomplete_track.mp3",
+    instruction="Complete the input track with drums, bass, guitar:",
+    caption="摇滚风格完成",
+)
+```
+**必需**：
+- `src_audio`：不完整/部分轨道的路径
+- `instruction`：必须指定要添加的轨道
+- `caption`：期望风格的描述
+**用例**：
+- 编排不完整的作品
+- 添加伴奏轨道
+- 自动完成音乐想法
+---
+## 辅助函数
+### understand_music
+分析音频代码以提取音乐元数据。
+```python
+from acestep.inference import understand_music
+result = understand_music(
+    llm_handler=llm_handler,
+    audio_codes="<|audio_code_123|><|audio_code_456|>...",
+    temperature=0.85,
+    use_constrained_decoding=True,
+)
+if result.success:
+    print(f"Caption：{result.caption}")
+    print(f"歌词：{result.lyrics}")
+    print(f"BPM：{result.bpm}")
+    print(f"调性：{result.keyscale}")
+    print(f"时长：{result.duration}s")
+    print(f"语言：{result.language}")
+else:
+    print(f"错误：{result.error}")
+```
+**用例**：
+- 分析现有音乐
+- 从音频代码提取元数据
+- 逆向工程生成参数
+---
+### create_sample
+从自然语言描述生成完整的音乐样本。这是"简单模式"/"灵感模式"功能。
+```python
+from acestep.inference import create_sample
+result = create_sample(
+    llm_handler=llm_handler,
+    query="一首适合安静夜晚的柔和孟加拉情歌",
+    instrumental=False,
+    vocal_language="bn",  # 可选：限制为孟加拉语
+    temperature=0.85,
+)
+if result.success:
+    print(f"Caption：{result.caption}")
+    print(f"歌词：{result.lyrics}")
+    print(f"BPM：{result.bpm}")
+    print(f"时长：{result.duration}s")
+    print(f"调性：{result.keyscale}")
+    print(f"是否纯音乐：{result.instrumental}")
+    # 与 generate_music 一起使用
+    params = GenerationParams(
+        caption=result.caption,
+        lyrics=result.lyrics,
+        bpm=result.bpm,
+        duration=result.duration,
+        keyscale=result.keyscale,
+        vocal_language=result.language,
+    )
+else:
+    print(f"错误：{result.error}")
+```
+**参数**：
+| 参数 | 类型 | 默认值 | 说明 |
+|-----------|------|---------|-------------|
+| `query` | `str` | 必需 | 期望音乐的自然语言描述 |
+| `instrumental` | `bool` | `False` | 是否生成纯音乐 |
+| `vocal_language` | `Optional[str]` | `None` | 将歌词限制为特定语言（例如"en"、"zh"、"bn"）|
+| `temperature` | `float` | `0.85` | 采样温度 |
+| `top_k` | `Optional[int]` | `None` | Top-k 采样（None 禁用）|
+| `top_p` | `Optional[float]` | `None` | Top-p 采样（None 禁用）|
+| `repetition_penalty` | `float` | `1.0` | 重复惩罚 |
+| `use_constrained_decoding` | `bool` | `True` | 使用基于 FSM 的约束解码 |
+---
+### format_sample
+格式化和增强用户提供的 caption 和 lyrics，生成结构化元数据。
+```python
+from acestep.inference import format_sample
+result = format_sample(
+    llm_handler=llm_handler,
+    caption="拉丁流行，雷鬼顿",
+    lyrics="[Verse 1]\nBailando en la noche...",
+    user_metadata={"bpm": 95},  # 可选：约束特定值
+    temperature=0.85,
+)
+if result.success:
+    print(f"增强后的 Caption：{result.caption}")
+    print(f"格式化后的歌词：{result.lyrics}")
+    print(f"BPM：{result.bpm}")
+    print(f"时长：{result.duration}s")
+    print(f"调性：{result.keyscale}")
+    print(f"检测到的语言：{result.language}")
+else:
+    print(f"错误：{result.error}")
+```
+**参数**：
+| 参数 | 类型 | 默认值 | 说明 |
+|-----------|------|---------|-------------|
+| `caption` | `str` | 必需 | 用户的 caption/描述 |
+| `lyrics` | `str` | 必需 | 用户的带结构标签的歌词 |
+| `user_metadata` | `Optional[Dict]` | `None` | 约束特定元数据值（bpm、duration、keyscale、timesignature、language）|
+| `temperature` | `float` | `0.85` | 采样温度 |
+| `top_k` | `Optional[int]` | `None` | Top-k 采样（None 禁用）|
+| `top_p` | `Optional[float]` | `None` | Top-p 采样（None 禁用）|
+| `repetition_penalty` | `float` | `1.0` | 重复惩罚 |
+| `use_constrained_decoding` | `bool` | `True` | 使用基于 FSM 的约束解码 |
+---
+## 完整示例
+### 示例 1：简单文本到音乐生成
+```python
+from acestep.inference import GenerationParams, GenerationConfig, generate_music
+params = GenerationParams(
+    task_type="text2music",
+    caption="宁静的氛围音乐，柔和的钢琴和弦乐",
+    duration=60,
+    bpm=80,
+    keyscale="C Major",
+)
+config = GenerationConfig(
+    batch_size=2,  # 生成 2 个变体
+    audio_format="flac",
+)
+result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+if result.success:
+    for i, audio in enumerate(result.audios, 1):
+        print(f"变体 {i}：{audio['path']}")
+```
+### 示例 2：带歌词的歌曲生成
+```python
+params = GenerationParams(
+    task_type="text2music",
+    caption="流行民谣，情感人声",
+    lyrics="""Verse 1:
+今天走在街上
+想着你曾说过的话
+一切都变得不同了
+但我会找到自己的路
+Chorus:
+我在前进，我很坚强
+这就是我属于的地方
+""",
+    vocal_language="zh",
+    bpm=72,
+    duration=45,
+)
+config = GenerationConfig(batch_size=1)
+result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+```
+### 示例 3：使用自定义时间步
+```python
+params = GenerationParams(
+    task_type="text2music",
+    caption="复杂和声的爵士融合",
+    # 自定义 9 步调度
+    timesteps=[0.97, 0.76, 0.615, 0.5, 0.395, 0.28, 0.18, 0.085, 0],
+    thinking=True,
+)
+config = GenerationConfig(batch_size=1)
+result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+```
+### 示例 4：使用 Shift 参数（Turbo 模型）
+```python
+params = GenerationParams(
+    task_type="text2music",
+    caption="欢快的电子舞曲",
+    inference_steps=8,
+    shift=3.0,  # Turbo 模型推荐
+    infer_method="ode",
+)
+config = GenerationConfig(batch_size=2)
+result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+```
+### 示例 5：使用 create_sample 的简单模式
+```python
+from acestep.inference import create_sample, GenerationParams, GenerationConfig, generate_music
+# 步骤 1：从描述创建样本
+sample = create_sample(
+    llm_handler=llm_handler,
+    query="充满活力的韩国流行舞曲，带有朗朗上口的 Hook",
+    vocal_language="ko",
+)
+if sample.success:
+    # 步骤 2：使用样本生成音乐
+    params = GenerationParams(
+        caption=sample.caption,
+        lyrics=sample.lyrics,
+        bpm=sample.bpm,
+        duration=sample.duration,
+        keyscale=sample.keyscale,
+        vocal_language=sample.language,
+        thinking=True,
+    )
+    config = GenerationConfig(batch_size=2)
+    result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+```
+### 示例 6：格式化和增强用户输入
+```python
+from acestep.inference import format_sample, GenerationParams, GenerationConfig, generate_music
+# 步骤 1：格式化用户输入
+formatted = format_sample(
+    llm_handler=llm_handler,
+    caption="摇滚民谣",
+    lyrics="[Verse]\n在黑暗中我找到了自己的路...",
+)
+if formatted.success:
+    # 步骤 2：使用增强后的输入生成
+    params = GenerationParams(
+        caption=formatted.caption,
+        lyrics=formatted.lyrics,
+        bpm=formatted.bpm,
+        duration=formatted.duration,
+        keyscale=formatted.keyscale,
+        thinking=True,
+        use_cot_metas=False,  # 已格式化，跳过元数据 CoT
+    )
+    config = GenerationConfig(batch_size=2)
+    result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+```
+---
+## 最佳实践
+### 1. Caption 写作
+**好的 Caption**：
+```python
+# 具体且描述性强
+caption="欢快的电子舞曲，重低音和合成器主旋律"
+# 包含情绪和风格
+caption="忧郁的独立民谣，原声吉他和柔和的人声"
+# 指定乐器
+caption="爵士三重奏，钢琴、立式贝斯和刷子鼓"
+```
+**避免**：
+```python
+# 太模糊
+caption="好音乐"
+# 矛盾
+caption="快慢音乐"  # 节奏冲突
+```
+### 2. 参数调优
+**最佳质量**：
+- 使用 base 模型，`inference_steps=64` 或更高
+- 启用 `use_adg=True`
+- 设置 `guidance_scale=7.0-9.0`
+- 设置 `shift=3.0` 以获得更好的时间步分布
+- 使用无损音频格式（`audio_format="wav"`）
+**追求速度**：
+- 使用 turbo 模型，`inference_steps=8`
+- 禁用 ADG（`use_adg=False`）
+- 使用 `infer_method="ode"`（默认）
+- 使用压缩格式（`audio_format="mp3"`）或默认 FLAC
+**一致性**：
+- 在 config 中设置 `use_random_seed=False`
+- 使用固定的 `seeds` 列表或在 params 中使用单个 `seed`
+- 保持较低的 `lm_temperature`（0.7-0.85）
+**多样性**：
+- 在 config 中设置 `use_random_seed=True`
+- 增加 `lm_temperature`（0.9-1.1）
+- 使用 `batch_size > 1` 获得变体
+### 3. 时长指南
+- **纯音乐**：30-180 秒效果良好
+- **带歌词**：推荐自动检测（设置 `duration=-1` 或保持默认）
+- **短片段**：最少 10-20 秒
+- **长篇作品**：最多 600 秒（10 分钟）
+### 4. LM 使用
+**何时启用 LM（`thinking=True`）**：
+- 需要自动元数据检测
+- 想要 caption 优化
+- 从最少输入生成
+- 需要多样化输出
+**何时禁用 LM（`thinking=False`）**：
+- 已有精确的元数据
+- 需要更快的生成
+- 想要完全控制参数
+### 5. 批处理
+```python
+# 高效批量生成
+config = GenerationConfig(
+    batch_size=8,           # 支持的最大值
+    allow_lm_batch=True,    # 启用以提速（当 thinking=True 时）
+    lm_batch_chunk_size=4,  # 根据 GPU 内存调整
+)
+```
+### 6. 错误处理
+```python
+result = generate_music(dit_handler, llm_handler, params, config, save_dir="/output")
+if not result.success:
+    print(f"生成失败：{result.error}")
+    print(f"状态：{result.status_message}")
+else:
+    # 处理成功结果
+    for audio in result.audios:
+        path = audio['path']
+        key = audio['key']
+        seed = audio['params']['seed']
+        # ... 处理音频文件
+```
+### 7. 内存管理
+对于较大的批量或较长的时长：
+- 监控 GPU 内存使用
+- 如果出现 OOM 错误，减少 `batch_size`
+- 针对 LM 操作，减小 `lm_batch_chunk_size`
+- 考虑在初始化期间使用 `offload_to_cpu=True`
+---
+## 故障排除
+### 常见问题
+**问题**：内存不足错误
+- **解决方案**：减少 `batch_size`、`inference_steps`，或启用 CPU 卸载
+**问题**：结果质量差
+- **解决方案**：增加 `inference_steps`，调整 `guidance_scale`，使用 base 模型
+**问题**：结果与提示不匹配
+- **解决方案**：使 caption 更具体，增加 `guidance_scale`，启用 LM 优化（`thinking=True`）
+**问题**：生成缓慢
+- **解决方案**：使用 turbo 模型，减少 `inference_steps`，禁用 ADG
+**问题**：LM 不生成代码
+- **解决方案**：验证 `llm_handler` 已初始化，检查 `thinking=True` 和 `use_cot_metas=True`
+**问题**：种子设置未生效
+- **解决方案**：在 config 中设置 `use_random_seed=False` 并提供 `seeds` 列表或在 params 中提供 `seed`
+**问题**：自定义时间步不工作
+- **解决方案**：确保时间步是从 1.0 到 0.0 的浮点数列表，并按降序排列
+---
+## 版本历史
+- **v1.5.2**：当前版本
+  - 添加了 `shift` 参数用于时间步偏移
+  - 添加了 `infer_method` 参数用于 ODE/SDE 选择
+  - 添加了 `timesteps` 参数用于自定义时间步调度
+  - 添加了 `understand_music()` 函数用于音频分析
+  - 添加了 `create_sample()` 函数用于简单模式生成
+  - 添加了 `format_sample()` 函数用于输入增强
+  - 添加了 `UnderstandResult`、`CreateSampleResult`、`FormatSampleResult` 数据类
+- **v1.5.1**：上一版本
+  - 将 `GenerationConfig` 拆分为 `GenerationParams` 和 `GenerationConfig`
+  - 重命名参数以保持一致性（`key_scale` → `keyscale`、`time_signature` → `timesignature`、`audio_duration` → `duration`、`use_llm_thinking` → `thinking`、`audio_code_string` → `audio_codes`）
+  - 添加了 `instrumental` 参数
+  - 添加了 `use_constrained_decoding` 参数
+  - 添加了 CoT 自动填充字段（`cot_*`）
+  - 将默认 `audio_format` 更改为 "flac"
+  - 将默认 `batch_size` 更改为 2
+  - 将默认 `thinking` 更改为 True
+  - 简化了 `GenerationResult` 结构，统一 `audios` 列表
+  - 在 `extra_outputs` 中添加了统一的 `time_costs`
+- **v1.5**：初始版本
+  - 引入了 `GenerationConfig` 和 `GenerationResult` 数据类
+  - 简化了参数传递
+  - 添加了综合文档
+---
+更多信息，请参阅：
+- 主 README：[`../../README.md`](../../README.md)
+- REST API 文档：[`API.md`](API.md)
+- Gradio 演示指南：[`GRADIO_GUIDE.md`](GRADIO_GUIDE.md)
+- 项目仓库：[ACE-Step-1.5](https://github.com/ace-step/ACE-Step-1.5)