Commit dc30520

Merge pull request #444 from Huanshere/refactor

Refactor

2 parents b4fed7c + 7aac71e · commit dc30520
90 files changed: +1559 −1611 lines

Note: large commits hide some content by default, so a few file names below are not shown.

.cursorrules

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+2. Use
+# ------------
+# comment
+# ------------
+for large block comments
+3. Avoid complex comments inside functions, and do not add type annotations to function parameters
+4. Use English for comments and print statements
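For reference, the block-comment convention rule 2 describes looks like this in Python (example mine, not from the repo):

# ------------
# load settings, then start the pipeline
# ------------
print("starting")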

Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

 # Install dependencies
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install -e .

 # Set CUDA-related environment variables
 ENV CUDA_HOME=/usr/local/cuda
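The switch from installing `requirements.txt` to `pip install -e .` installs the repo itself as an editable package (this assumes the commit also adds a `setup.py` or `pyproject.toml`, which is not visible in this excerpt). That is what lets the batch/ and core/ files further down drop their `sys.path.append` hacks in favor of plain package imports, e.g.:

# With the project installed via `pip install -e .`, entry points can
# import project packages directly -- no path manipulation required.
# (Illustrative sketch; mirrors the import style the refactor moves to.)
from core.utils.config_utils import load_key  # was: sys.path.append(...) first

print(load_key("api.model"))  # reads config.yaml, e.g. 'gpt-4.1-2025-04-14'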

README.md

Lines changed: 5 additions & 5 deletions

@@ -77,7 +77,7 @@ https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c

 ## Installation

-You don't have to read the whole docs, [**here**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh) is an online AI agent to help you.
+Meet any problem? Chat with our free online AI agent [**here**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh) to help you.

 > **Note:** For Windows users with NVIDIA GPU, follow these steps before installation:
 > 1. Install [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe)

@@ -121,8 +121,8 @@ docker run -d -p 8501:8501 --gpus all videolingo

 ## APIs
 VideoLingo supports OpenAI-Like API format and various TTS interfaces:
-- LLM: `claude-3-5-sonnet-20240620`, `deepseek-chat(v3)`, `gemini-2.0-flash-exp`, `gpt-4o`, ... (sorted by performance)
-- WhisperX: Run whisperX locally or use 302.ai API
+- LLM: `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ... (sorted by performance, be cautious with gemini-2.5-flash...)
+- WhisperX: Run whisperX (large-v3) locally or use 302.ai API
 - TTS: `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(You can modify your own TTS in custom_tts.py!)

 > **Note:** VideoLingo works with **[302.ai](https://gpt302.saaslink.net/C2oHR9)** - one API key for all services (LLM, WhisperX, TTS). Or run locally with Ollama and Edge-TTS for free, no API needed!
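Since the README names an OpenAI-Like API format, here is a minimal sketch of that call pattern (my illustration, not project code; it assumes the `openai` Python client, with placeholder credentials and the model from config.yaml below):

from openai import OpenAI

# Placeholder key; base_url/model mirror the api: block in config.yaml.
# Whether the endpoint needs a trailing /v1 depends on the provider --
# treat that as an assumption.
client = OpenAI(api_key="your-api-key", base_url="https://yunwu.ai/v1")
resp = client.chat.completions.create(
    model="gpt-4.1-2025-04-14",
    messages=[{"role": "user", "content": "Reply with one word: ok"}],
)
print(resp.choices[0].message.content)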
@@ -133,13 +133,13 @@ For detailed installation, API configuration, and batch mode instructions, pleas

 1. WhisperX transcription performance may be affected by video background noise, as it uses wav2vac model for alignment. For videos with loud background music, please enable Voice Separation Enhancement. Additionally, subtitles ending with numbers or special characters may be truncated early due to wav2vac's inability to map numeric characters (e.g., "1") to their spoken form ("one").

-2. Using weaker models can lead to errors during intermediate processes due to strict JSON format requirements for responses. If this error occurs, please delete the `output` folder and retry with a different LLM, otherwise repeated execution will read the previous erroneous response causing the same error.
+2. Using weaker models can lead to errors during processes due to strict JSON format requirements for responses (tried my best to prompt llm😊). If this error occurs, please delete the `output` folder and retry with a different LLM, otherwise repeated execution will read the previous erroneous response causing the same error.

 3. The dubbing feature may not be 100% perfect due to differences in speech rates and intonation between languages, as well as the impact of the translation step. However, this project has implemented extensive engineering processing for speech rates to ensure the best possible dubbing results.

 4. **Multilingual video transcription recognition will only retain the main language**. This is because whisperX uses a specialized model for a single language when forcibly aligning word-level subtitles, and will delete unrecognized languages.

-5. **Cannot dub multiple characters separately**, as whisperX's speaker distinction capability is not sufficiently reliable.
+5. **For now, cannot dub multiple characters separately**, as whisperX's speaker distinction capability is not sufficiently reliable.
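Known issue 2 exists because intermediate LLM responses are cached under `output/`: a rerun reads the cached (possibly malformed) reply instead of calling the LLM again. A hedged sketch of that failure mode, with hypothetical paths and a stub in place of the real API call, not VideoLingo's actual code:

import json, os

CACHE = "output/llm_response.json"  # hypothetical cache location

def call_llm(prompt: str) -> str:
    return '{"translation": "..."}'  # stand-in for the real API call

def cached_llm_call(prompt: str) -> dict:
    if os.path.exists(CACHE):
        with open(CACHE, encoding="utf-8") as f:
            return json.load(f)  # a bad cached reply fails here on every rerun
    raw = call_llm(prompt)
    os.makedirs("output", exist_ok=True)
    with open(CACHE, "w", encoding="utf-8") as f:
        f.write(raw)
    return json.loads(raw)

Deleting `output/` clears the cache, which is why the README's recovery step works.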

 ## 📄 License

batch/utils/batch_processor.py

Lines changed: 2 additions & 3 deletions

@@ -1,9 +1,8 @@
-import os, sys
+import os
 import gc
-sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
 from batch.utils.settings_check import check_settings
 from batch.utils.video_processor import process_video
-from core.config_utils import load_key, update_key
+from core.utils.config_utils import load_key, update_key
 import pandas as pd
 from rich.console import Console
 from rich.panel import Panel

batch/utils/settings_check.py

Lines changed: 1 addition & 2 deletions

@@ -1,5 +1,4 @@
-import os, sys
-sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
+import os
 import pandas as pd
 from rich.console import Console
 from rich.panel import Panel

batch/utils/video_processor.py

Lines changed: 21 additions & 21 deletions

@@ -1,12 +1,12 @@
-import os, sys
-sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
-from st_components.imports_and_utils import *
-from core.onekeycleanup import cleanup
-from core.config_utils import load_key
+import os
+from core.st_utils.imports_and_utils import *
+from core.utils.onekeycleanup import cleanup
+from core.utils import load_key
 import shutil
 from functools import partial
 from rich.panel import Panel
 from rich.console import Console
+from core import *

 console = Console()

@@ -22,20 +22,20 @@ def process_video(file, dubbing=False, is_retry=False):

     text_steps = [
         ("🎥 Processing input file", partial(process_input_file, file)),
-        ("🎙️ Transcribing with Whisper", partial(step2_whisperX.transcribe)),
+        ("🎙️ Transcribing with Whisper", partial(_2_asr.transcribe)),
         ("✂️ Splitting sentences", split_sentences),
         ("📝 Summarizing and translating", summarize_and_translate),
         ("⚡ Processing and aligning subtitles", process_and_align_subtitles),
-        ("🎬 Merging subtitles to video", step7_merge_sub_to_vid.merge_subtitles_to_video),
+        ("🎬 Merging subtitles to video", _7_sub_into_vid.merge_subtitles_to_video),
     ]

     if dubbing:
         dubbing_steps = [
             ("🔊 Generating audio tasks", gen_audio_tasks),
-            ("🎵 Extracting reference audio", step9_extract_refer_audio.extract_refer_audio_main),
-            ("🗣️ Generating audio", step10_gen_audio.gen_audio),
-            ("🔄 Merging full audio", step11_merge_full_audio.merge_full_audio),
-            ("🎞️ Merging dubbing to video", step12_merge_dub_to_vid.merge_video_audio),
+            ("🎵 Extracting reference audio", _9_refer_audio.extract_refer_audio_main),
+            ("🗣️ Generating audio", _10_gen_audio.gen_audio),
+            ("🔄 Merging full audio", _11_merge_audio.merge_full_audio),
+            ("🎞️ Merging dubbing to video", _12_dub_to_vid.merge_video_audio),
         ]
         text_steps.extend(dubbing_steps)

@@ -78,8 +78,8 @@ def prepare_output_folder(output_folder):

 def process_input_file(file):
     if file.startswith('http'):
-        step1_ytdlp.download_video_ytdlp(file, resolution=load_key(YTB_RESOLUTION_KEY), cutoff_time=None)
-        video_file = step1_ytdlp.find_video_files()
+        _1_ytdlp.download_video_ytdlp(file, resolution=load_key(YTB_RESOLUTION_KEY))
+        video_file = _1_ytdlp.find_video_files()
     else:
         input_file = os.path.join('batch', 'input', file)
         output_file = os.path.join(OUTPUT_DIR, file)

@@ -88,17 +88,17 @@ def process_input_file(file):
     return {'video_file': video_file}

 def split_sentences():
-    step3_1_spacy_split.split_by_spacy()
-    step3_2_splitbymeaning.split_sentences_by_meaning()
+    _3_1_split_nlp.split_by_spacy()
+    _3_2_split_meaning.split_sentences_by_meaning()

 def summarize_and_translate():
-    step4_1_summarize.get_summary()
-    step4_2_translate_all.translate_all()
+    _4_1_summarize.get_summary()
+    _4_2_translate.translate_all()

 def process_and_align_subtitles():
-    step5_splitforsub.split_for_sub_main()
-    step6_generate_final_timeline.align_timestamp_main()
+    _5_split_sub.split_for_sub_main()
+    _6_gen_sub.align_timestamp_main()

 def gen_audio_tasks():
-    step8_1_gen_audio_task.gen_audio_task_main()
-    step8_2_gen_dub_chunks.gen_dub_chunks()
+    _8_1_audio_task.gen_audio_task_main()
+    _8_2_dub_chunks.gen_dub_chunks()
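The tables above pair a display label with a zero-argument callable (hence `partial(...)` for functions that need arguments). A minimal sketch of the runner loop such a table implies — the actual loop is outside this hunk, so this is an assumption:

from functools import partial

def run_steps(steps):
    # Each entry is (label, zero-arg callable); a real runner could also
    # catch and report errors per step.
    for label, step in steps:
        print(f"{label} ...")
        step()

demo_steps = [
    ("🎥 Processing input file", partial(print, "processing demo.mp4")),
    ("✂️ Splitting sentences", lambda: print("splitting")),
]
run_steps(demo_steps)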

config.yaml

Lines changed: 21 additions & 18 deletions

@@ -1,14 +1,22 @@
 # * Settings marked with * are advanced settings that won't appear in the Streamlit page and can only be modified manually in config.py
 # recommend to set in streamlit page
-version: "2.2.3"
+# -------------------
+# version: "3.0.0"
+# author: "Huanshere"
+# -------------------
+
 ## ======================== Basic Settings ======================== ##
+
 display_language: "zh-CN"

 # API settings
 api:
-  key: 'your_api_key'
-  base_url: 'https://api.302.ai'
-  model: 'gemini-2.0-flash'
+  key: 'your-api-key'
+  base_url: 'https://yunwu.ai'
+  model: 'gpt-4.1-2025-04-14'
+  llm_support_json: false
+  # *Number of LLM multi-threaded accesses, set to 1 if using local LLM
+  max_workers: 4

 # Language settings, written into the prompt, can be described in natural language
 target_language: '简体中文'

@@ -17,22 +25,25 @@ target_language: '简体中文'
 demucs: true

 whisper:
-  # ["medium", "large-v3", "large-v3-turbo"]. Note: for zh model will force to use Belle/large-v3
+  # ["large-v3", "large-v3-turbo"]. Note: for zh model will force to use Belle/large-v3
   model: 'large-v3'
-  # Whisper specified recognition language [en, zh, ...]
+  # Whisper specified recognition language ISO 639-1
   language: 'en'
   detected_language: 'en'
   # Whisper running mode ["local", "cloud", "elevenlabs"]. Specifies where to run, cloud uses 302.ai API
   runtime: 'local'
   # 302.ai API key
   whisperX_302_api_key: 'your_302_api_key'
-  # ElevenLabs API key
+  # ElevenLabs API key (experimental)
   elevenlabs_api_key: 'your_elevenlabs_api_key'

 # Whether to burn subtitles into the video
 burn_subtitles: true

 ## ======================== Advanced Settings ======================== ##
+# *🔬 h264_nvenc GPU acceleration for ffmpeg, make sure your GPU supports it
+ffmpeg_gpu: false
+
 # *Youtube settings
 youtube:
   cookies_path: ''

@@ -49,8 +60,6 @@ subtitle:
 # *Summary length, set low to 2k if using local LLM
 summary_length: 8000

-# *Number of LLM multi-threaded accesses, set to 1 if using local LLM
-max_workers: 4
 # *Maximum number of words for the first rough cut, below 18 will cut too finely affecting translation, above 22 is too long and will make subsequent subtitle splitting difficult to align
 max_split_length: 20

@@ -62,7 +71,7 @@ pause_before_translate: false

 ## ======================== Dubbing Settings ======================== ##
 # TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts]
-tts_method: 'f5tts'
+tts_method: 'azure_tts'

 # SiliconFlow FishTTS
 sf_fish_tts:

@@ -125,7 +134,8 @@ tolerance: 1.5 # Allowed extension time to the next subtitle



-## ======================== Additional settings 请勿修改 (do not modify) ======================== ##
+## ======================== Additional settings ======================== ##
+
 # Whisper model directory
 model_dir: './_model_cache'

@@ -145,13 +155,6 @@ allowed_audio_formats:
 - 'flac'
 - 'm4a'

-# LLMs that support returning JSON format
-llm_support_json:
-- 'gpt-4o'
-- 'gpt-4o-mini'
-- 'gemini-2.0-flash'
-- 'deepseek-chat'
-
 # Spacy models
 spacy_model_map:
   en: 'en_core_web_md'
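Note that `llm_support_json` moves from a list of model names (removed at the bottom) to a single boolean under `api:`. A hedged sketch of how the flag could be consumed — the real call site is not in this diff:

from core.utils.config_utils import load_key  # project helper used elsewhere in this commit

# If the configured model supports JSON mode, request it in OpenAI style.
# response_format is the OpenAI-compatible kwarg; whether a given
# provider honors it is an assumption.
extra = {}
if load_key("api.llm_support_json"):
    extra["response_format"] = {"type": "json_object"}
# client.chat.completions.create(model=load_key("api.model"), messages=..., **extra)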
(file name hidden in the commit view — likely core/_10_gen_audio.py, since it defines the gen_audio entry point referenced above)

Lines changed: 10 additions & 16 deletions

@@ -1,30 +1,24 @@
 import os
-import sys
 import time
 import shutil
 import subprocess
 from typing import Tuple

 import pandas as pd
 from pydub import AudioSegment
-from rich import print as rprint
 from rich.console import Console
 from rich.progress import Progress
 from concurrent.futures import ThreadPoolExecutor, as_completed

-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from core.config_utils import load_key
-from core.all_whisper_methods.audio_preprocess import get_audio_duration
-from core.all_tts_functions.tts_main import tts_main
+from core.utils import *
+from core.utils.models import *
+from core.asr_backend.audio_preprocess import get_audio_duration
+from core.tts_backend.tts_main import tts_main

 console = Console()

-TEMP_DIR = 'output/audio/tmp'
-SEGS_DIR = 'output/audio/segs'
-TASKS_FILE = "output/audio/tts_tasks.xlsx"
-OUTPUT_FILE = "output/audio/tts_tasks.xlsx"
-TEMP_FILE_TEMPLATE = f"{TEMP_DIR}/{{}}_temp.wav"
-OUTPUT_FILE_TEMPLATE = f"{SEGS_DIR}/{{}}.wav"
+TEMP_FILE_TEMPLATE = f"{_AUDIO_TMP_DIR}/{{}}_temp.wav"
+OUTPUT_FILE_TEMPLATE = f"{_AUDIO_SEGS_DIR}/{{}}.wav"
 WARMUP_SIZE = 5

 def parse_df_srt_time(time_str: str) -> float:

@@ -217,11 +211,11 @@ def gen_audio() -> None:
     rprint("[bold magenta]🚀 Starting audio generation process...[/bold magenta]")

     # 🎯 Step1: Create necessary directories
-    os.makedirs(TEMP_DIR, exist_ok=True)
-    os.makedirs(SEGS_DIR, exist_ok=True)
+    os.makedirs(_AUDIO_TMP_DIR, exist_ok=True)
+    os.makedirs(_AUDIO_SEGS_DIR, exist_ok=True)

     # 📝 Step2: Load task file
-    tasks_df = pd.read_excel(TASKS_FILE)
+    tasks_df = pd.read_excel(_8_1_AUDIO_TASK)
     rprint("[green]📊 Loaded task file successfully[/green]")

     # 🔊 Step3: Generate TTS audio

@@ -231,7 +225,7 @@ def gen_audio() -> None:
     tasks_df = merge_chunks(tasks_df)

     # 💾 Step5: Save results
-    tasks_df.to_excel(OUTPUT_FILE, index=False)
+    tasks_df.to_excel(_8_1_AUDIO_TASK, index=False)
     rprint("[bold green]🎉 Audio generation completed successfully![/bold green]")

 if __name__ == "__main__":
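The two path templates above rely on f-string brace escaping: `{_AUDIO_TMP_DIR}` is interpolated immediately, while `{{}}` survives as a literal `{}` to be filled per segment later with `.format()`. A quick self-contained illustration (the directory value here is made up; the real one comes from core.utils.models):

_AUDIO_SEGS_DIR = "output/audio/segs"  # illustrative value
OUTPUT_FILE_TEMPLATE = f"{_AUDIO_SEGS_DIR}/{{}}.wav"

print(OUTPUT_FILE_TEMPLATE)             # output/audio/segs/{}.wav
print(OUTPUT_FILE_TEMPLATE.format(42))  # output/audio/segs/42.wav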
(file name hidden in the commit view — likely core/_11_merge_audio.py, since it defines the merge_full_audio entry point referenced above)

Lines changed: 9 additions & 20 deletions

@@ -1,19 +1,17 @@
-import sys, os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import os
 import pandas as pd
 import subprocess
 from pydub import AudioSegment
-from rich import print as rprint
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
 from rich.console import Console
+from core.utils import *
+from core.utils.models import *
 console = Console()

-INPUT_EXCEL = 'output/audio/tts_tasks.xlsx'
 DUB_VOCAL_FILE = 'output/dub.mp3'

 DUB_SUB_FILE = 'output/dub.srt'
-SEGS_DIR = 'output/audio/segs'
-OUTPUT_FILE_TEMPLATE = f"{SEGS_DIR}/{{}}.wav"
+OUTPUT_FILE_TEMPLATE = f"{_AUDIO_SEGS_DIR}/{{}}.wav"

 def load_and_flatten_data(excel_file):
     """Load and flatten Excel data"""

@@ -45,7 +43,7 @@ def process_audio_segment(audio_file):
         '-i', audio_file,
         '-ar', '16000',
         '-ac', '1',
-        '-b:a', '128k',
+        '-b:a', '64k',
         temp_file
     ]
     subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

@@ -56,12 +54,7 @@ def process_audio_segment(audio_file):
 def merge_audio_segments(audios, new_sub_times, sample_rate):
     merged_audio = AudioSegment.silent(duration=0, frame_rate=sample_rate)

-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        BarColumn(),
-        TaskProgressColumn(),
-    ) as progress:
+    with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn()) as progress:
         merge_task = progress.add_task("🎵 Merging audio segments...", total=len(audios))

         for i, (audio_file, time_range) in enumerate(zip(audios, new_sub_times)):

@@ -90,7 +83,7 @@ def merge_audio_segments(audios, new_sub_times, sample_rate):
     return merged_audio

 def create_srt_subtitle():
-    df, lines, new_sub_times = load_and_flatten_data(INPUT_EXCEL)
+    df, lines, new_sub_times = load_and_flatten_data(_8_1_AUDIO_TASK)

     with open(DUB_SUB_FILE, 'w', encoding='utf-8') as f:
         for i, ((start_time, end_time), line) in enumerate(zip(new_sub_times, lines), 1):

@@ -108,7 +101,7 @@ def merge_full_audio():
     console.print("\n[bold cyan]🎬 Starting audio merging process...[/bold cyan]")

     with console.status("[bold cyan]📊 Loading data from Excel...[/bold cyan]"):
-        df, lines, new_sub_times = load_and_flatten_data(INPUT_EXCEL)
+        df, lines, new_sub_times = load_and_flatten_data(_8_1_AUDIO_TASK)
     console.print("[bold green]✅ Data loaded successfully[/bold green]")

     with console.status("[bold cyan]🔍 Getting audio file list...[/bold cyan]"):

@@ -130,11 +123,7 @@ def merge_full_audio():
     with console.status("[bold cyan]💾 Exporting final audio file...[/bold cyan]"):
         merged_audio = merged_audio.set_frame_rate(16000).set_channels(1)
-        merged_audio.export(
-            DUB_VOCAL_FILE,
-            format="mp3",
-            parameters=["-b:a", "64k"]
-        )
+        merged_audio.export(DUB_VOCAL_FILE, format="mp3", parameters=["-b:a", "64k"])
     console.print(f"[bold green]✅ Audio file successfully merged![/bold green]")
     console.print(f"[bold green]📁 Output file: {DUB_VOCAL_FILE}[/bold green]")