Merge pull request #399 from Huanshere/add-cosyvoice

Huanshere · web-flow · commit b9e2271b1de6 · 2025-02-16T10:35:33.000+08:00
Add cosyvoice
diff --git a/README.md b/README.md
@@ -41,16 +41,23 @@ Difference from similar projects: **Single-line subtitles only, superior transla
 
 <table>
 <tr>
-<td width="50%">
+<td width="33%">
 
-### Russian Translation
+### Dual Subtitles
 ---
-https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7
+https://github.com/user-attachments/assets/3bb51c03-4b5d-4b12-abce-8627873113e6
 
 </td>
-<td width="50%">
+<td width="33%">
 
-### GPT-SoVITS Dubbing
+### Cosy2 Voice Clone
+---
+https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
+
+</td>
+<td width="33%">
+
+### GPT-SoVITS with my voice
 ---
 https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
 
diff --git a/config.yaml b/config.yaml
@@ -1,13 +1,13 @@
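-# * Settings marked with * are advanced settings that won't appear in the Streamlit page and can only be modified manually in config.py
-version: "2.2.0"
+# * Settings marked with * are advanced settings that won't appear in the Streamlit page and can only be modified manually in config.yaml
+version: "2.2.1"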
 ## ======================== Basic Settings ======================== ##
 display_language: "zh-CN"
 
 # API settings
 api:
   key: 'YOUR_API_KEY'
-  base_url: 'https://api.302.ai'
-  model: 'deepseek-chat'
+  base_url: 'https://api.302.ai/'
+  model: 'deepseek-coder'
 
 # Language settings, written into the prompt, can be described in natural language
 target_language: '简体中文'
@@ -22,7 +22,7 @@ whisper:
   language: 'en'
   detected_language: 'en'
   # Whisper running mode ["local", "cloud"]. Specifies where to run, cloud uses 302.ai API
-  runtime: 'cloud'
+  runtime: 'local'
   # 302.ai API key
   whisperX_302_api_key: 'YOUR_302_API_KEY'
 
@@ -55,7 +55,7 @@ pause_before_translate: false
 
 ## ======================== Dubbing Settings ======================== ##
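-# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts]
-tts_method: 'azure_tts'
+# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts, sf_cosyvoice2]
+tts_method: 'sf_cosyvoice2'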
 
 # SiliconFlow FishTTS
 sf_fish_tts:
@@ -87,6 +87,10 @@ fish_tts:
     'AD学姐': '7f92f8afb8ec43bf81429cc1c9199cb1'
     '丁真': '54a5170264694bfc8e9ad98df7bd89c3'
 
+# SiliconFlow CosyVoice2 Clone
+sf_cosyvoice2:
+  api_key: 'YOUR_SF_KEY'
+
 # Edge TTS configuration
 edge_tts:
   voice: 'zh-CN-XiaoxiaoNeural'
@@ -140,8 +144,6 @@ llm_support_json:
 - 'gpt-4o'
 - 'gpt-4o-mini'
 - 'gemini-2.0-flash-exp'
-- 'deepseek-coder'
-- 'deepseek-chat'
 
 # have problems
 # - 'Qwen/Qwen2.5-72B-Instruct'
diff --git a/core/all_tts_functions/sf_cosyvoice2.py b/core/all_tts_functions/sf_cosyvoice2.py
@@ -0,0 +1,65 @@
+from openai import OpenAI
+from pathlib import Path
+import base64
+import os
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
+from core.config_utils import load_key
+
+def wav_to_base64(wav_file_path):
+    """Return the raw bytes of the file at wav_file_path as a base64 text string."""
+    raw_bytes = Path(wav_file_path).read_bytes()
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode('utf-8')
+
+def cosyvoice_tts_for_videolingo(text, save_as, number, task_df):
+    """Synthesize `text` with SiliconFlow CosyVoice2, cloning a reference clip's voice.
+
+    The reference clip is output/audio/refers/{number}.wav under the current working
+    directory, falling back to 1.wav; when neither exists, reference extraction is
+    attempted first. The transcript sent with the clip is the task_df 'origin' text
+    of the clip actually used. The WAV reply is streamed to save_as; returns True.
+    """
+    refers_dir = Path.cwd() / "output/audio/refers"
+    ref_audio_path = refers_dir / f"{number}.wav"
+    fallback_path = refers_dir / "1.wav"
+    if not ref_audio_path.exists() and not fallback_path.exists():
+        try:
+            from core.step9_extract_refer_audio import extract_refer_audio_main
+            print(f"参考音频文件不存在，尝试提取: {fallback_path}")
+            extract_refer_audio_main()
+        except Exception as e:
+            print(f"提取参考音频失败: {str(e)}")
+            raise
+
+    # Prefer this segment's own clip (extraction may have just created it).
+    ref_number = number
+    if not ref_audio_path.exists():
+        ref_audio_path, ref_number = fallback_path, 1
+
+    # The transcript must match the clip that is sent, so look it up by the clip's
+    # number; fall back to this segment's text if that row cannot be found.
+    matches = task_df.loc[task_df['number'] == ref_number, 'origin'].values
+    if len(matches) == 0:
+        matches = task_df.loc[task_df['number'] == number, 'origin'].values
+    prompt_text = matches[0]
+    reference_base64 = wav_to_base64(ref_audio_path)
+
+    client = OpenAI(
+        api_key=load_key("sf_cosyvoice2.api_key"),
+        base_url="https://api.siliconflow.cn/v1"
+    )
+    save_path = Path(save_as)
+    save_path.parent.mkdir(parents=True, exist_ok=True)
+    with client.audio.speech.with_streaming_response.create(
+        model="FunAudioLLM/CosyVoice2-0.5B",
+        voice="",
+        input=text,
+        response_format="wav",
+        extra_body={"references": [
+            {"audio": f"data:audio/wav;base64,{reference_base64}", "text": prompt_text}
+        ]},
+    ) as response:
+        response.stream_to_file(save_path)
+    print(f"音频已成功保存至: {save_path}")
+    return True
diff --git a/core/all_tts_functions/sf_fishtts.py b/core/all_tts_functions/sf_fishtts.py
diff --git a/core/all_tts_functions/tts_main.py b/core/all_tts_functions/tts_main.py
@@ -7,11 +7,12 @@
 from core.config_utils import load_key
 from core.all_whisper_methods.audio_preprocess import get_audio_duration
 from core.all_tts_functions.gpt_sovits_tts import gpt_sovits_tts_for_videolingo
-from core.all_tts_functions.siliconflow_fish_tts import siliconflow_fish_tts_for_videolingo
+from core.all_tts_functions.sf_fishtts import siliconflow_fish_tts_for_videolingo
 from core.all_tts_functions.openai_tts import openai_tts
 from core.all_tts_functions.fish_tts import fish_tts
 from core.all_tts_functions.azure_tts import azure_tts
 from core.all_tts_functions.edge_tts import edge_tts
+from core.all_tts_functions.sf_cosyvoice2 import cosyvoice_tts_for_videolingo
 from core.all_tts_functions.custom_tts import custom_tts
 from core.ask_gpt import ask_gpt
 from core.prompts_storage import get_correct_text_prompt
@@ -61,6 +62,8 @@ def tts_main(text, save_as, number, task_df):
                 edge_tts(text, save_as)
             elif TTS_METHOD == 'custom_tts':
                 custom_tts(text, save_as)
+            elif TTS_METHOD == 'sf_cosyvoice2':
+                cosyvoice_tts_for_videolingo(text, save_as, number, task_df)
             
             # Check generated audio duration
             duration = get_audio_duration(save_as)
diff --git a/core/all_whisper_methods/whisperX_302.py b/core/all_whisper_methods/whisperX_302.py
@@ -49,12 +49,25 @@ def transcribe_audio_302(audio_path: str, start: float = None, end: float = None
         'Authorization': f'Bearer {load_key("whisper.whisperX_302_api_key")}'
     }
 
-    response = requests.request("POST", url, headers=headers, data=payload, files=files)
+    # 使用 with 语句确保文件正确关闭
+    with open(audio_path, 'rb') as audio_file:
+        files = [
+            ('audio_input', (
+                os.path.basename(audio_path),
+                audio_file,
+                'application/octet-stream'
+            ))
+        ]
+        response = requests.request("POST", url, headers=headers, data=payload, files=files)
     
     # 清理临时文件
     if start is not None and end is not None:
         if os.path.exists(temp_audio_path):
-            os.unlink(temp_audio_path)
+            time.sleep(0.1)
+            try:
+                os.unlink(temp_audio_path)
+            except PermissionError:
+                print(f"警告：无法删除临时文件 {temp_audio_path}")
     
     with open(LOG_FILE, "w", encoding="utf-8") as f:
         json.dump(response.json(), f, indent=4, ensure_ascii=False)
diff --git a/st_components/sidebar_setting.py b/st_components/sidebar_setting.py
@@ -80,7 +80,7 @@ def page_setting():
             update_key("burn_subtitles", burn_subtitles)
             st.rerun()
     with st.expander(t("Dubbing Settings"), expanded=True):
-        tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts"]
+        tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2"]
         select_tts = st.selectbox(t("TTS Method"), options=tts_methods, index=tts_methods.index(load_key("tts_method")))
         if select_tts != load_key("tts_method"):
             update_key("tts_method", select_tts)
@@ -138,9 +138,13 @@ def page_setting():
             if selected_refer_mode != load_key("gpt_sovits.refer_mode"):
                 update_key("gpt_sovits.refer_mode", selected_refer_mode)
                 st.rerun()
+                
         elif select_tts == "edge_tts":
             config_input(t("Edge TTS Voice"), "edge_tts.voice")
 
+        elif select_tts == "sf_cosyvoice2":
+            config_input(t("SiliconFlow API Key"), "sf_cosyvoice2.api_key")
+        
 def check_api():
     try:
         resp = ask_gpt("This is a test, response 'message':'success' in json format.", 
diff --git a/translations/README.es.md b/translations/README.es.md
@@ -41,16 +41,23 @@ Diferencia con proyectos similares: **Solo subtítulos de una línea, calidad su
 
 <table>
 <tr>
-<td width="50%">
+<td width="33%">
 
-### Traducción al Ruso
+### Subtítulos Duales
 ---
-https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7
+https://github.com/user-attachments/assets/3bb51c03-4b5d-4b12-abce-8627873113e6
 
 </td>
-<td width="50%">
+<td width="33%">
 
-### Doblaje GPT-SoVITS
+### Clonación de Voz Cosy2
+---
+https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
+
+</td>
+<td width="33%">
+
+### GPT-SoVITS con mi voz
 ---
 https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
 
diff --git a/translations/README.fr.md b/translations/README.fr.md
@@ -41,16 +41,23 @@ Différence par rapport aux projets similaires : **Sous-titres sur une seule lig
 
 <table>
 <tr>
-<td width="50%">
+<td width="33%">
 
-### Traduction en russe
+### Sous-titres Doubles
 ---
-https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7
+https://github.com/user-attachments/assets/3bb51c03-4b5d-4b12-abce-8627873113e6
 
 </td>
-<td width="50%">
+<td width="33%">
 
-### Doublage GPT-SoVITS
+### Clonage Vocal Cosy2
+---
+https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
+
+</td>
+<td width="33%">
+
+### GPT-SoVITS avec ma voix
 ---
 https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
 
diff --git a/translations/README.ja.md b/translations/README.ja.md
@@ -41,16 +41,23 @@ VideoLingoは、Netflixクオリティの字幕を生成することを目的と
 
 <table>
 <tr>
-<td width="50%">
+<td width="33%">
 
-### ロシア語翻訳
+### デュアル字幕
 ---
-https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7
+https://github.com/user-attachments/assets/3bb51c03-4b5d-4b12-abce-8627873113e6
 
 </td>
-<td width="50%">
+<td width="33%">
 
-### GPT-SoVITS吹き替え
+### Cosy2 ボイスクローン
+---
+https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
+
+</td>
+<td width="33%">
+
+### GPT-SoVITS 吹き替え
 ---
 https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
 
diff --git a/translations/README.ru.md b/translations/README.ru.md
@@ -41,16 +41,23 @@ VideoLingo - это универсальный инструмент для пе
 
 <table>
 <tr>
-<td width="50%">
+<td width="33%">
 
-### Перевод на русский
+### Двойные Субтитры
 ---
-https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7
+https://github.com/user-attachments/assets/3bb51c03-4b5d-4b12-abce-8627873113e6
 
 </td>
-<td width="50%">
+<td width="33%">
 
-### Дубляж GPT-SoVITS
+### Клонирование Голоса Cosy2
+---
+https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
+
+</td>
+<td width="33%">
+
+### GPT-SoVITS с моим голосом
 ---
 https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
 
diff --git a/translations/README.zh-TW.md b/translations/README.zh-TW.md
@@ -41,14 +41,21 @@ VideoLingo 是一個全方位的影片翻譯、本地化和配音工具，旨在
 
 <table>
 <tr>
-<td width="50%">
+<td width="33%">
 
-### 俄語翻譯
+### 雙語字幕
 ---
-https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7
+https://github.com/user-attachments/assets/3bb51c03-4b5d-4b12-abce-8627873113e6
 
 </td>
-<td width="50%">
+<td width="33%">
+
+### Cosy2 聲音克隆
+---
+https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
+
+</td>
+<td width="33%">
 
 ### GPT-SoVITS 配音
 ---
diff --git a/translations/README.zh.md b/translations/README.zh.md