Skip to content

Commit b9e2271

Browse files
authored
Merge pull request #399 from Huanshere/add-cosyvoice
Add cosyvoice
2 parents 9219a91 + c935283 commit b9e2271

File tree

13 files changed

+182
-46
lines changed

13 files changed

+182
-46
lines changed

README.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,23 @@ Difference from similar projects: **Single-line subtitles only, superior transla
4141

4242
<table>
4343
<tr>
44-
<td width="50%">
44+
<td width="33%">
4545

46-
### Russian Translation
46+
### Dual Subtitles
4747
---
48-
https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7
48+
https://github.com/user-attachments/assets/3bb51c03-4b5d-4b12-abce-8627873113e6
4949

5050
</td>
51-
<td width="50%">
51+
<td width="33%">
5252

53-
### GPT-SoVITS Dubbing
53+
### Cosy2 Voice Clone
54+
---
55+
https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
56+
57+
</td>
58+
<td width="33%">
59+
60+
### GPT-SoVITS with my voice
5461
---
5562
https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
5663

config.yaml

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
# * Settings marked with * are advanced settings that won't appear in the Streamlit page and can only be modified manually in config.yaml
2-
version: "2.2.0"
2+
version: "2.2.1"
33
## ======================== Basic Settings ======================== ##
44
display_language: "zh-CN"
55

66
# API settings
77
api:
88
key: 'YOUR_API_KEY'
9-
base_url: 'https://api.302.ai'
10-
model: 'deepseek-chat'
9+
base_url: 'https://api.302.ai/'
10+
model: 'deepseek-coder'
1111

1212
# Language settings, written into the prompt, can be described in natural language
1313
target_language: '简体中文'
@@ -22,7 +22,7 @@ whisper:
2222
language: 'en'
2323
detected_language: 'en'
2424
# Whisper running mode ["local", "cloud"]. Specifies where to run, cloud uses 302.ai API
25-
runtime: 'cloud'
25+
runtime: 'local'
2626
# 302.ai API key
2727
whisperX_302_api_key: 'YOUR_302_API_KEY'
2828

@@ -55,7 +55,7 @@ pause_before_translate: false
5555

5656
## ======================== Dubbing Settings ======================== ##
5757
# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, sf_cosyvoice2, custom_tts]
58-
tts_method: 'azure_tts'
58+
tts_method: 'sf_cosyvoice2'
5959

6060
# SiliconFlow FishTTS
6161
sf_fish_tts:
@@ -87,6 +87,10 @@ fish_tts:
8787
'AD学姐': '7f92f8afb8ec43bf81429cc1c9199cb1'
8888
'丁真': '54a5170264694bfc8e9ad98df7bd89c3'
8989

90+
# SiliconFlow CosyVoice2 Clone
91+
sf_cosyvoice2:
92+
api_key: 'YOUR_SF_KEY'
93+
9094
# Edge TTS configuration
9195
edge_tts:
9296
voice: 'zh-CN-XiaoxiaoNeural'
@@ -140,8 +144,6 @@ llm_support_json:
140144
- 'gpt-4o'
141145
- 'gpt-4o-mini'
142146
- 'gemini-2.0-flash-exp'
143-
- 'deepseek-coder'
144-
- 'deepseek-chat'
145147

146148
# have problems
147149
# - 'Qwen/Qwen2.5-72B-Instruct'
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
from openai import OpenAI
2+
from pathlib import Path
3+
import base64
4+
import os
5+
import sys
6+
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
7+
from core.config_utils import load_key
8+
9+
def wav_to_base64(wav_file_path):
    """Return the contents of the file at ``wav_file_path`` as Base64 text.

    The file is read in binary mode in one go and the Base64 bytes are
    decoded to a ``str`` so the result can be embedded in a data URI.
    """
    with open(wav_file_path, mode='rb') as wav_handle:
        encoded = base64.b64encode(wav_handle.read())
    return encoded.decode('utf-8')
14+
15+
def cosyvoice_tts_for_videolingo(text, save_as, number, task_df):
    """Synthesize ``text`` with SiliconFlow CosyVoice2, cloning a reference voice.

    The reference clip is looked up under ``output/audio/refers`` relative to
    the current working directory: ``{number}.wav`` is preferred and ``1.wav``
    is the fallback. When neither exists, reference-audio extraction is run
    once and the lookup is repeated, so a clip produced for this exact line is
    picked up instead of always falling back to ``1.wav``.

    Args:
        text: Text to synthesize.
        save_as: Destination path of the WAV file; missing parent
            directories are created.
        number: Value matched against the ``number`` column of ``task_df``;
            it also names the preferred reference clip.
        task_df: Table with ``number`` and ``origin`` columns (indexed with
            ``.loc``); the ``origin`` cell of the matching row is sent as the
            transcript of the reference audio.

    Returns:
        True once the response has been streamed to ``save_as``.

    Raises:
        IndexError: If no row of ``task_df`` matches ``number``.
        FileNotFoundError: If no reference clip exists even after extraction.
        Exception: Any error raised while importing or running the
            reference-audio extraction is logged and re-raised unchanged.
    """
    prompt_text = task_df.loc[task_df['number'] == number, 'origin'].values[0]
    API_KEY = load_key("sf_cosyvoice2.api_key")

    # Candidate reference clips, in order of preference.
    refers_dir = Path.cwd() / "output/audio/refers"
    primary_ref = refers_dir / f"{number}.wav"
    fallback_ref = refers_dir / "1.wav"

    def _pick_reference():
        # First candidate that exists on disk, or None when both are missing.
        return next(
            (candidate for candidate in (primary_ref, fallback_ref) if candidate.exists()),
            None,
        )

    ref_audio_path = _pick_reference()
    if ref_audio_path is None:
        try:
            # Imported lazily: extraction is only needed when no clip exists yet.
            from core.step9_extract_refer_audio import extract_refer_audio_main
            print(f"参考音频文件不存在,尝试提取: {fallback_ref}")
            extract_refer_audio_main()
        except Exception as e:
            print(f"提取参考音频失败: {str(e)}")
            raise

        # Look again: extraction may have produced the clip for this line.
        ref_audio_path = _pick_reference()
        if ref_audio_path is None:
            # Fail here with a clear message instead of inside open().
            raise FileNotFoundError(
                f"Reference audio not found after extraction: {primary_ref} or {fallback_ref}"
            )

    # The API takes the reference audio inline as a Base64 data URI.
    reference_base64 = wav_to_base64(ref_audio_path)

    client = OpenAI(
        api_key=API_KEY,
        base_url="https://api.siliconflow.cn/v1"
    )

    save_path = Path(save_as)
    save_path.parent.mkdir(parents=True, exist_ok=True)

    with client.audio.speech.with_streaming_response.create(
        model="FunAudioLLM/CosyVoice2-0.5B",
        voice="",  # left empty; the voice comes from the reference clip below
        input=text,
        response_format="wav",
        extra_body={
            "references": [
                {
                    "audio": f"data:audio/wav;base64,{reference_base64}",
                    "text": prompt_text
                }
            ]
        }
    ) as response:
        response.stream_to_file(save_path)

    print(f"音频已成功保存至: {save_path}")
    return True
File renamed without changes.

core/all_tts_functions/tts_main.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,12 @@
77
from core.config_utils import load_key
88
from core.all_whisper_methods.audio_preprocess import get_audio_duration
99
from core.all_tts_functions.gpt_sovits_tts import gpt_sovits_tts_for_videolingo
10-
from core.all_tts_functions.siliconflow_fish_tts import siliconflow_fish_tts_for_videolingo
10+
from core.all_tts_functions.sf_fishtts import siliconflow_fish_tts_for_videolingo
1111
from core.all_tts_functions.openai_tts import openai_tts
1212
from core.all_tts_functions.fish_tts import fish_tts
1313
from core.all_tts_functions.azure_tts import azure_tts
1414
from core.all_tts_functions.edge_tts import edge_tts
15+
from core.all_tts_functions.sf_cosyvoice2 import cosyvoice_tts_for_videolingo
1516
from core.all_tts_functions.custom_tts import custom_tts
1617
from core.ask_gpt import ask_gpt
1718
from core.prompts_storage import get_correct_text_prompt
@@ -61,6 +62,8 @@ def tts_main(text, save_as, number, task_df):
6162
edge_tts(text, save_as)
6263
elif TTS_METHOD == 'custom_tts':
6364
custom_tts(text, save_as)
65+
elif TTS_METHOD == 'sf_cosyvoice2':
66+
cosyvoice_tts_for_videolingo(text, save_as, number, task_df)
6467

6568
# Check generated audio duration
6669
duration = get_audio_duration(save_as)

core/all_whisper_methods/whisperX_302.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,25 @@ def transcribe_audio_302(audio_path: str, start: float = None, end: float = None
4949
'Authorization': f'Bearer {load_key("whisper.whisperX_302_api_key")}'
5050
}
5151

52-
response = requests.request("POST", url, headers=headers, data=payload, files=files)
52+
# 使用 with 语句确保文件正确关闭
53+
with open(audio_path, 'rb') as audio_file:
54+
files = [
55+
('audio_input', (
56+
os.path.basename(audio_path),
57+
audio_file,
58+
'application/octet-stream'
59+
))
60+
]
61+
response = requests.request("POST", url, headers=headers, data=payload, files=files)
5362

5463
# 清理临时文件
5564
if start is not None and end is not None:
5665
if os.path.exists(temp_audio_path):
57-
os.unlink(temp_audio_path)
66+
time.sleep(0.1)
67+
try:
68+
os.unlink(temp_audio_path)
69+
except PermissionError:
70+
print(f"警告:无法删除临时文件 {temp_audio_path}")
5871

5972
with open(LOG_FILE, "w", encoding="utf-8") as f:
6073
json.dump(response.json(), f, indent=4, ensure_ascii=False)

st_components/sidebar_setting.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def page_setting():
8080
update_key("burn_subtitles", burn_subtitles)
8181
st.rerun()
8282
with st.expander(t("Dubbing Settings"), expanded=True):
83-
tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts"]
83+
tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2"]
8484
select_tts = st.selectbox(t("TTS Method"), options=tts_methods, index=tts_methods.index(load_key("tts_method")))
8585
if select_tts != load_key("tts_method"):
8686
update_key("tts_method", select_tts)
@@ -138,9 +138,13 @@ def page_setting():
138138
if selected_refer_mode != load_key("gpt_sovits.refer_mode"):
139139
update_key("gpt_sovits.refer_mode", selected_refer_mode)
140140
st.rerun()
141+
141142
elif select_tts == "edge_tts":
142143
config_input(t("Edge TTS Voice"), "edge_tts.voice")
143144

145+
elif select_tts == "sf_cosyvoice2":
146+
config_input(t("SiliconFlow API Key"), "sf_cosyvoice2.api_key")
147+
144148
def check_api():
145149
try:
146150
resp = ask_gpt("This is a test, response 'message':'success' in json format.",

translations/README.es.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,23 @@ Diferencia con proyectos similares: **Solo subtítulos de una línea, calidad su
4141

4242
<table>
4343
<tr>
44-
<td width="50%">
44+
<td width="33%">
4545

46-
### Traducción al Ruso
46+
### Subtítulos Duales
4747
---
48-
https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7
48+
https://github.com/user-attachments/assets/3bb51c03-4b5d-4b12-abce-8627873113e6
4949

5050
</td>
51-
<td width="50%">
51+
<td width="33%">
5252

53-
### Doblaje GPT-SoVITS
53+
### Clonación de Voz Cosy2
54+
---
55+
https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
56+
57+
</td>
58+
<td width="33%">
59+
60+
### GPT-SoVITS con mi voz
5461
---
5562
https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
5663

translations/README.fr.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,23 @@ Différence par rapport aux projets similaires : **Sous-titres sur une seule lig
4141

4242
<table>
4343
<tr>
44-
<td width="50%">
44+
<td width="33%">
4545

46-
### Traduction en russe
46+
### Sous-titres Doubles
4747
---
48-
https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7
48+
https://github.com/user-attachments/assets/3bb51c03-4b5d-4b12-abce-8627873113e6
4949

5050
</td>
51-
<td width="50%">
51+
<td width="33%">
5252

53-
### Doublage GPT-SoVITS
53+
### Clonage Vocal Cosy2
54+
---
55+
https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
56+
57+
</td>
58+
<td width="33%">
59+
60+
### GPT-SoVITS avec ma voix
5461
---
5562
https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
5663

translations/README.ja.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,23 @@ VideoLingoは、Netflixクオリティの字幕を生成することを目的と
4141

4242
<table>
4343
<tr>
44-
<td width="50%">
44+
<td width="33%">
4545

46-
### ロシア語翻訳
46+
### デュアル字幕
4747
---
48-
https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7
48+
https://github.com/user-attachments/assets/3bb51c03-4b5d-4b12-abce-8627873113e6
4949

5050
</td>
51-
<td width="50%">
51+
<td width="33%">
5252

53-
### GPT-SoVITS吹き替え
53+
### Cosy2 ボイスクローン
54+
---
55+
https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
56+
57+
</td>
58+
<td width="33%">
59+
60+
### GPT-SoVITS 吹き替え
5461
---
5562
https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
5663

0 commit comments

Comments
 (0)