feat: add custom terms

Huanshere · Huanshere · commit b2b1c22a2d8f · 2024-12-05T14:41:37.000+08:00
diff --git a/README.md b/README.md
@@ -17,19 +17,19 @@ VideoLingo is an all-in-one video translation, localization, and dubbing tool ai
 Key features:
 - 🎥 YouTube video download via yt-dlp
 
-- **🎙️ Word-level subtitle recognition with WhisperX**
+- **🎙️ Word-level, low-hallucination subtitle recognition with WhisperX**
 
-- **📝 NLP and GPT-based subtitle segmentation**
+- **📝 NLP and AI-powered subtitle segmentation**
 
-- **📚 GPT-generated terminology for coherent translation**
+- **📚 Custom + AI-generated terminology for coherent translation**
 
-- **🔄 3-step direct translation, reflection, and adaptation for professional-level quality**
+- **🔄 3-step Translate-Reflect-Adapt process for cinematic quality**
 
-- **✅ Netflix-standard single-line subtitles only**
+- **✅ Netflix-standard, single-line subtitles only**
 
-- **🗣️ Dubbing alignment with GPT-SoVITS and other methods**
+- **🗣️ Dubbing with GPT-SoVITS, Azure, OpenAI, and more**
 
-- 🚀 One-click startup and output in Streamlit
+- 🚀 One-click startup and processing in Streamlit
 
 - 📝 Detailed logging with progress resumption
 
diff --git a/core/prompts_storage.py b/core/prompts_storage.py
@@ -37,25 +37,34 @@ def get_split_prompt(sentence, num_parts = 2, word_limit = 20):
 
 ## ================================================================
 # @ step4_1_summarize.py
-def get_summary_prompt(source_content):
+def get_summary_prompt(source_content, custom_terms_json=None):
     src_lang = load_key("whisper.detected_language")
     tgt_lang = load_key("target_language")
+    
+    # add custom terms note
+    terms_note = ""
+    if custom_terms_json:
+        terms_list = []
+        for term in custom_terms_json['terms']:
+            terms_list.append(f"- {term['src']}: {term['tgt']} ({term['note']})")
+        terms_note = "\n### Existing Terms\nPlease exclude these terms in your extraction:\n" + "\n".join(terms_list)
+    
     summary_prompt = f"""
 ### Role
 You are a video translation expert and terminology consultant, specializing in {src_lang} comprehension and {tgt_lang} expression optimization.
 
 ### Task
 For the provided {src_lang} video text:
 1. Summarize main topic in two sentences
-2. Extract professional terms/names with {tgt_lang} translations
-3. Provide brief explanation for each term
+2. Extract professional terms/names with {tgt_lang} translations (excluding existing terms)
+3. Provide brief explanation for each term{terms_note}
 
 ### Steps
 1. Topic Summary:
    - Quick scan for general understanding
    - Write two sentences: first for main topic, second for key point
 2. Term Extraction:
-   - Mark professional terms and names
+   - Mark professional terms and names (excluding those listed in Existing Terms)
    - Provide {tgt_lang} translation or keep original
    - Add brief explanation
    - Keep abbreviations and proper nouns unchanged
diff --git a/core/step4_1_summarize.py b/core/step4_1_summarize.py
@@ -3,9 +3,11 @@
 from core.ask_gpt import ask_gpt
 from core.prompts_storage import get_summary_prompt
 from core.config_utils import load_key
+import pandas as pd
 
 TERMINOLOGY_JSON_PATH = 'output/log/terminology.json'
 SENTENCE_TXT_PATH = 'output/log/sentence_splitbymeaning.txt'
+CUSTOM_TERMS_PATH = 'custom_terms.xlsx'
 
 def combine_chunks():
     """Combine the text chunks identified by whisper into a single long text"""
@@ -33,7 +35,21 @@ def search_things_to_note_in_prompt(sentence):
 
 def get_summary():
     src_content = combine_chunks()
-    summary_prompt = get_summary_prompt(src_content)
+    custom_terms = pd.read_excel(CUSTOM_TERMS_PATH)
+    custom_terms_json = {
+        "terms": [
+            {
+                "src": str(row.iloc[0]),
+                "tgt": str(row.iloc[1]), 
+                "note": str(row.iloc[2])
+            }
+            for _, row in custom_terms.iterrows()
+        ]
+    }
+    if len(custom_terms) > 0:
+        print(f"📖 Custom Terms Loaded: {len(custom_terms)} terms")
+        print("📝 Terms Content:", json.dumps(custom_terms_json, indent=2, ensure_ascii=False))
+    summary_prompt = get_summary_prompt(src_content, custom_terms_json)
     print("📝 Summarizing and extracting terminology ...")
     
     def valid_summary(response_data):
@@ -46,6 +62,8 @@ def valid_summary(response_data):
         return {"status": "success", "message": "Summary completed"}
 
     summary = ask_gpt(summary_prompt, response_json=True, valid_def=valid_summary, log_title='summary')
+    if 'terms' in summary:
+        summary['terms'].extend(custom_terms_json['terms'])
     
     with open(TERMINOLOGY_JSON_PATH, 'w', encoding='utf-8') as f:
         json.dump(summary, f, ensure_ascii=False, indent=4)
diff --git a/custom_terms.xlsx b/custom_terms.xlsx
diff --git a/docs/pages/docs/start.en-US.md b/docs/pages/docs/start.en-US.md
@@ -154,7 +154,7 @@ Before installing VideoLingo, ensure you have installed Git and Anaconda.
 
    ![tutorial](https://github.com/user-attachments/assets/983ba58b-5ae3-4132-90f5-6d48801465dd)
 
-6. (Optional) More settings can be manually modified in `config.yaml`, watch command line output during operation
+6. (Optional) More settings can be manually modified in `config.yaml`; watch the command-line output during operation. To use custom terms, add them to `custom_terms.xlsx` before processing, e.g. `Baguette | French bread | Not just any bread!`.
 
 ## 🏭 Batch Mode (beta)
 
diff --git a/docs/pages/docs/start.zh-CN.md b/docs/pages/docs/start.zh-CN.md
@@ -43,7 +43,7 @@ VideoLingo提供了多种 tts 接入方式，以下是对比（如不使用配
 <details>
 <summary>OpenAI 声音怎么选？</summary>
 
-声音列表可以在 [官��](https://platform.openai.com/docs/guides/text-to-speech/voice-options) 找到，例如 `alloy`, `echo`, `nova`等，在 `config.yaml` 中修改 `openai_tts.voice` 即可。
+声音列表可以在 [官网](https://platform.openai.com/docs/guides/text-to-speech/voice-options) 找到，例如 `alloy`, `echo`, `nova`等，在 `config.yaml` 中修改 `openai_tts.voice` 即可。
 
 </details>
 <details>
@@ -89,7 +89,7 @@ VideoLingo提供了多种 tts 接入方式，以下是对比（如不使用配
       vits_weights_path: SoVITS_weights_v2/Huanyu_v2_e10_s150.pth
       ```
    - 参考方法 a，在和 `yaml` 文件同个目录下，放入后续使用的参考音频，命名为 `你喜欢的英文角色名_参考音频的文字内容.wav` 或 `.mp3`，例如 `Huanyuv2_你好，这是一条测试音频.wav`，程序会自动识别并使用。
-   - ⚠️ 警告：**请使用英文命名 `角色名`** ，否则会出现错误。 `���考音频的文字内容` 可以使用中文。目前仍处于测试版，可能产生报错。
+   - ⚠️ 警告：**请使用英文命名 `角色名`** ，否则会出现错误。 `参考音频的文字内容` 可以使用中文。目前仍处于测试版，可能产生报错。
 
 
    ```
@@ -160,7 +160,7 @@ VideoLingo 支持 Windows、macOS 和 Linux 系统，可使用 CPU 或 GPU 运
 
    ![tutorial](https://github.com/user-attachments/assets/983ba58b-5ae3-4132-90f5-6d48801465dd)
 
-7. （可选）更多设置可以在 `config.yaml` 中手动修改，运行过程请注意命令行输出
+7. （可选）更多设置可以在 `config.yaml` 中手动修改，运行过程请注意命令行输出。如需使用自定义术语，请在处理前将术语添加到 `custom_terms.xlsx` 中，例如 `Biden | 登子 | 美国的瞌睡总统`。
 
 ## 🏭 批量模式（beta）
 
diff --git a/i18n/README.zh.md b/i18n/README.zh.md
@@ -19,17 +19,17 @@ VideoLingo 是一站式视频翻译本地化配音工具，能够一键生成 Ne
 主要特点和功能：
 - 🎥 使用 yt-dlp 从 Youtube 链接下载视频
 
-- **🎙️ 使用 WhisperX 进行单词级时间轴字幕识别**
+- **🎙️ 使用 WhisperX 进行单词级和低幻觉字幕识别**
 
-- **📝 使用 NLP 和 GPT 根据句意进行字幕分割**
+- **📝 使用 NLP 和 AI 进行字幕分割**
 
-- **📚 GPT 总结提取术语知识库，上下文连贯翻译**
+- **📚 自定义 + AI 生成术语库，保证翻译连贯性**
 
-- **🔄 三步直译、反思、意译，媲美字幕组精翻效果**
+- **🔄 三步直译、反思、意译，实现影视级翻译质量**
 
 - **✅ 按照 Netflix 标准检查单行长度，绝无双行字幕**
 
-- **🗣️ 使用 GPT-SoVITS 等方法对齐克隆配音**
+- **🗣️ 支持 GPT-SoVITS、Azure、OpenAI 等多种配音方案**
 
 - 🚀 整合包一键启动，在 streamlit 中一键出片