
Commit 80a6b75

actions-user and GitHub Action authored
feat: optimize label when data is empty in rag metrics (#330)
* feat: optimize label when data is empty in rag metrics
* 📚 Auto-update metrics documentation

Co-authored-by: GitHub Action <[email protected]>
1 parent 1b01708 commit 80a6b75

File tree

5 files changed: +64 -7 lines changed


dingo/model/llm/rag/llm_rag_answer_relevancy.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -90,8 +90,7 @@ def build_messages(cls, input_data: Data) -> List:
         raw_data = getattr(input_data, 'raw_data', {})
         answer = input_data.content or raw_data.get("answer", "")

-        if not answer:
-            raise ValueError("Answer Relevancy evaluation requires the answer field")
+        # Note: the empty-answer case is handled in eval(); answer is assumed to be non-empty here

         # Use json.dumps() to safely escape the response string
         import json
@@ -223,6 +222,19 @@ def calculate_score(cls, answers: List[Dict[str, Any]], original_question: str)
     def eval(cls, input_data: Data) -> EvalDetail:
         """Evaluate answer relevancy"""
         raw_data = getattr(input_data, 'raw_data', {})
+
+        # Check whether answer is empty
+        answer = input_data.content or raw_data.get("answer", "")
+        if not answer:
+            # If answer is empty, return a score of 0 directly
+            log.warning("Answer Relevancy evaluation: answer field is empty, returning a score of 0")
+            result = EvalDetail(metric=cls.__name__)
+            result.score = 0.0
+            result.status = True
+            result.label = ["QUALITY_BAD.ANSWER_RELEVANCY_NO_ANSWER"]
+            result.reason = ["answer field is empty; cannot evaluate answer relevancy, score set to 0"]
+            return result
+
         # Extract the original question
         original_question = input_data.prompt or raw_data.get("question", "")
         if not original_question:
```
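In effect, the commit swaps a hard ValueError for a labeled zero-score result when the answer is missing. Below is a minimal standalone sketch of that fast-path pattern; `EvalDetail` here is a stand-in dataclass mirroring the fields set in the diff, and the function name is hypothetical, not the real dingo API:

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class EvalDetail:
    """Stand-in for dingo's EvalDetail; field names follow the diff above."""
    metric: str
    score: float = 0.0
    status: bool = False
    label: List[str] = field(default_factory=list)
    reason: List[str] = field(default_factory=list)

def eval_answer_relevancy(content: str, raw_data: dict) -> EvalDetail:
    answer = content or raw_data.get("answer", "")
    if not answer:
        # Empty answer: short-circuit with a labeled 0-score result
        # instead of raising ValueError from build_messages().
        return EvalDetail(
            metric="LLMRagAnswerRelevancy",  # hypothetical class name
            score=0.0,
            status=True,
            label=["QUALITY_BAD.ANSWER_RELEVANCY_NO_ANSWER"],
            reason=["answer field is empty; score set to 0"],
        )
    raise NotImplementedError("normal LLM-based evaluation path elided")

print(eval_answer_relevancy("", {}).label)
# ['QUALITY_BAD.ANSWER_RELEVANCY_NO_ANSWER']
```

Note that `status` is set to `True`: the record counts as evaluated with a bad-quality label rather than as a pipeline error, which appears to be what lets downstream reporting surface the new `QUALITY_BAD.*` labels instead of aborting the run.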

dingo/model/llm/rag/llm_rag_context_precision.py

Lines changed: 15 additions & 2 deletions
```diff
@@ -166,8 +166,7 @@ def build_messages(cls, input_data: Data) -> List:
         question = input_data.prompt or raw_data.get("question", "")
         answer = input_data.content or raw_data.get("answer", "")

-        if not answer:
-            raise ValueError("Context Precision evaluation requires the answer field")
+        # Note: the empty-answer case is handled in eval(); answer is assumed to be non-empty here

         # Process contexts
         contexts = None
@@ -277,6 +276,20 @@ def eval(cls, input_data: Data) -> EvalDetail:
         if cls.client is None:
             cls.create_client()

+        # Check whether answer is empty
+        raw_data = getattr(input_data, 'raw_data', {})
+        answer = input_data.content or raw_data.get("answer", "")
+
+        if not answer:
+            # If answer is empty, return a score of 0 directly
+            log.warning("Context Precision evaluation: answer field is empty, returning a score of 0")
+            result = EvalDetail(metric=cls.__name__)
+            result.score = 0.0
+            result.status = True
+            result.label = ["QUALITY_BAD.CONTEXT_PRECISION_NO_ANSWER"]
+            result.reason = ["answer field is empty; cannot evaluate context precision, score set to 0"]
+            return result
+
         # Get messages for all contexts
         messages_list = cls.build_messages(input_data)
         responses = []
```
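Both this guard and the Answer Relevancy one resolve the answer as `input_data.content or raw_data.get("answer", "")`, so any falsy content (empty string or None) routes to the fast path. A minimal sketch of that resolution order, with plain arguments standing in for the `Data` attributes:

```python
def resolve_answer(content, raw_data: dict) -> str:
    # content wins when non-empty; otherwise fall back to raw_data["answer"]
    return content or raw_data.get("answer", "")

assert resolve_answer("a1", {"answer": "a2"}) == "a1"  # content takes precedence
assert resolve_answer("", {"answer": "a2"}) == "a2"    # fall back to raw_data
assert not resolve_answer(None, {})                    # missing everywhere -> fast path
assert not resolve_answer("", {"answer": ""})          # empty strings too -> fast path
```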

dingo/model/llm/rag/llm_rag_context_recall.py

Lines changed: 28 additions & 2 deletions
```diff
@@ -145,11 +145,11 @@ def build_messages(cls, input_data: Data) -> List:
         else:
             contexts = [raw_contexts]

-        if not expected_output:
-            raise ValueError("Context Recall evaluation requires the expected_output or answer field")
         if not contexts:
             raise ValueError("Context Recall evaluation requires the contexts field")

+        # Note: the empty expected_output case is handled in eval(); expected_output is assumed to be non-empty here
+
         # Join the contexts
         combined_contexts = "\n\n".join([f"Context {i + 1}:\n{ctx}" for i, ctx in enumerate(contexts)])

@@ -228,3 +228,29 @@ def process_response(cls, response: str) -> EvalDetail:
         result.reason = [f"Context recall evaluation did not pass (score: {score:.2f}/10)\n{reason_text}"]

         return result
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:
+        """Override the parent class eval method, adding a check on expected_output"""
+        if cls.client is None:
+            cls.create_client()
+
+        # Check whether expected_output or answer is empty
+        raw_data = getattr(input_data, 'raw_data', {})
+        expected_output = raw_data.get("expected_output", "")
+        if not expected_output:
+            # If there is no expected_output, try content or answer
+            expected_output = input_data.content or raw_data.get("answer", "")
+
+        if not expected_output:
+            # If both expected_output and answer are empty, return a score of 0 directly
+            log.warning("Context Recall evaluation: expected_output and answer fields are both empty, returning a score of 0")
+            result = EvalDetail(metric=cls.__name__)
+            result.score = 0.0
+            result.status = True
+            result.label = ["QUALITY_BAD.CONTEXT_RECALL_NO_REFERENCE"]
+            result.reason = ["expected_output and answer fields are both empty; cannot evaluate context recall, score set to 0"]
+            return result
+
+        # Call the parent class eval method
+        return super().eval(input_data)
```
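Context Recall resolves its reference in one more step than the other two metrics: an explicit `expected_output` wins, then `content`, then `raw_data["answer"]`. A standalone sketch of the chain as the new `eval()` applies it:

```python
def resolve_reference(content, raw_data: dict) -> str:
    expected_output = raw_data.get("expected_output", "")
    if not expected_output:
        # No gold reference: fall back to the model answer
        expected_output = content or raw_data.get("answer", "")
    return expected_output

assert resolve_reference("answer", {"expected_output": "gold"}) == "gold"
assert resolve_reference("answer", {}) == "answer"
assert resolve_reference("", {"answer": "a"}) == "a"
assert resolve_reference("", {}) == ""  # triggers the CONTEXT_RECALL_NO_REFERENCE fast path
```

Unlike the other two files, this one adds a full `eval()` override and delegates to `super().eval()` on the happy path, presumably because the old empty-reference check lived in `build_messages()`, where no `EvalDetail` could be returned.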

docs/metrics.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -118,6 +118,12 @@ This document provides comprehensive information about all quality metrics used
 | `RESUME_QUALITY_BAD_PROFESSIONALISM` | RuleResumeEmoji, RuleResumeInformal | Detects emoji usage in resume which reduces professionalism; Detects informal or colloquial expressions in resume | Internal Implementation | N/A | N/A |
 | `RESUME_QUALITY_BAD_STRUCTURE` | RuleResumeNameMissing, RuleResumeSectionMissing | Checks if resume contains a name in the first 200 characters; Checks if resume contains required sections like educat... | Internal Implementation | N/A | N/A |

+### SFT Data Assessment Metrics - Agent-Enhanced
+
+| Type | Metric | Description | Paper Source | Evaluation Results | Examples |
+|------|--------|-------------|--------------|-------------------|----------|
+| `AgentHallucination` | AgentHallucination | Agent-based hallucination detection with automatic web search for missing context | Internal Implementation | N/A | N/A |
+
 ### Text Generation

 | Type | Metric | Description | Paper Source | Evaluation Results | Examples |
```

examples/rag/dataset_rag_eval_baseline.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -39,7 +39,7 @@
 OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
 OPENAI_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
 OPENAI_KEY = os.getenv("OPENAI_API_KEY", "")
-EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "doubao-embedding-large-text-250515")

 # Data file paths
 INPUT_DATA_PATH = str(PROJECT_ROOT / "test/data/fiqa.jsonl")  # or "test/data/ragflow_eval_data_50.jsonl"
```
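All of the example's settings are environment-overridable, so the previous OpenAI embedding default can be restored without editing the script. A minimal sketch (the doubao model name is the new default from the diff; the OpenAI name is the old one):

```python
import os

# Restore the previous OpenAI embedding default for this process only;
# the example script picks it up via os.getenv("EMBEDDING_MODEL", ...).
os.environ["EMBEDDING_MODEL"] = "text-embedding-3-large"

EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "doubao-embedding-large-text-250515")
assert EMBEDDING_MODEL == "text-embedding-3-large"
```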
