+ if response.startswith('<think>'):
+ think_content = re.search(r'<think>(.*?)</think>', response, flags=re.DOTALL)
+ return think_content.group(1).strip() if think_content else ''
+ return ''
+
+ @staticmethod
+ def _clean_response(response: str) -> str:
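+ """Strip any leading <think> block and surrounding Markdown code fences from the raw model output."""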
+ response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
+
+ if response.startswith('```json'):
+ response = response[7:]
+ elif response.startswith('```'):
+ response = response[3:]
+
+ if response.endswith('```'):
+ response = response[:-3]
+
+ return response
+
+ @staticmethod
+ def _create_no_code_result(response_json: dict) -> EvalDetail:
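+ """Build the EvalDetail returned when the page contains no code to compare (label NO_CODE.code, not flagged)."""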
+ result = EvalDetail(metric="LLMCodeCompare")
+ result.status = False
+ result.label = ["NO_CODE.code"]
+ result.reason = [json.dumps(response_json, ensure_ascii=False)]
+
+ return result
+
+ @staticmethod
+ def _create_normal_result(response_json: dict) -> EvalDetail:
+ result = EvalDetail(metric="LLMCodeCompare")
+ score = response_json.get('score', 0)
+
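+ # Flag the result (status True) unless Tool A is judged better; map score 1/2/other to a verdict label.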
+ result.status = score != 1
+ tmp_type = {1: 'TOOL_ONE_BETTER', 2: 'TOOL_TWO_BETTER'}.get(score, 'TOOL_EQUAL')
+ result.label = [f"{tmp_type}.code"]
+ result.reason = [json.dumps(response_json, ensure_ascii=False)]
+
+ return result
diff --git a/dingo/model/llm/compare/llm_html_extract_compare.py b/dingo/model/llm/compare/llm_html_extract_compare.py
new file mode 100644
index 00000000..72b9836a
--- /dev/null
+++ b/dingo/model/llm/compare/llm_html_extract_compare.py
@@ -0,0 +1,165 @@
+import json
+import re
+from typing import List
+
+from dingo.io import Data
+from dingo.io.output.eval_detail import EvalDetail
+from dingo.model import Model
+from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.model.response.response_class import ResponseScoreTypeNameReason
+from dingo.utils import log
+from dingo.utils.exception import ConvertJsonError
+
+
+@Model.llm_register("LLMHtmlExtractCompare")
+class LLMHtmlExtractCompare(BaseOpenAI):
+ prompt = r"""
+ 你是一位专业的 HTML 内容提取评估专家,擅长分析 HTML 代码和 Markdown 文本的转换质量。现在我会提供三段内容:
+
+ 1. **原始网页的 HTML 代码**:这是网页的完整 HTML 结构。
+ 2. **工具A提取的 Markdown 文本**:这是从 HTML 中提取的、适合大语言模型训练的 Markdown 格式文本。
+ 3. **工具B提取的 Markdown 文本**:这是从 HTML 中提取的、适合大语言模型训练的 Markdown 格式文本。
+
+ ⚠️ 注意:工具A与工具B的顺序不是固定的,请不要因为顺序而偏好某一工具,必须客观公正地评估两个工具的实际转换质量。
+
+ 你的任务:
+ 1. 将两个工具提取出来的 Markdown 文本分别与 HTML 代码做对比。严格按以下模块类型检查提取效果:
+
+ **原始HTML元素识别:**
+ - `code`:代码块(`<code>` `<pre>` 标签)
+ - `math`:数学公式(MathJax、MathML、LaTeX 格式)
+ - `table`:表格(`<table>` 标签)
+ - `image`:图片(`<img>` 标签)
+ - `list`:有序/无序列表(`<ul>` `<ol>` 标签)
+ - `title`:标题(`<h1>`-`<h6>` 标签)
+ - `paragraph`:段落文本(`<p>` `<div>` 等文本容器)
+ - `other`:其他(非以上标签的可见内容)
+
+ **Markdown元素统计:**
+ - 代码块:\`\`\`...\`\`\` 或缩进代码
+ - 公式:`$...$` `$$...$$` `\\(...\\)` `\\[...\\]`
+ - 表格:`|...|` 格式
+ - 图片:`![...](...)` 格式
+ - 列表:`-` `*` `1.` 等标记
+ - 标题:`#` `##` 等标记
+ - 段落:普通文本块
+
+ 2. **评分规则**:评价两个抽取工具的抽取质量,判断哪个工具抽取效果更好。
+ - **抽取完整性**:检查 Markdown 文本是否完整抽取了 HTML 中的关键内容(如代码块、表格、图片、列表等)。
+ - **格式准确性**:检查 Markdown 文本的格式是否正确(如代码块缩进、表格对齐、图片链接等)。
+ - **语义连贯性**:检查 Markdown 文本是否保持了 HTML 内容的语义连贯性(如段落逻辑、标题层次等)。
+
+ 3. **问题反馈**:严格按上述 8 类模块定位问题,若无问题则返回空列表。
+
+ 4. **返回结果**:以 JSON 格式返回,包含3个字段:score、name、reason。
+ - `score`:如果工具A抽取效果更好,score取值为1。如果工具B抽取效果更好,score取值为2。如果工具A和工具B抽取效果基本相同,score取值为0。
+ - `name`:必须从 8 类模块中选择,且选择差异最大的问题模块。
+ - `reason`:客观描述两个工具在该模块的表现差异。
+
+ 示例输出:
+ ```json
+ {{
+ "score": [0|1|2],
+ "name": "[模块类型]",
+ "reason": "[客观描述两个工具在该模块的具体表现差异]"
+ }}
+ ```
+
+ **注意事项**:
+ 1. 禁止使用预定义模块以外的分类。
+ 2. 重点关注结构化内容(代码、表格、公式、图片等)的转换质量。
+ 3. 段落分析需检查文本连贯性和语义完整性。
+
+ ### 原始网页的 HTML 代码如下:
+
+ ```html
+ {}
+ ```
+
+ ### 工具A提取的 Markdown 文本如下:
+
+ ```md
+ {}
+ ```
+
+ ### 工具B提取的 Markdown 文本如下:
+
+ ```md
+ {}
+ ```
+
+
+ 返回结果只有一个 JSON,不要有其他任何解释说明以及分析的信息!
+ """
+
+ @classmethod
+ def build_messages(cls, input_data: Data) -> List:
+ messages = [
+ {
+ "role": "user",
+ "content": cls.prompt.format(
+ input_data.content,
+ input_data.raw_data["magic_md"],
+ input_data.raw_data["content"],
+ ),
+ }
+ ]
+ return messages
+
+ @classmethod
+ def process_response(cls, response: str) -> EvalDetail:
+ log.info(response)
+
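+ # Reasoning models may prefix the reply with a <think> block; capture it so it can be appended to the reason.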
+ response_think = ""
+ if response.startswith("
"):
+ think_content = re.search(
+ r"(.*?)", response, flags=re.DOTALL
+ )
+ response_think = think_content.group(1).strip()
+ response = re.sub(r".*?", "", response, flags=re.DOTALL)
+ response = response.strip()
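+ # Strip optional Markdown code fences (```json ... ```) before parsing the JSON payload.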
+ if response.startswith("```json"):
+ response = response[7:]
+ if response.startswith("```"):
+ response = response[3:]
+ if response.endswith("```"):
+ response = response[:-3]
+ try:
+ response_json = json.loads(response)
+ response_json["reason"] += "\n"
+ response_json["reason"] += response_think
+ except json.JSONDecodeError:
+ raise ConvertJsonError(f"Convert to JSON format failed: {response}")
+
+ response_model = ResponseScoreTypeNameReason(**response_json)
+
+ result = EvalDetail(metric=cls.__name__)
+ # status
+ if response_model.score != 1:
+ result.status = True
+
+ # type
+ # if response_model.score == 1:
+ # result.type = "TOOL_ONE_BETTER"
+ # if response_model.score == 2:
+ # result.type = "TOOL_TWO_BETTER"
+ # if response_model.score == 0:
+ # result.type = "TOOL_EQUAL"
+ #
+ # # name
+ # result.name = response_model.name
+ #
+ # # reason
+ # result.reason = [json.dumps(response_json, ensure_ascii=False)]
+
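+ # Label combines the verdict with the module name, e.g. "TOOL_TWO_BETTER.table".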
+ tmp_type = ''
+ if response_model.score == 1:
+ tmp_type = "TOOL_ONE_BETTER"
+ if response_model.score == 2:
+ tmp_type = "TOOL_TWO_BETTER"
+ if response_model.score == 0:
+ tmp_type = "TOOL_EQUAL"
+ result.label = [f"{tmp_type}.{response_model.name}"]
+ result.reason = [json.dumps(response_json, ensure_ascii=False)]
+
+ return result
diff --git a/dingo/model/llm/compare/llm_html_extract_compare_en.py b/dingo/model/llm/compare/llm_html_extract_compare_en.py
new file mode 100644
index 00000000..fae84cc1
--- /dev/null
+++ b/dingo/model/llm/compare/llm_html_extract_compare_en.py
@@ -0,0 +1,137 @@
+import json
+import re
+from typing import List
+
+from dingo.io import Data
+from dingo.io.output.eval_detail import EvalDetail
+from dingo.model import Model
+from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.model.response.response_class import ResponseScoreTypeNameReason
+from dingo.utils import log
+from dingo.utils.exception import ConvertJsonError
+
+
+@Model.llm_register("LLMHtmlExtractCompareEn")
+class LLMHtmlExtractCompareEn(BaseOpenAI):
+ prompt = r"""
+ You are a professional HTML content extraction evaluator, skilled in analyzing the conversion quality between HTML code and Markdown text. I will provide three pieces of content:
+
+ 1. **Original HTML Code**: The complete HTML structure of the webpage.
+ 2. **Tool A's Extracted Markdown**: Markdown text extracted from HTML, suitable for LLM training.
+ 3. **Tool B's Extracted Markdown**: Markdown text extracted from HTML, suitable for LLM training.
+
+ Note: The order of Tool A and Tool B is not fixed. Do not favor either tool based on order; evaluate objectively based on actual conversion quality.
+
+ Your Task:
+ 1. Compare both Markdown extractions against the HTML code. Strictly check extraction effectiveness for the following 8 module types:
+
+ **HTML Element Identification:**
+ - `code`: Code blocks (<code>, <pre> tags)
+ - `math`: Mathematical formulas (MathJax, MathML, LaTeX)
+ - `table`: Tables (<table> tags)
+ - `image`: Images (<img> tags)
+ - `list`: Ordered/unordered lists (<ul>, <ol> tags)
+ - `title`: Headings (<h1> - <h6> tags)
+ - `paragraph`: Paragraph text (<p>, <div> containers)
+ - `other`: Other visible content not covered above
+
+ **Markdown Element Statistics:**
+ - Code blocks: ```...``` or indented code
+ - Formulas: $...$ $$...$$ \(...\) \[...\]
+ - Tables: |...| format
+ - Images: ![...](...) format
+ - Lists: -, *, 1. markers
+ - Headings: #, ## markers
+ - Paragraphs: Plain text blocks
+
+ 2. **Scoring Rules**: Evaluate which tool has better extraction quality.
+ - **Extraction Completeness**: Check if key content (code, tables, images, lists) is fully extracted.
+ - **Format Accuracy**: Verify correct Markdown formatting (code indentation, table alignment, image links).
+ - **Semantic Coherence**: Ensure logical flow and heading hierarchy are preserved.
+
+ 3. **Issue Feedback**: Strictly identify problems by the 8 module types above; return empty list if no issues.
+
+ 4. **Return Result**: JSON format with 3 fields: score, name, reason.
+ - `score`: 1 if Tool A is better, 2 if Tool B is better, 0 if both tools perform roughly the same.
+ - `name`: Must be one of the 8 module types, selecting the module with greatest difference.
+ - `reason`: Objective description of performance differences in that module.
+
+ Example Output:
+ {{
+ "score": [0|1|2],
+ "name": "[module_type]",
+ "reason": "[objective description of differences]"
+ }}
+
+ ### Original HTML code of the webpage:
+
+ ```html
+ {}
+ ```
+
+ ### Markdown text extracted by Tool A:
+
+ ```md
+ {}
+ ```
+
+ ### Markdown text extracted by Tool B:
+
+ ```md
+ {}
+ ```
+
+ Return only a single JSON object, with no other explanation or analysis!
+ """
+
+ @classmethod
+ def build_messages(cls, input_data: Data) -> List:
+ messages = [
+ {
+ "role": "user",
+ "content": cls.prompt.format(
+ input_data.content,
+ input_data.raw_data["magic_md"],
+ input_data.raw_data["content"],
+ ),
+ }
+ ]
+ return messages
+
+ @classmethod
+ def process_response(cls, response: str) -> EvalDetail:
+ log.info(response)
+
+ response_think = ""
+ if response.startswith("
"):
+ think_content = re.search(
+ r"(.*?)", response, flags=re.DOTALL
+ )
+ response_think = think_content.group(1).strip()
+ response = re.sub(r".*?", "", response, flags=re.DOTALL)
+ response = response.strip()
+ if response.startswith("```json"):
+ response = response[7:]
+ if response.startswith("```"):
+ response = response[3:]
+ if response.endswith("```"):
+ response = response[:-3]
+ try:
+ response_json = json.loads(response)
+ response_json["reason"] += "\n"
+ response_json["reason"] += response_think
+ except json.JSONDecodeError:
+ raise ConvertJsonError(f"Convert to JSON format failed: {response}")
+
+ response_model = ResponseScoreTypeNameReason(**response_json)
+
+ result = EvalDetail(metric=cls.__name__)
+ # status
+ if response_model.score != 1:
+ result.status = True
+
+ # type
+ # if response_model.score == 1:
+ # result.type = "TOOL_ONE_BETTER"
+ # if response_model.score == 2:
+ # result.type = "TOOL_TWO_BETTER"
+ # if response_model.score == 0:
+ # result.type = "TOOL_EQUAL"
+ #
+ # # name
+ # result.name = response_model.name
+ #
+ # # reason
+ # result.reason = [json.dumps(response_json, ensure_ascii=False)]
+
+ tmp_type = ''
+ if response_model.score == 1:
+ tmp_type = "TOOL_ONE_BETTER"
+ if response_model.score == 2:
+ tmp_type = "TOOL_TWO_BETTER"
+ if response_model.score == 0:
+ tmp_type = "TOOL_EQUAL"
+ result.label = [f"{tmp_type}.{response_model.name}"]
+ result.reason = [json.dumps(response_json, ensure_ascii=False)]
+
+ return result
diff --git a/dingo/model/llm/llm_html_extract_compare_v2.py b/dingo/model/llm/compare/llm_html_extract_compare_v2.py
similarity index 57%
rename from dingo/model/llm/llm_html_extract_compare_v2.py
rename to dingo/model/llm/compare/llm_html_extract_compare_v2.py
index 08d204f6..891ac673 100644
--- a/dingo/model/llm/llm_html_extract_compare_v2.py
+++ b/dingo/model/llm/compare/llm_html_extract_compare_v2.py
@@ -4,10 +4,9 @@
import diff_match_patch as dmp_module
from dingo.io import Data
+from dingo.io.output.eval_detail import EvalDetail
from dingo.model import Model
from dingo.model.llm.base_openai import BaseOpenAI
-from dingo.model.modelres import ModelRes
-from dingo.model.prompt.prompt_html_extract_compare_v2 import PromptHtmlExtractCompareV2
from dingo.model.response.response_class import ResponseNameReason
from dingo.utils import log
@@ -29,7 +28,97 @@ class LLMHtmlExtractCompareV2(BaseOpenAI):
- input_data.raw_data.get("language", "en"): 语言类型 ("zh" 或 "en")
"""
- prompt = PromptHtmlExtractCompareV2
+ prompt = {
+ "content_en": r"""Please compare the following two texts, each extracted from the same webpage using different HTML parsing methods. Your task is to determine whether there is a difference in the core informational content between them.
+
+Guidelines:
+
+Core informational content refers to: main facts, key ideas, central explanations, important data, and the primary textual body of the page.
+
+DO NOT consider the following as core content:
+
+Related questions
+Related topics
+Recommended articles
+"You might also like" sections
+Titles or section headings
+Author names, credentials, affiliations, or bylines
+Reference lists, citations, or bibliographies (e.g., "[1] Smith, J. 2020…")
+Hyperlinks, URLs, or navigation elements (e.g., "Back to homepage", "Related articles", "Next/Previous")
+
+Other autogenerated content
+These elements are considered supplementary and should not influence your assessment of content differences.
+
+You should ignore differences in formatting, word order, or minor stylistic variations unless they affect the actual meaning or presence of important information.
+
+content 1:
+{text_unique_tool_a}
+
+content 2:
+{text_unique_tool_b}
+
+content 3:
+{text_common}
+
+Text A contains content 1 + content 3
+Text B contains content 2 + content 3
+You should focus on the intrinsic logic between the unique content (content 1, content 2) and the common content (content 3) as the crucial basis for judging whether there is a significant difference in core informational content.
+Explain your reasoning briefly. Then give your judgement as one of:
+A. Text A contains more core informational content than Text B
+B. Text A contains the same amount of core informational content as Text B
+C. Text A contains less core informational content than Text B
+
+Return the judgment using this format:
+A or B or C
+Please output your thought process first, and then provide your final judgement.
+""",
+ "content_cn": r"""请比较以下两段文本,它们是使用不同的 HTML 解析方法从同一网页中提取的。你的任务是判断这两段文本在核心信息内容上是否存在差异。
+
+评判指南:
+
+"核心信息内容"是指:主要事实、关键信息、核心解释、重要数据以及网页的主要正文内容。
+
+请不要将以下内容视为核心信息:
+
+- 相关问题
+- 相关主题
+- 推荐文章
+- "你可能还喜欢" 类内容
+- 标题或章节标题
+- 作者姓名、资历、机构或署名
+- 参考文献、引用或文献列表
+- 超链接、网址或导航元素
+- 其他自动生成的内容
+- 主题总结
+
+这些元素被视为附加信息,不应影响你对信息差异的判断。
+
+除非会影响实际含义或重要信息的存在,否则请忽略格式、措辞顺序或轻微风格差异。
+
+content 1:
+{text_unique_tool_a}
+
+content 2:
+{text_unique_tool_b}
+
+content 3:
+{text_common}
+
+Text A 由 content 1 + content 3 构成
+Text B 由 content 2 + content 3 构成
+你应重点关注"独有内容(content 1、content 2)"与"共同内容(content 3)"之间的内在逻辑,作为判断是否存在重要信息差异的关键依据。
+
+请简要说明你的推理过程。然后给出如下三种判断之一:
+
+A. Text A 包含的核心信息内容多于 Text B
+B. Text A 与 Text B 包含相同量的核心信息内容
+C. Text A 包含的核心信息内容少于 Text B
+
+请按以下格式返回你的判断:
+A 或 B 或 C
+请首先输出思考过程,最后再输出你的答案。
+"""
+ }
@classmethod
def extract_text_diff(cls, text_a: str, text_b: str, max_diff_length: int = 10000) -> dict:
@@ -91,9 +180,9 @@ def build_messages(cls, input_data: Data) -> List:
# 根据语言选择提示词
if language == "zh":
- prompt_template = cls.prompt.content_cn
+ prompt_template = cls.prompt["content_cn"]
else:
- prompt_template = cls.prompt.content_en
+ prompt_template = cls.prompt["content_en"]
# 填充提示词
prompt_content = prompt_template.format(
@@ -155,22 +244,22 @@ def _parse_response_to_structured(cls, response: str) -> ResponseNameReason:
)
@classmethod
- def _convert_to_model_result(cls, structured_response: ResponseNameReason) -> ModelRes:
+ def _convert_to_model_result(cls, structured_response: ResponseNameReason) -> EvalDetail:
"""
- 将结构化响应转换为 ModelRes 对象
+ 将结构化响应转换为 EvalDetail 对象
映射规则:
- - A -> TOOL_ONE_BETTER (工具A更好,error_status=False)
- - B -> TOOL_EQUAL (两者相同,error_status=False)
- - C -> TOOL_TWO_BETTER (工具B更好,error_status=True)
+ - A -> TOOL_ONE_BETTER (工具A更好,eval_status=False)
+ - B -> TOOL_EQUAL (两者相同,eval_status=False)
+ - C -> TOOL_TWO_BETTER (工具B更好,eval_status=True)
Args:
structured_response: 结构化响应对象,name 字段存储判断结果 (A/B/C)
Returns:
- ModelRes: 评估结果对象
+ EvalDetail: 评估结果对象
"""
- result = ModelRes()
+ result = EvalDetail(metric=cls.__name__)
# 从 name 字段获取判断结果
judgement = structured_response.name
@@ -179,17 +268,17 @@ def _convert_to_model_result(cls, structured_response: ResponseNameReason) -> Mo
judgement_mapping = {
"A": {
"type": "TOOL_ONE_BETTER",
- "error_status": False, # 工具A更好,正常
+ "eval_status": False, # 工具A更好,正常
"description": "工具A提取的信息更完整"
},
"B": {
"type": "TOOL_EQUAL",
- "error_status": False, # 两者相同,正常
+ "eval_status": False, # 两者相同,正常
"description": "两个工具提取的信息量相同"
},
"C": {
"type": "TOOL_TWO_BETTER",
- "error_status": True, # 工具B更好,标记为问题
+ "eval_status": True, # 工具B更好,标记为问题
"description": "工具B提取的信息更完整"
}
}
@@ -198,21 +287,26 @@ def _convert_to_model_result(cls, structured_response: ResponseNameReason) -> Mo
if not mapping:
raise ValueError(f"无效的判断结果: {judgement}")
- result.type = mapping["type"]
- result.error_status = mapping["error_status"]
- result.name = f"Judgement_{judgement}"
+ result.status = mapping["eval_status"]
+ # result.type = mapping["type"]
+ # result.name = f"Judgement_{judgement}"
+ # result.reason = [structured_response.reason]
+
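+ # Label encodes both the verdict and the raw judgement letter, e.g. "TOOL_ONE_BETTER.Judgement_A".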
+ tmp_type = mapping["type"]
+ tmp_name = f"Judgement_{judgement}"
+ result.label = [f"{tmp_type}.{tmp_name}"]
result.reason = [structured_response.reason]
return result
@classmethod
- def process_response(cls, response: str) -> ModelRes:
+ def process_response(cls, response: str) -> EvalDetail:
"""
处理 LLM 返回结果
数据流:
1. 原始响应 (str) -> 结构化响应 (ResponseNameReason)
- 2. 结构化响应 -> 评估结果 (ModelRes)
+ 2. 结构化响应 -> 评估结果 (EvalDetail)
这种分层设计的好处:
- 更清晰的责任分离
@@ -224,7 +318,7 @@ def process_response(cls, response: str) -> ModelRes:
response: LLM 原始响应文本
Returns:
- ModelRes: 评估结果对象
+ EvalDetail: 评估结果对象
"""
# 步骤1: 解析为结构化响应
structured_response = cls._parse_response_to_structured(response)
diff --git a/dingo/model/llm/compare/llm_math_compare.py b/dingo/model/llm/compare/llm_math_compare.py
new file mode 100644
index 00000000..014b89cb
--- /dev/null
+++ b/dingo/model/llm/compare/llm_math_compare.py
@@ -0,0 +1,205 @@
+import json
+import re
+from typing import List
+
+from dingo.io import Data
+from dingo.io.output.eval_detail import EvalDetail
+from dingo.model import Model
+from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.utils import log
+from dingo.utils.exception import ConvertJsonError
+
+
+@Model.llm_register('LLMMathCompare')
+class LLMMathCompare(BaseOpenAI):
+ """
+ 专注于数学公式抽取效果的对比
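+
+ 用法示意(假设性示例:假定字段与其他 compare 模型一致,Data 的构造方式仅作说明):
+ data = Data(content=html, raw_data={"magic_md": md_a, "content": md_b})
+ messages = LLMMathCompare.build_messages(data)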
+ """
+ _metric_info = {
+ 'category': 'Pretrain Text Quality Assessment Metrics',
+ 'metric_name': 'PromptMathCompare',
+ 'description': 'Compares the effectiveness of two tools in extracting mathematical formulas from HTML to Markdown format by evaluating recognition rate and accuracy to determine which tool performs better',
+ 'paper_title': '',
+ 'paper_url': '',
+ 'paper_authors': '',
+ 'evaluation_results': ''
+ }
+
+ prompt = """
+ 你是一位专业的数学公式识别评估专家,擅长分析 HTML 代码和 Markdown 文本中的数学公式。现在我会提供三段内容:
+
+ 1. **裁剪后网页的 HTML 代码**:这是原始网页经过裁剪(去除非必要标签和标签属性)的 HTML 结构。
+ 2. **工具A提取的 Markdown 文本**:这是从 HTML 中提取的、适合大语言模型训练的 Markdown 格式文本。
+ 3. **工具B提取的 Markdown 文本**:这是从 HTML 中提取的、适合大语言模型训练的 Markdown 格式文本。
+
+ ⚠️ 注意:工具A与工具B的顺序不是固定的,请不要因为顺序而偏好某一工具,最终结论必须严格基于流程2统计的数值差异。
+
+ ## 评估流程
+
+ ### 1. 公式数量统计
+
+ **原始HTML公式识别:**
+ - MathJax格式:`\\(` `\\)` `\\[` `\\]` `$$` `$`
+ - MathML标签:`