diff --git a/resources_servers/equivalence_llm_judge/app.py b/resources_servers/equivalence_llm_judge/app.py index e6d74153..380f49b1 100644 --- a/resources_servers/equivalence_llm_judge/app.py +++ b/resources_servers/equivalence_llm_judge/app.py @@ -21,7 +21,9 @@ # limitations under the License. from __future__ import annotations +import asyncio import re +from contextlib import nullcontext from typing import Any, Optional from fastapi import FastAPI @@ -57,9 +59,10 @@ class LLMJudgeResourcesServerConfig(BaseResourcesServerConfig): name: str = "equivalence_llm_judge" judge_model_server: ModelServerRef judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming + judge_endpoint_max_concurrency: Optional[int] = None judge_system_message: Optional[str] = None - judge_prompt_template: str + judge_prompt_template_fpath: str = "prompt_templates/equivalence_llm_judge.txt" judge_equal_label: str = "[[A=B]]" judge_not_equal_label: str = "[[A!=B]]" # Optional regex to extract the question from the last user message. 
@@ -249,6 +252,17 @@ class LLMJudgeResourcesServer(SimpleResourcesServer): config: LLMJudgeResourcesServerConfig + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.config.judge_endpoint_max_concurrency is not None: + self._judge_endpoint_max_concurrency = asyncio.Semaphore(value=self.config.judge_endpoint_max_concurrency) + else: + self._judge_endpoint_max_concurrency = nullcontext() + + with open(self.config.judge_prompt_template_fpath, "r") as f: + self._judge_prompt_template = f.read().strip() + def setup_webserver(self) -> FastAPI: app = super().setup_webserver() return app @@ -419,7 +433,7 @@ async def _generate_judge_evaluation( not_equal_label = cfg.judge_not_equal_label responses_create_params = cfg.judge_responses_create_params.model_copy(deep=True) - prompt_template = cfg.judge_prompt_template + prompt_template = self._judge_prompt_template system_message = cfg.judge_system_message user_prompt = prompt_template.format( @@ -432,12 +446,21 @@ async def _generate_judge_evaluation( msgs.append(NeMoGymEasyInputMessage(role="user", content=user_prompt)) responses_create_params.input = msgs - response = await self.server_client.post( - server_name=cfg.judge_model_server.name, - url_path="/v1/responses", - json=responses_create_params, - ) - judge_response = NeMoGymResponse.model_validate(await response.json()) + async with self._judge_endpoint_max_concurrency: + try: + response = await self.server_client.post( + server_name=cfg.judge_model_server.name, + url_path="/v1/responses", + json=responses_create_params, + ) + judge_response = NeMoGymResponse.model_validate(await response.json()) + except Exception as e: + print( + f"DEBUG: LLMJudgeResourcesServer: judge model server HTTP POST error: {type(e).__name__} {e}", + flush=True, + ) + raise e + eval_record = JudgeEvaluation( responses_create_params=responses_create_params, response=judge_response, diff --git 
a/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml b/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml index 55bfb96d..b08bcdb2 100644 --- a/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml +++ b/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml @@ -7,64 +7,9 @@ equivalence_llm_judge: name: policy_model judge_responses_create_params: input: [] - judge_prompt_template: |- - ===== System role ===== - You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence. - - Grading priorities (in order): - 1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations). - 2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD. - - Rules: - - Treat GOLD as authoritative for what counts as correct. - - If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set. - - For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities). - - Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent. - - Be concise. Do NOT reveal or rewrite the GOLD. - - Show your reason why they are equivalent or not equivalent first and then provide the output. - - Output (at the end after double newlines): - - If equivalent: [[A=B]] they are equivalent - - If not equivalent: [[A!=B]] they are not equivalent - - ===== Example 1 (equivalent) ===== - QUESTION: - State Avogadro’s constant (include units). - - GOLD: - 6.022 × 10^23 mol^-1 - - CANDIDATE: - 6.022e23 per mole. - - The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers. 
- - [[A=B]] they are equivalent - - ===== Example 2 (not equivalent) ===== - QUESTION: - State the first law of thermodynamics for a closed system and identify what each symbol represents. - - GOLD: - ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. - - CANDIDATE: - ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. - - The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match. - - [[A!=B]] they are not equivalent - - ===== Inputs ===== - QUESTION: - {question} - - GOLD: - {expected_answer} - - CANDIDATE: - {generated_answer} + judge_prompt_template_fpath: prompt_templates/equivalence_llm_judge.txt + # Concurrency and rate limiting configuration + judge_endpoint_max_concurrency: 64 # Max in-flight judge requests; lower this value to avoid rate limits judge_system_message: null judge_equal_label: "[[A=B]]" judge_not_equal_label: "[[A!=B]]" diff --git a/resources_servers/equivalence_llm_judge/configs/lc.yaml b/resources_servers/equivalence_llm_judge/configs/lc.yaml new file mode 120000 index 00000000..b8218928 --- /dev/null +++ b/resources_servers/equivalence_llm_judge/configs/lc.yaml @@ -0,0 +1 @@ +lc_judge.yaml \ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml b/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml new file mode 100644 index 00000000..db3843b5 --- /dev/null +++ b/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml @@ -0,0 +1,35 @@ +lc_judge: + resources_servers: + equivalence_llm_judge: + entrypoint: app.py + judge_model_server: + type: responses_api_models + name: policy_model + judge_responses_create_params: + input: [] + judge_prompt_template_fpath: prompt_templates/lc_judge.txt + judge_endpoint_max_concurrency: 64 + judge_system_message: null + judge_equal_label: CORRECT + 
judge_not_equal_label: INCORRECT + check_twice_swap: false + reward_if_swap_fails: 0.0 + question_extract_regex: ^QUESTION:\s*(.*)$ + response_extract_regex: null + domain: knowledge + verified: false +lc_judge_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: lc_judge + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: example + type: example + license: TBD + jsonl_fpath: resources_servers/equivalence_llm_judge/data/example.jsonl diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt b/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt new file mode 100644 index 00000000..7a7de001 --- /dev/null +++ b/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt @@ -0,0 +1,58 @@ + +===== System role ===== +You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence. + +Grading priorities (in order): +1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations). +2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD. + +Rules: +- Treat GOLD as authoritative for what counts as correct. +- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set. +- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities). +- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent. +- Be concise. Do NOT reveal or rewrite the GOLD. + +Show your reason why they are equivalent or not equivalent first and then provide the output. 
+ +Output (at the end after double newlines): +- If equivalent: [[A=B]] they are equivalent +- If not equivalent: [[A!=B]] they are not equivalent + +===== Example 1 (equivalent) ===== +QUESTION: +State Avogadro’s constant (include units). + +GOLD: +6.022 × 10^23 mol^-1 + +CANDIDATE: +6.022e23 per mole. + +The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers. + +[[A=B]] they are equivalent + +===== Example 2 (not equivalent) ===== +QUESTION: +State the first law of thermodynamics for a closed system and identify what each symbol represents. + +GOLD: +ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. + +CANDIDATE: +ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. + +The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match. + +[[A!=B]] they are not equivalent + +===== Inputs ===== +QUESTION: +{question} + +GOLD: +{expected_answer} + +CANDIDATE: +{generated_answer} \ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt b/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt new file mode 120000 index 00000000..b200b884 --- /dev/null +++ b/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt @@ -0,0 +1 @@ +lc_judge.txt \ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt b/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt new file mode 100644 index 00000000..bb44669d --- /dev/null +++ b/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt @@ -0,0 +1,9 @@ + +Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. +For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER. 
+ +The question, for reference only: {question} +The OFFICIAL ANSWER: {expected_answer} +CANDIDATE ANSWER TO ASSESS: {generated_answer} + +Reply only with CORRECT or INCORRECT. \ No newline at end of file