Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 31 additions & 8 deletions resources_servers/equivalence_llm_judge/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
# limitations under the License.
from __future__ import annotations

import asyncio
import re
from contextlib import nullcontext
from typing import Any, Optional

from fastapi import FastAPI
Expand Down Expand Up @@ -57,9 +59,10 @@ class LLMJudgeResourcesServerConfig(BaseResourcesServerConfig):
name: str = "equivalence_llm_judge"
judge_model_server: ModelServerRef
judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming
judge_endpoint_max_concurrency: Optional[int] = None

judge_system_message: Optional[str] = None
judge_prompt_template: str
judge_prompt_template_fpath: str = "prompt_templates/equivalence_llm_judge.txt"
judge_equal_label: str = "[[A=B]]"
judge_not_equal_label: str = "[[A!=B]]"
# Optional regex to extract the question from the last user message.
Expand Down Expand Up @@ -249,6 +252,17 @@ class LLMJudgeResourcesServer(SimpleResourcesServer):

config: LLMJudgeResourcesServerConfig

def __init__(self, *args, **kwargs):
    """Set up the judge concurrency limiter and load the judge prompt template.

    All positional and keyword arguments are forwarded unchanged to the
    parent initializer.

    Raises:
        ValueError: If ``judge_endpoint_max_concurrency`` is set but is
            smaller than 1.
        OSError: If the prompt template file cannot be opened.
        UnicodeDecodeError: If the prompt template file is not valid UTF-8.
    """
    super().__init__(*args, **kwargs)

    max_concurrency = self.config.judge_endpoint_max_concurrency
    if max_concurrency is None:
        # No limit configured: use a no-op context manager so call sites can
        # use the same ``async with`` statement either way. ``nullcontext``
        # supports ``async with`` from Python 3.10 onwards.
        self._judge_endpoint_max_concurrency = nullcontext()
    else:
        # A semaphore created with 0 permits can never be acquired, so every
        # judge request would wait forever. Reject that at start-up instead.
        if max_concurrency < 1:
            raise ValueError(f"judge_endpoint_max_concurrency must be >= 1 or None, got {max_concurrency}")
        self._judge_endpoint_max_concurrency = asyncio.Semaphore(value=max_concurrency)

    # Decode explicitly as UTF-8: the template contains non-ASCII characters,
    # and the default encoding of open() depends on the host locale.
    with open(self.config.judge_prompt_template_fpath, "r", encoding="utf-8") as f:
        self._judge_prompt_template = f.read().strip()

def setup_webserver(self) -> FastAPI:
    """Return the FastAPI application built by the parent class, unmodified."""
    return super().setup_webserver()
Expand Down Expand Up @@ -419,7 +433,7 @@ async def _generate_judge_evaluation(
not_equal_label = cfg.judge_not_equal_label

responses_create_params = cfg.judge_responses_create_params.model_copy(deep=True)
prompt_template = cfg.judge_prompt_template
prompt_template = self._judge_prompt_template
system_message = cfg.judge_system_message

user_prompt = prompt_template.format(
Expand All @@ -432,12 +446,21 @@ async def _generate_judge_evaluation(
msgs.append(NeMoGymEasyInputMessage(role="user", content=user_prompt))
responses_create_params.input = msgs

response = await self.server_client.post(
server_name=cfg.judge_model_server.name,
url_path="/v1/responses",
json=responses_create_params,
)
judge_response = NeMoGymResponse.model_validate(await response.json())
async with self._judge_endpoint_max_concurrency:
try:
response = await self.server_client.post(
server_name=cfg.judge_model_server.name,
url_path="/v1/responses",
json=responses_create_params,
)
judge_response = NeMoGymResponse.model_validate(await response.json())
except Exception as e:
print(
f"DEBUG: LLMJudgeResourcesServer: judge model server HTTP POST error: {type(e).__name__} {e}",
flush=True,
)
raise e

eval_record = JudgeEvaluation(
responses_create_params=responses_create_params,
response=judge_response,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,64 +7,9 @@ equivalence_llm_judge:
name: policy_model
judge_responses_create_params:
input: []
judge_prompt_template: |-
===== System role =====
You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence.

Grading priorities (in order):
1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations).
2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD.

Rules:
- Treat GOLD as authoritative for what counts as correct.
- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set.
- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities).
- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent.
- Be concise. Do NOT reveal or rewrite the GOLD.

Show your reason why they are equivalent or not equivalent first and then provide the output.

Output (at the end after double newlines):
- If equivalent: [[A=B]] they are equivalent
- If not equivalent: [[A!=B]] they are not equivalent

===== Example 1 (equivalent) =====
QUESTION:
State Avogadro’s constant (include units).

GOLD:
6.022 × 10^23 mol^-1

CANDIDATE:
6.022e23 per mole.

The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers.

[[A=B]] they are equivalent

===== Example 2 (not equivalent) =====
QUESTION:
State the first law of thermodynamics for a closed system and identify what each symbol represents.

GOLD:
ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

CANDIDATE:
ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match.

[[A!=B]] they are not equivalent

===== Inputs =====
QUESTION:
{question}

GOLD:
{expected_answer}

CANDIDATE:
{generated_answer}
judge_prompt_template_fpath: prompt_templates/equivalence_llm_judge.txt
# Concurrency and rate limiting configuration
judge_endpoint_max_concurrency: 64 # Max concurrent judge requests; lower this if the judge endpoint rate-limits
judge_system_message: null
judge_equal_label: "[[A=B]]"
judge_not_equal_label: "[[A!=B]]"
Expand Down
1 change: 1 addition & 0 deletions resources_servers/equivalence_llm_judge/configs/lc.yaml
35 changes: 35 additions & 0 deletions resources_servers/equivalence_llm_judge/configs/lc_judge.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Resources-server entry for an LLM judge that answers with a single verdict word.
lc_judge:
resources_servers:
equivalence_llm_judge:
entrypoint: app.py
judge_model_server:
type: responses_api_models
name: policy_model
judge_responses_create_params:
input: []
# Prompt template file; the server reads it once at start-up and strips surrounding whitespace.
judge_prompt_template_fpath: prompt_templates/lc_judge.txt
# When not null, the server creates an asyncio.Semaphore of this size and holds it around each judge request.
judge_endpoint_max_concurrency: 64
judge_system_message: null
# NOTE(review): presumably these must match the verdict words the lc_judge.txt prompt
# asks the judge to reply with (CORRECT / INCORRECT) - confirm against the template.
judge_equal_label: CORRECT
judge_not_equal_label: INCORRECT
check_twice_swap: false
reward_if_swap_fails: 0.0
question_extract_regex: ^QUESTION:\s*(.*)$
response_extract_regex: null
domain: knowledge
verified: false
# Agent entry that points at the lc_judge resources server and the policy_model model server.
lc_judge_simple_agent:
responses_api_agents:
simple_agent:
entrypoint: app.py
resources_server:
type: resources_servers
name: lc_judge
model_server:
type: responses_api_models
name: policy_model
datasets:
- name: example
type: example
# TODO: replace the TBD placeholder with the dataset's actual license.
license: TBD
jsonl_fpath: resources_servers/equivalence_llm_judge/data/example.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@

===== System role =====
You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence.

Grading priorities (in order):
1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations).
2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD.

Rules:
- Treat GOLD as authoritative for what counts as correct.
- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set.
- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities).
- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent.
- Be concise. Do NOT reveal or rewrite the GOLD.

Show your reason why they are equivalent or not equivalent first and then provide the output.

Output (at the end after double newlines):
- If equivalent: [[A=B]] they are equivalent
- If not equivalent: [[A!=B]] they are not equivalent

===== Example 1 (equivalent) =====
QUESTION:
State Avogadro’s constant (include units).

GOLD:
6.022 × 10^23 mol^-1

CANDIDATE:
6.022e23 per mole.

The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers.

[[A=B]] they are equivalent

===== Example 2 (not equivalent) =====
QUESTION:
State the first law of thermodynamics for a closed system and identify what each symbol represents.

GOLD:
ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

CANDIDATE:
ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match.

[[A!=B]] they are not equivalent

===== Inputs =====
QUESTION:
{question}

GOLD:
{expected_answer}

CANDIDATE:
{generated_answer}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT.
For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.

The question, for reference only: {question}
The OFFICIAL ANSWER: {expected_answer}
CANDIDATE ANSWER TO ASSESS: {generated_answer}

Reply only with CORRECT or INCORRECT.
Loading