NVIDIA-NeMo · fayejf · Dec 17, 2025 · Dec 17, 2025 · Dec 17, 2025 · Dec 17, 2025
diff --git a/resources_servers/equivalence_llm_judge/app.py b/resources_servers/equivalence_llm_judge/app.py
@@ -21,7 +21,9 @@
 # limitations under the License.
 from __future__ import annotations
 
+import asyncio
 import re
+from contextlib import nullcontext
 from typing import Any, Optional
 
 from fastapi import FastAPI
@@ -57,9 +59,10 @@ class LLMJudgeResourcesServerConfig(BaseResourcesServerConfig):
     name: str = "equivalence_llm_judge"
     judge_model_server: ModelServerRef
     judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming
+    judge_endpoint_max_concurrency: Optional[int] = None
 
     judge_system_message: Optional[str] = None
-    judge_prompt_template: str
+    judge_prompt_template_fpath: str = "prompt_templates/equivalence_llm_judge.txt"
     judge_equal_label: str = "[[A=B]]"
     judge_not_equal_label: str = "[[A!=B]]"
     # Optional regex to extract the question from the last user message.
@@ -249,6 +252,17 @@ class LLMJudgeResourcesServer(SimpleResourcesServer):
 
     config: LLMJudgeResourcesServerConfig
 
+    def __init__(self, *args, **kwargs):
+        """Set up the judge concurrency limiter and load the judge prompt template.
+
+        All arguments are forwarded unchanged to the parent constructor.
+
+        Raises:
+            OSError: If the prompt template file cannot be opened.
+        """
+        super().__init__(*args, **kwargs)
+
+        # Entered with ``async with`` around each judge request: a semaphore caps
+        # the number of in-flight calls, while ``nullcontext`` leaves them unlimited.
+        if self.config.judge_endpoint_max_concurrency is not None:
+            self._judge_endpoint_max_concurrency = asyncio.Semaphore(value=self.config.judge_endpoint_max_concurrency)
+        else:
+            self._judge_endpoint_max_concurrency = nullcontext()
+
+        # Decode explicitly as UTF-8: the shipped templates contain non-ASCII
+        # characters, and the default encoding of ``open`` depends on the locale.
+        # A relative path is resolved against the process working directory.
+        with open(self.config.judge_prompt_template_fpath, "r", encoding="utf-8") as f:
+            self._judge_prompt_template = f.read().strip()
+
     def setup_webserver(self) -> FastAPI:
         app = super().setup_webserver()
         return app
@@ -419,7 +433,7 @@ async def _generate_judge_evaluation(
         not_equal_label = cfg.judge_not_equal_label
 
         responses_create_params = cfg.judge_responses_create_params.model_copy(deep=True)
-        prompt_template = cfg.judge_prompt_template
+        prompt_template = self._judge_prompt_template
         system_message = cfg.judge_system_message
 
         user_prompt = prompt_template.format(
@@ -432,12 +446,21 @@ async def _generate_judge_evaluation(
         msgs.append(NeMoGymEasyInputMessage(role="user", content=user_prompt))
         responses_create_params.input = msgs
 
-        response = await self.server_client.post(
-            server_name=cfg.judge_model_server.name,
-            url_path="/v1/responses",
-            json=responses_create_params,
-        )
-        judge_response = NeMoGymResponse.model_validate(await response.json())
+        async with self._judge_endpoint_max_concurrency:
+            try:
+                response = await self.server_client.post(
+                    server_name=cfg.judge_model_server.name,
+                    url_path="/v1/responses",
+                    json=responses_create_params,
+                )
+                judge_response = NeMoGymResponse.model_validate(await response.json())
+            except Exception as e:
+                print(
+                    f"DEBUG: LLMJudgeResourcesServer: judge model server HTTP POST error: {type(e).__name__} {e}",
+                    flush=True,
+                )
+                raise e
+
         eval_record = JudgeEvaluation(
             responses_create_params=responses_create_params,
             response=judge_response,

diff --git a/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml b/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml
@@ -7,64 +7,9 @@ equivalence_llm_judge:
         name: policy_model
       judge_responses_create_params:
         input: []
-      judge_prompt_template: |-
-        ===== System role =====
-        You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence.
-
-        Grading priorities (in order):
-        1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations).
-        2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD.
-
-        Rules:
-        - Treat GOLD as authoritative for what counts as correct.
-        - If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set.
-        - For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities).
-        - Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent.
-        - Be concise. Do NOT reveal or rewrite the GOLD.
-
-        Show your reason why they are equivalent or not equivalent first and then provide the output.
-
-        Output (at the end after double newlines):
-        - If equivalent: [[A=B]] they are equivalent
-        - If not equivalent: [[A!=B]] they are not equivalent
-
-        ===== Example 1 (equivalent) =====
-        QUESTION:
-        State Avogadro’s constant (include units).
-
-        GOLD:
-        6.022 × 10^23 mol^-1
-
-        CANDIDATE:
-        6.022e23 per mole.
-
-        The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers.
-
-        [[A=B]] they are equivalent
-
-        ===== Example 2 (not equivalent) =====
-        QUESTION:
-        State the first law of thermodynamics for a closed system and identify what each symbol represents.
-
-        GOLD:
-        ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.
-
-        CANDIDATE:
-        ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.
-
-        The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match.
-
-        [[A!=B]] they are not equivalent
-
-        ===== Inputs =====
-        QUESTION:
-        {question}
-
-        GOLD:
-        {expected_answer}
-
-        CANDIDATE:
-        {generated_answer}
+      judge_prompt_template_fpath: prompt_templates/equivalence_llm_judge.txt
+      # Concurrency and rate limiting configuration
+      judge_endpoint_max_concurrency: 64 # Caps concurrent judge requests to avoid rate limits
       judge_system_message: null
       judge_equal_label: "[[A=B]]"
       judge_not_equal_label: "[[A!=B]]"

diff --git a/resources_servers/equivalence_llm_judge/configs/lc.yaml b/resources_servers/equivalence_llm_judge/configs/lc.yaml
@@ -0,0 +1 @@
+lc_judge.yaml
diff --git a/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml b/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml
@@ -0,0 +1,35 @@
+lc_judge:
+  resources_servers:
+    equivalence_llm_judge:
+      entrypoint: app.py
+      judge_model_server:
+        type: responses_api_models
+        name: policy_model
+      judge_responses_create_params:
+        input: []
+      judge_prompt_template_fpath: prompt_templates/lc_judge.txt
+      judge_endpoint_max_concurrency: 64
+      judge_system_message: null
+      judge_equal_label: CORRECT
+      judge_not_equal_label: INCORRECT
+      check_twice_swap: false
+      reward_if_swap_fails: 0.0
+      question_extract_regex: ^QUESTION:\s*(.*)$
+      response_extract_regex: null
+      domain: knowledge
+      verified: false
+lc_judge_simple_agent:
+  responses_api_agents:
+    simple_agent:
+      entrypoint: app.py
+      resources_server:
+        type: resources_servers
+        name: lc_judge
+      model_server:
+        type: responses_api_models
+        name: policy_model
+      datasets:
+      - name: example
+        type: example
+        license: TBD
+        jsonl_fpath: resources_servers/equivalence_llm_judge/data/example.jsonl
diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt b/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt
@@ -0,0 +1,58 @@
+
+===== System role =====
+You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence.
+
+Grading priorities (in order):
+1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations).
+2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD.
+
+Rules:
+- Treat GOLD as authoritative for what counts as correct.
+- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set.
+- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities).
+- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent.
+- Be concise. Do NOT reveal or rewrite the GOLD.
+
+Show your reason why they are equivalent or not equivalent first and then provide the output.
+
+Output (at the end after double newlines):
+- If equivalent: [[A=B]] they are equivalent
+- If not equivalent: [[A!=B]] they are not equivalent
+
+===== Example 1 (equivalent) =====
+QUESTION:
+State Avogadro’s constant (include units).
+
+GOLD:
+6.022 × 10^23 mol^-1
+
+CANDIDATE:
+6.022e23 per mole.
+
+The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers.
+
+[[A=B]] they are equivalent
+
+===== Example 2 (not equivalent) =====
+QUESTION:
+State the first law of thermodynamics for a closed system and identify what each symbol represents.
+
+GOLD:
+ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.
+
+CANDIDATE:
+ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.
+
+The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match.
+
+[[A!=B]] they are not equivalent
+
+===== Inputs =====
+QUESTION:
+{question}
+
+GOLD:
+{expected_answer}
+
+CANDIDATE:
+{generated_answer}
diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt b/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt
@@ -0,0 +1 @@
+lc_judge.txt
diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt b/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt
@@ -0,0 +1,9 @@
+
+Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT.
+For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.
+
+The question, for reference only: {question}
+The OFFICIAL ANSWER: {expected_answer}
+CANDIDATE ANSWER TO ASSESS: {generated_answer}
+
+Reply only with CORRECT or INCORRECT.