Skip to content

Commit e7a4410

Browse files
committed
cleaning up toxicity visibility in telemetry (scores, response_feedback). updating residual error score calculation to have no penalty for adhered instructions (follow probability >= 0.5) as per Alex's recommendation. also updated the description of a test case
1 parent 904320e commit e7a4410

File tree

4 files changed

+29
-30
lines changed

4 files changed

+29
-30
lines changed

aimon/reprompting_api/pipeline.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from aimon.reprompting_api.config import RepromptingConfig, StopReasons
22
from aimon.reprompting_api.telemetry import TelemetryLogger
33
from aimon.reprompting_api.reprompter import Reprompter
4-
from aimon.reprompting_api.utils import toxicity_check, get_failed_instructions_count, get_failed_instructions, get_residual_error_score
4+
from aimon.reprompting_api.utils import toxicity_check, get_failed_instructions_count, get_failed_instructions, get_residual_error_score, get_failed_toxicity_instructions
55
from aimon import Detect
66
import time
77
import random
@@ -326,9 +326,10 @@ def get_response_feedback(self, result):
326326
"""
327327
scores = {
328328
"groundedness": result.detect_response.groundedness.get("score", 0.0),
329-
"instruction_adherence": result.detect_response.instruction_adherence.get("score", 0.0)
329+
"instruction_adherence": result.detect_response.instruction_adherence.get("score", 0.0),
330+
"toxicity": result.detect_response.toxicity.get("score", 0.0)
330331
}
331-
feedback = get_failed_instructions(result)
332+
feedback = get_failed_instructions(result) + get_failed_toxicity_instructions(result)
332333
return scores, feedback
333334

334335
def _build_corrective_prompt(self, payload, result):

aimon/reprompting_api/telemetry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def emit(
5151
"residual_error": residual_error,
5252
"failed_instructions_count": failed_instructions_count,
5353
"stop_reason": stop_reason,
54-
"promp_template": prompt,
54+
"prompt_template": prompt,
5555
"response_text": response_text,
5656
}
5757
self.memory_store.append(telemetry)

aimon/reprompting_api/tests/test_reprompting_cases.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def config_iteration_limit():
8888
return_telemetry=True,
8989
return_aimon_summary=True,
9090
application_name="api_test",
91-
max_iterations=-1,
91+
max_iterations=1,
9292
)
9393

9494
# --- Helper to print results nicely ---
@@ -132,7 +132,7 @@ def test_latency_limit(my_llm, config_high_latency):
132132

133133
@pytest.mark.integration
134134
def test_iteration_limit(my_llm, config_iteration_limit):
135-
"""Test behavior when max_iterations is unlimited (-1)."""
135+
"""Test behavior when max_iterations is 1."""
136136
result = run_reprompting_pipeline(
137137
user_query="Our systems are showing vulnerability alerts but we can't find the patch file in the vendor portal. What should we do?",
138138
system_prompt="Keep the tone professionally neutral by avoiding emotionally charged words, exclamations, or informal phrases (e.g., awesome, ugh, or emojis)",
@@ -141,7 +141,7 @@ def test_iteration_limit(my_llm, config_iteration_limit):
141141
reprompting_config=config_iteration_limit,
142142
user_instructions=["do not use the letter e","only use the letter e"]
143143
)
144-
print_result("Iteration Limit Test (-1 = unlimited)", result)
144+
print_result("Iteration Limit Test (no re-prompting, only 1 iteration allowed)", result)
145145
assert "best_response" in result
146146

147147
@pytest.mark.integration

aimon/reprompting_api/utils.py

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -133,41 +133,39 @@ def get_residual_error_score(result):
133133
Compute a normalized residual error score (0–1) based on:
134134
- Groundedness follow probabilities
135135
- Instruction adherence follow probabilities
136-
- Toxicity failures (adds a strong penalty)
136+
- Toxicity (inverted: 1 - follow_probability)
137137
138138
Logic:
139-
1. Compute a base penalty using groundedness & adherence:
140-
- Each instruction's penalty = (1 - p), doubled if p < 0.5.
141-
- Average across all instructions for a base score.
142-
2. Add a flat toxicity penalty (+0.3) if any toxicity failures exist.
143-
3. Clamp the final score to [0,1].
144-
145-
Args:
146-
result: AIMon detection result with `instruction_adherence`, `groundedness`, and `toxicity` sections.
147-
148-
Returns:
149-
float: Residual error score (0 = perfect, 1 = worst). The float is rounded to two decimal places.
139+
1. Collect follow probabilities for groundedness & adherence.
140+
2. For toxicity, use 1 - follow_probability (since high follow = low error).
141+
3. Compute a penalized average using the helper.
142+
4. Clamp the final score to [0,1].
150143
"""
151-
combined_probs = [
152-
item["follow_probability"]
153-
for source in ["groundedness", "instruction_adherence"]
154-
for item in getattr(result.detect_response, source, {}).get("instructions_list", [])
155-
]
156-
base_penalty = penalized_average(combined_probs) if combined_probs else 0.0
144+
combined_probs = []
157145

158-
toxicity_penalty = 0.3 if _count_toxicity_failures(result) > 0 else 0.0
146+
for source in ["groundedness", "instruction_adherence"]:
147+
combined_probs.extend([
148+
item["follow_probability"]
149+
for item in getattr(result.detect_response, source, {}).get("instructions_list", [])
150+
])
159151

160-
residual_error_score = base_penalty + toxicity_penalty
161-
residual_error_score = min(1.0, max(0.0, residual_error_score))
152+
# For toxicity, invert the follow probability
153+
combined_probs.extend([
154+
1 - item["follow_probability"]
155+
for item in getattr(result.detect_response, "toxicity", {}).get("instructions_list", [])
156+
])
162157

158+
residual_error_score = penalized_average(combined_probs) if combined_probs else 0.0
159+
residual_error_score = min(1.0, max(0.0, residual_error_score))
163160
return round(residual_error_score, 2)
164161

165162

166163
def penalized_average(probs: List[float]) -> float:
167164
"""
168165
Compute a penalized average of follow probabilities.
169166
170-
Penalizes probabilities <0.5 more heavily by doubling their penalty.
167+
Penalizes probabilities <0.5 more heavily by doubling their penalty.
168+
Probabilities >= 0.5 (passed instructions) receive no penalty
171169
172170
Args:
173171
probs (List[float]): A list of follow probabilities.
@@ -178,7 +176,7 @@ def penalized_average(probs: List[float]) -> float:
178176
penalties = []
179177
for p in probs:
180178
if p >= 0.5:
181-
penalty = 1 - p
179+
penalty = 0
182180
else:
183181
penalty = (1 - p) * 2 # heavier penalty
184182
penalties.append(penalty)

0 commit comments

Comments
 (0)