Skip to content

Commit e7a4410

Browse files
committed
cleaning up toxicity visibility in telemetry (scores, response_feedback). updating residual error score calculation to have no penalty for adhered instructions (follow probability >= 0.5) as per Alex's recommendation. also updated the description of a test case
1 parent 904320e commit e7a4410

File tree

4 files changed

+29
-30
lines changed

4 files changed

+29
-30
lines changed

aimon/reprompting_api/pipeline.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from aimon.reprompting_api.config import RepromptingConfig, StopReasons
22
from aimon.reprompting_api.telemetry import TelemetryLogger
33
from aimon.reprompting_api.reprompter import Reprompter
4-
from aimon.reprompting_api.utils import toxicity_check, get_failed_instructions_count, get_failed_instructions, get_residual_error_score
4+
from aimon.reprompting_api.utils import toxicity_check, get_failed_instructions_count, get_failed_instructions, get_residual_error_score, get_failed_toxicity_instructions
55
from aimon import Detect
66
import time
77
import random
@@ -326,9 +326,10 @@ def get_response_feedback(self, result):
326326
"""
327327
scores = {
328328
"groundedness": result.detect_response.groundedness.get("score", 0.0),
329-
"instruction_adherence": result.detect_response.instruction_adherence.get("score", 0.0)
329+
"instruction_adherence": result.detect_response.instruction_adherence.get("score", 0.0),
330+
"toxicity": result.detect_response.toxicity.get("score", 0.0)
330331
}
331-
feedback = get_failed_instructions(result)
332+
feedback = get_failed_instructions(result) + get_failed_toxicity_instructions(result)
332333
return scores, feedback
333334

334335
def _build_corrective_prompt(self, payload, result):

aimon/reprompting_api/telemetry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def emit(
5151
"residual_error": residual_error,
5252
"failed_instructions_count": failed_instructions_count,
5353
"stop_reason": stop_reason,
54-
"promp_template": prompt,
54+
"prompt_template": prompt,
5555
"response_text": response_text,
5656
}
5757
self.memory_store.append(telemetry)

aimon/reprompting_api/tests/test_reprompting_cases.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def config_iteration_limit():
8888
return_telemetry=True,
8989
return_aimon_summary=True,
9090
application_name="api_test",
91-
max_iterations=-1,
91+
max_iterations=1,
9292
)
9393

9494
# --- Helper to print results nicely ---
@@ -132,7 +132,7 @@ def test_latency_limit(my_llm, config_high_latency):
132132

133133
@pytest.mark.integration
134134
def test_iteration_limit(my_llm, config_iteration_limit):
135-
"""Test behavior when max_iterations is unlimited (-1)."""
135+
"""Test behavior when max_iterations is 1."""
136136
result = run_reprompting_pipeline(
137137
user_query="Our systems are showing vulnerability alerts but we can't find the patch file in the vendor portal. What should we do?",
138138
system_prompt="Keep the tone professionally neutral by avoiding emotionally charged words, exclamations, or informal phrases (e.g., awesome, ugh, or emojis)",
@@ -141,7 +141,7 @@ def test_iteration_limit(my_llm, config_iteration_limit):
141141
reprompting_config=config_iteration_limit,
142142
user_instructions=["do not use the letter e","only use the letter e"]
143143
)
144-
print_result("Iteration Limit Test (-1 = unlimited)", result)
144+
print_result("Iteration Limit Test (no re-prompting, only 1 iteration allowed)", result)
145145
assert "best_response" in result
146146

147147
@pytest.mark.integration

aimon/reprompting_api/utils.py

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -133,41 +133,39 @@ def get_residual_error_score(result):
133133
Compute a normalized residual error score (0–1) based on:
134134
- Groundedness follow probabilities
135135
- Instruction adherence follow probabilities
136-
- Toxicity failures (adds a strong penalty)
136+
- Toxicity (inverted: 1 - follow_probability)
137137
138138
Logic:
139-
1. Compute a base penalty using groundedness & adherence:
140-
- Each instruction's penalty = (1 - p), doubled if p < 0.5.
141-
- Average across all instructions for a base score.
142-
2. Add a flat toxicity penalty (+0.3) if any toxicity failures exist.
143-
3. Clamp the final score to [0,1].
144-
145-
Args:
146-
result: AIMon detection result with `instruction_adherence`, `groundedness`, and `toxicity` sections.
147-
148-
Returns:
149-
float: Residual error score (0 = perfect, 1 = worst). The float is rounded to two decimal places.
139+
1. Collect follow probabilities for groundedness & adherence.
140+
2. For toxicity, use 1 - follow_probability (since high follow = low error).
141+
3. Compute a penalized average using the helper.
142+
4. Clamp the final score to [0,1].
150143
"""
151-
combined_probs = [
152-
item["follow_probability"]
153-
for source in ["groundedness", "instruction_adherence"]
154-
for item in getattr(result.detect_response, source, {}).get("instructions_list", [])
155-
]
156-
base_penalty = penalized_average(combined_probs) if combined_probs else 0.0
144+
combined_probs = []
157145

158-
toxicity_penalty = 0.3 if _count_toxicity_failures(result) > 0 else 0.0
146+
for source in ["groundedness", "instruction_adherence"]:
147+
combined_probs.extend([
148+
item["follow_probability"]
149+
for item in getattr(result.detect_response, source, {}).get("instructions_list", [])
150+
])
159151

160-
residual_error_score = base_penalty + toxicity_penalty
161-
residual_error_score = min(1.0, max(0.0, residual_error_score))
152+
# For toxicity, invert the follow probability
153+
combined_probs.extend([
154+
1 - item["follow_probability"]
155+
for item in getattr(result.detect_response, "toxicity", {}).get("instructions_list", [])
156+
])
162157

158+
residual_error_score = penalized_average(combined_probs) if combined_probs else 0.0
159+
residual_error_score = min(1.0, max(0.0, residual_error_score))
163160
return round(residual_error_score, 2)
164161

165162

166163
def penalized_average(probs: List[float]) -> float:
167164
"""
168165
Compute a penalized average of follow probabilities.
169166
170-
Penalizes probabilities <0.5 more heavily by doubling their penalty.
167+
Penalizes probabilities <0.5 more heavily by doubling their penalty.
168+
Probabilities >= 0.5 (passed instructions) receive no penalty
171169
172170
Args:
173171
probs (List[float]): A list of follow probabilities.
@@ -178,7 +176,7 @@ def penalized_average(probs: List[float]) -> float:
178176
penalties = []
179177
for p in probs:
180178
if p >= 0.5:
181-
penalty = 1 - p
179+
penalty = 0
182180
else:
183181
penalty = (1 - p) * 2 # heavier penalty
184182
penalties.append(penalty)

0 commit comments

Comments
 (0)