fix: registration of best performing code (#127)

marcellodebernardi · web-flow · commit 4cb81f5d798b · 2025-05-28T14:33:11.000-07:00
* fix: manager sometimes passes wrong code id to mlops agent

* feat: add tool for retrieving model performances

* chore: bump to 0.23.4
diff --git a/plexe/agents/agents.py b/plexe/agents/agents.py
@@ -27,11 +27,12 @@
 from plexe.internal.models.entities.metric import MetricComparator, ComparisonMethod
 from plexe.core.interfaces.predictor import Predictor
 from plexe.tools.datasets import create_input_sample, get_latest_datasets
-from plexe.tools.evaluation import get_review_finalised_model
+from plexe.tools.evaluation import get_review_finalised_model, get_model_performances
 from plexe.tools.metrics import get_select_target_metric
 from plexe.tools.response_formatting import (
     format_final_orchestrator_agent_response,
 )
+from plexe.tools.training import register_best_training_code
 
 logger = logging.getLogger(__name__)
 
@@ -167,6 +168,8 @@ def __init__(
                 get_review_finalised_model(self.tool_model_id),
                 create_input_sample,
                 get_latest_datasets,
+                get_model_performances,
+                register_best_training_code,
                 format_final_orchestrator_agent_response,
             ],
             managed_agents=[
diff --git a/plexe/agents/model_packager.py b/plexe/agents/model_packager.py
@@ -53,10 +53,8 @@ def __init__(
             name="MLOperationsEngineer",
             description=(
                 "Expert ML operations engineer that analyzes training code and creates high-quality production-ready "
-                "inference code for ML models. To work effectively, as part of the 'task' prompt the agent STRICTLY requires:"
-                "- input schema for the model"
-                "- output schema for the model"
-                "- the 'training code id' of the training code produced by the MLEngineer agent"
+                "inference code for ML models. This agent STRICTLY requires the training code of the best model to have "
+                "been registered in the object registry."
             ),
             model=LiteLLMModel(model_id=model_id),
             tools=[
diff --git a/plexe/internal/models/entities/code.py b/plexe/internal/models/entities/code.py
@@ -10,3 +10,4 @@ class Code:
     """Represents a code object."""
 
     code: str = field()
+    performance: float = field(default=None)
diff --git a/plexe/templates/prompts/agent/agent_manager_prompt.jinja b/plexe/templates/prompts/agent/agent_manager_prompt.jinja
@@ -59,6 +59,10 @@ Ensure the output maximizes model performance while adhering to all constraints.
   exactly 0 is almost certainly a bugged model or a sign of overfitting, so this should be ignored.
 - 'MLEngineer' should only be asked to work on implementing ONE plan at a time.
 - 'MLOperationsEngineer' only needs to work on the final, best performing model.
+- Use the 'get_model_performances' tool to remind yourself of the performance of the models
+  produced by 'MLEngineer' before deciding which one is the best.
+- Use the 'register_best_training_code' tool to make the training code of the best performing model available for
+  subsequent instructions.
 - 'MLEngineer' and 'MLOperationsEngineer' return IDs that identify the code they produce. Use these IDs to refer to the
   code they produce in any subsequent instructions.
 - 'ModelTester' should only be called once the model has been completed and is ready for testing.
diff --git a/plexe/templates/prompts/agent/mlops_prompt_templates.yaml b/plexe/templates/prompts/agent/mlops_prompt_templates.yaml
@@ -15,7 +15,7 @@ managed_agent:
     
     ## Process
     1. First, gather all necessary context:
-       - Use `get_inference_context` tool with the training_code_id
+       - Use `get_inference_context` tool to get the training code, schemas, and other relevant information.
        - Use `get_feature_transformer_code` to check for feature transformations, if required
     
     2. Analyze the context to understand:
@@ -43,10 +43,6 @@ managed_agent:
     
     6. Once validation succeeds, use the `format_final_mlops_agent_response` tool with the inference_code_id.
     
-    ## Information Required
-    To complete this task, you need:
-    - The 'training_code_id' from the MLEngineer agent (must be provided in your task)
-    
     ## Available Tools
     - get_inference_context: Retrieve training code, schemas, interface definitions, and other context
     - validate_inference_code: Validate your generated inference code
diff --git a/plexe/tools/context.py b/plexe/tools/context.py
@@ -21,25 +21,22 @@ def get_inference_context_tool(llm_to_use: str) -> Callable:
     """Returns a tool function to get inference context with the model ID pre-filled."""
 
     @tool
-    def get_inference_context(training_code_id: str) -> Dict[str, Any]:
+    def get_inference_context() -> Dict[str, Any]:
         """
         Provides comprehensive context needed for generating inference code. Use this tool to retrieve
         a summary of the training code, schemas, expected inputs for the purpose of planning the inference
         code.
 
-        Args:
-            training_code_id: The ID of the code that was used to train the model
-
         Returns:
             A dictionary containing all context needed for inference code generation
         """
         object_registry = ObjectRegistry()
 
         # Retrieve training code
         try:
-            training_code = object_registry.get(Code, training_code_id).code
+            training_code = object_registry.get(Code, "best_performing_training_code").code
         except Exception as e:
-            raise ValueError(f"Training code with ID {training_code_id} not found: {str(e)}")
+            raise ValueError(f"Training code with ID 'best_performing_training_code' not found: {str(e)}")
 
         # Retrieve schemas
         try:
diff --git a/plexe/tools/evaluation.py b/plexe/tools/evaluation.py
@@ -61,3 +61,30 @@ def review_finalised_model(
         )
 
     return review_finalised_model
+
+
+@tool
+def get_model_performances() -> Dict[str, float]:
+    """
+    Returns the performance of all successfully trained models so far. The performances are returned as a dictionary
+    mapping the 'model training ID' to the performance score. Use this function to remind yourself of the performance
+    of all models, so that you can do things such as select the best performing model for deployment.
+
+    Returns:
+        A dictionary mapping model IDs to their performance scores with structure:
+        {
+            "model_training_id_1": performance_score_1,
+            "model_training_id_2": performance_score_2,
+        }
+    """
+    from plexe.core.object_registry import ObjectRegistry
+
+    registry = ObjectRegistry()
+    # Lazily pair every registered Code ID with its stored performance (one lookup per ID, in listing order)
+    scored = (
+        (entry_id, registry.get(Code, entry_id).performance)
+        for entry_id in registry.list_by_type(Code)
+    )
+
+    # Skip entries whose performance is None
+    return {entry_id: score for entry_id, score in scored if score is not None}
diff --git a/plexe/tools/execution.py b/plexe/tools/execution.py
@@ -154,7 +154,7 @@ def execute_training_code(
             artifact_paths = node.model_artifacts if node.model_artifacts else []
             artifacts = [Artifact.from_path(p) for p in artifact_paths]
             object_registry.register_multiple(Artifact, {a.name: a for a in artifacts})
-            object_registry.register(Code, execution_id, Code(node.training_code))
+            object_registry.register(Code, execution_id, Code(node.training_code, node.performance.value))
 
             # Return results
             return {
diff --git a/plexe/tools/training.py b/plexe/tools/training.py
@@ -8,12 +8,42 @@
 
 from smolagents import tool
 
+from plexe.core.object_registry import ObjectRegistry
 from plexe.internal.common.provider import Provider
+from plexe.internal.models.entities.code import Code
 from plexe.internal.models.generation.training import TrainingCodeGenerator
 
 logger = logging.getLogger(__name__)
 
 
+@tool
+def register_best_training_code(best_training_code_id: str) -> str:
+    """
+    Register the identifier returned by the MLEngineer for the solution with the best performance in the object
+    registry. This step is required in order for the training code to be available for future use.
+
+    Args:
+        best_training_code_id: 'training_code_id' of the best performing model
+
+    Returns:
+        Success message confirming registration
+
+    Raises:
+        RuntimeError: If the code cannot be retrieved or registered; the underlying error is chained as the cause
+    """
+    object_registry = ObjectRegistry()
+    try:
+        # Register the training code under a fixed ID so that it can be looked up without knowing the original ID
+        code_id = "best_performing_training_code"
+        code = object_registry.get(Code, best_training_code_id).code
+        object_registry.register(Code, code_id, Code(code), overwrite=True, immutable=True)
+        logger.debug(f"✅ Registered model training code with ID '{code_id}'")
+        return f"Successfully registered model training code with ID '{code_id}' for the best performing model."
+    except Exception as e:
+        logger.warning(f"⚠️ Error registering training code: {str(e)}")
+        raise RuntimeError(f"Failed to register training code: {str(e)}") from e
+
+
 def get_training_code_generation_tool(llm_to_use: str) -> Callable:
     """Returns a tool function to generate training code with the model ID pre-filled."""
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "plexe"
-version = "0.23.3"
+version = "0.23.4"
 description = "An agentic framework for building ML models from natural language"
 authors = [
     "marcellodebernardi <marcello.debernardi@outlook.com>",

Original file line number	Diff line number	Diff line change
`@@ -10,3 +10,4 @@ class Code:`
`10`	`10`	`"""Represents a code object."""`
`11`	`11`
`12`	`12`	`code: str = field()`
	`13`	`+ performance: float = field(default=None)`