plexe-ai
diff --git a/CLAUDE.md b/CLAUDE.md
Lines changed: 1 addition & 1 deletion
diff --git a/examples/house_prices.py b/examples/house_prices.py
Lines changed: 76 additions & 0 deletions
diff --git a/examples/santander_customer_transactions.py b/examples/santander_transactions.py
examples/santander_customer_transactions.py renamed to examples/santander_transactions.py
Lines changed: 6 additions & 4 deletions
diff --git a/examples/spaceship_titanic.py b/examples/spaceship_titanic.py
Lines changed: 5 additions & 4 deletions
diff --git a/plexe/config.py b/plexe/config.py
Lines changed: 2 additions & 73 deletions
diff --git a/plexe/internal/agents.py b/plexe/internal/agents.py
Lines changed: 20 additions & 22 deletions
@@ -28,7 +28,7 @@
 - **Linting**: Ruff with E203/E501/E402 ignored
 - **Typing**: Use type hints and Pydantic models
 - **Naming**: snake_case (functions/vars), CamelCase (classes)
-- **Imports**: Group stdlib, third-party, then local imports; NO LOCAL IMPORTS, always import at the top of the file
+- **Imports**: Group stdlib, third-party, then local imports; NO IMPORTS INSIDE FUNCTIONS, always import at the top of the file
 - **__init__.py**: No code in __init__.py files except in plexe/__init__.py for convenience
 - **Docstrings**: Required for public modules/classes/functions; Sphinx style without type hints
 - **Testing**: Write pytest tests for all new functionality
 
@@ -0,0 +1,76 @@
+"""
+This script demonstrates how to run the plexe ML engineering agent to build a predictive model. The example
+uses the Kaggle 'House Prices - Advanced Regression Techniques' competition's training dataset.
+
+The dataset is owned and hosted by Kaggle, and is available for download at
+https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data under the MIT license
+(https://www.mit.edu/~amini/LICENSE.md). This dataset is not part of the plexe package or in any way
+affiliated with it, and Plexe AI claims no rights over it. The dataset is used here for demonstration purposes
+only. Please refer to the Kaggle competition page for more details on the dataset and its usage.
+
+Citation:
+Anna Montoya and DataCanary. House Prices - Advanced Regression Techniques.
+https://kaggle.com/competitions/house-prices-advanced-regression-techniques, 2016. Kaggle.
+"""
+
+# NOTE: you must download the dataset from Kaggle for this example to work
+
+from datetime import datetime
+import pandas as pd
+
+import plexe
+from plexe.internal.common.provider import ProviderConfig
+
+
+# Step 1: Define the model
+# Note: for conciseness we leave the input schema empty and let plexe infer it
+model = plexe.Model(
+    intent=(
+        "With 79 explanatory variables describing aspects of residential homes in Ames, Iowa, predict "
+        "the final price of each home. Use only linear regression and decision tree models, no ensembling. "
+        "The models must be extremely simple and quickly trainable on extremely constrained hardware."
+    ),
+    output_schema={
+        "SalePrice": float,
+    },
+)
+
+# Step 2: Build the model using the training dataset
+# 2A [OPTIONAL]: Define MLFlow callback for tracking
+mlflow_callback = plexe.callbacks.MLFlowCallback(
+    tracking_uri="http://127.0.0.1:8080",
+    experiment_name=f"house-prices-{datetime.now().strftime('%Y%m%d-%H%M%S')  }",
+)
+# 2B: Build the model with the dataset
+# NOTE: In order to run this example, you will need to download the dataset from Kaggle
+model.build(
+    datasets=[pd.read_csv("examples/datasets/house-prices-train.csv")],
+    provider=ProviderConfig(
+        default_provider="openai/gpt-4o",
+        orchestrator_provider="anthropic/claude-3-7-sonnet-20250219",
+        research_provider="openai/gpt-4o",
+        engineer_provider="anthropic/claude-3-7-sonnet-20250219",
+        ops_provider="anthropic/claude-3-7-sonnet-20250219",
+        tool_provider="openai/gpt-4o",
+    ),
+    max_iterations=2,
+    timeout=1800,  # 30 minute timeout
+    run_timeout=180,
+    verbose=False,
+    callbacks=[mlflow_callback],
+    chain_of_thought=True,  # Enable chain of thought output
+)
+
+# Step 3: Save the model
+plexe.save_model(model, "house-prices.tar.gz")
+
+# Step 4: Run a prediction on the built model
+test_df = pd.read_csv("examples/datasets/house-prices-test.csv").sample(10)
+predictions = pd.DataFrame.from_records([model.predict(x) for x in test_df.to_dict(orient="records")])
+
+# Step 5: Print the predictions for the sampled test rows
+print(predictions)
+
+# Step 6: Print model description
+description = model.describe()
+print(description.as_text())
@@ -26,7 +26,9 @@
 model = plexe.Model(
     intent=(
         "Identify which customers will make a specific transaction in the future, irrespective of the amount "
-        "of money transacted. For each Id, make a binary prediction of the 'target' variable."
+        "of money transacted. For each Id, make a binary prediction of the 'target' variable. Use only linear "
+        "regression and decision tree models, no ensembling. The models must be extremely simple and quickly "
+        "trainable on extremely constrained hardware."
     ),
     output_schema={
         "target": int,
@@ -51,7 +53,7 @@
         ops_provider="anthropic/claude-3-7-sonnet-20250219",
         tool_provider="openai/gpt-4o",
     ),
-    max_iterations=8,
+    max_iterations=5,
     timeout=1800,  # 30 minute timeout
     run_timeout=180,
     verbose=False,
@@ -63,11 +65,11 @@
 plexe.save_model(model, "santander_transactions_model.tar.gz")
 
 # Step 4: Run a prediction on the built model
-test_df = pd.read_csv("examples/datasets/santander-transactions-test-mini.csv")
+test_df = pd.read_csv("examples/datasets/santander-transactions-test-mini.csv").sample(10)
 predictions = pd.DataFrame.from_records([model.predict(x) for x in test_df.to_dict(orient="records")])
 
 # Step 5: print a sample of predictions
-print(predictions.sample(10))
+print(predictions)
 
 # Step 6: Print model description
 description = model.describe()
 
@@ -24,7 +24,8 @@
 model = plexe.Model(
     intent=(
         "From features describing a Spaceship Titanic passenger's information, determine whether they were "
-        "transported or not."
+        "transported or not. Use only linear regression and decision tree models, no ensembling. The models "
+        "must be extremely simple and quickly trainable on extremely constrained hardware."
     ),
     input_schema={
         "PassengerId": str,
@@ -64,7 +65,7 @@
         ops_provider="anthropic/claude-3-7-sonnet-20250219",
         tool_provider="openai/gpt-4o",
     ),
-    max_iterations=4,
+    max_iterations=1,
     timeout=300,  # 5 minute timeout
     run_timeout=150,
     verbose=False,
@@ -76,11 +77,11 @@
 plexe.save_model(model, "spaceship_titanic_model.tar.gz")
 
 # Step 4: Run a prediction on the built model
-test_df = pd.read_csv("examples/datasets/spaceship-titanic-test.csv")
+test_df = pd.read_csv("examples/datasets/spaceship-titanic-test.csv").sample(10)
 predictions = pd.DataFrame.from_records([model.predict(x) for x in test_df.to_dict(orient="records")])
 
 # Step 5: print a sample of predictions
-print(predictions.sample(10))
+print(predictions)
 
 # Step 6: Print model description
 description = model.describe()
 
@@ -93,6 +93,7 @@ class _CodeGenerationConfig:
                 "typing",
                 "dataclasses",
                 "json",
+                "io",
                 "time",
                 "datetime",
                 "os",
@@ -109,6 +110,7 @@ class _CodeGenerationConfig:
                 "logging",
                 "importlib",
                 "types",
+                "plexe",
             ]
         )
 
@@ -252,79 +254,6 @@ def training_review(self, problem_statement, plan, training_code, problems, allo
             allowed_packages=allowed_packages,
         )
 
-    def inference_system(self) -> str:
-        return self._render("inference/system_prompt.jinja")
-
-    def inference_load(self, predictor_template, training_code) -> str:
-        return self._render(
-            "inference/load.jinja",
-            predictor_template=predictor_template,
-            training_code=training_code,
-        )
-
-    def inference_preprocess(self, inference_code, input_schema, training_code) -> str:
-        return self._render(
-            "inference/preprocess.jinja",
-            inference_code=inference_code,
-            input_schema=input_schema,
-            training_code=training_code,
-        )
-
-    def inference_postprocess(self, inference_code, output_schema, training_code) -> str:
-        return self._render(
-            "inference/postprocess.jinja",
-            inference_code=inference_code,
-            output_schema=output_schema,
-            training_code=training_code,
-        )
-
-    def inference_predict(self, output_schema, input_schema, training_code, inference_code) -> str:
-        return self._render(
-            "inference/predict.jinja",
-            output_schema=output_schema,
-            input_schema=input_schema,
-            training_code=training_code,
-            inference_code=inference_code,
-        )
-
-    def inference_combine(self, inference_code, predictor_interface_source) -> str:
-        return self._render(
-            "inference/combine.jinja",
-            inference_code=inference_code,
-            predictor_interface_source=predictor_interface_source,
-        )
-
-    def inference_fix(self, predictor_interface_source, predictor_template, inference_code, review, problems) -> str:
-        return self._render(
-            "inference/fix.jinja",
-            predictor_interface_source=predictor_interface_source,
-            predictor_template=predictor_template,
-            inference_code=inference_code,
-            review=review,
-            problems=problems,
-        )
-
-    def inference_review(
-        self,
-        predictor_interface_source,
-        predictor_template,
-        inference_code,
-        input_schema,
-        output_schema,
-        training_code,
-        problems,
-    ) -> str:
-        return self._render(
-            "inference/review.jinja",
-            predictor_interface_source=predictor_interface_source,
-            predictor_template=predictor_template,
-            inference_code=inference_code,
-            input_schema=input_schema,
-            output_schema=output_schema,
-            training_code=training_code,
-            problems=problems,
-        )
-
     def review_system(self) -> str:
         return self._render("review/system_prompt.jinja")
 
 
@@ -17,22 +17,21 @@
 from plexe.internal.models.entities.metric import Metric
 from plexe.internal.models.entities.metric import MetricComparator, ComparisonMethod
 from plexe.internal.models.interfaces.predictor import Predictor
-from plexe.internal.models.tools.code_generation import (
-    generate_inference_code,
-    fix_inference_code,
-    generate_training_code,
-    fix_training_code,
+from plexe.internal.models.tools.training import (
+    get_generate_training_code,
+    get_fix_training_code,
 )
+from plexe.internal.models.tools.evaluation import get_review_finalised_model
+from plexe.internal.models.tools.metrics import get_select_target_metric
 from plexe.internal.models.tools.datasets import split_datasets, create_input_sample
-from plexe.internal.models.tools.evaluation import review_finalised_model
 from plexe.internal.models.tools.execution import get_executor_tool
-from plexe.internal.models.tools.metrics import select_target_metric
 from plexe.internal.models.tools.response_formatting import (
     format_final_orchestrator_agent_response,
     format_final_mle_agent_response,
     format_final_mlops_agent_response,
 )
-from plexe.internal.models.tools.validation import validate_inference_code, validate_training_code
+from plexe.internal.models.tools.context import get_inference_context_tool
+from plexe.internal.models.tools.validation import validate_training_code, validate_inference_code
 
 logger = logging.getLogger(__name__)
 
@@ -62,6 +61,7 @@ def __init__(
         ml_researcher_model_id: str = "openai/gpt-4o",
         ml_engineer_model_id: str = "anthropic/claude-3-7-sonnet-20250219",
         ml_ops_engineer_model_id: str = "anthropic/claude-3-7-sonnet-20250219",
+        tool_model_id: str = "openai/gpt-4o",
         verbose: bool = False,
         max_steps: int = 30,
         distributed: bool = False,
@@ -75,6 +75,7 @@ def __init__(
             ml_researcher_model_id: Model ID for the ML researcher agent
             ml_engineer_model_id: Model ID for the ML engineer agent
             ml_ops_engineer_model_id: Model ID for the ML ops engineer agent
+            tool_model_id: Model ID for the model used inside tool calls
             verbose: Whether to display detailed agent logs
             max_steps: Maximum number of steps for the orchestrator agent
             distributed: Whether to run the agents in a distributed environment
@@ -84,6 +85,7 @@ def __init__(
         self.ml_researcher_model_id = ml_researcher_model_id
         self.ml_engineer_model_id = ml_engineer_model_id
         self.ml_ops_engineer_model_id = ml_ops_engineer_model_id
+        self.tool_model_id = tool_model_id
         self.verbose = verbose
         self.max_steps = max_steps
         self.distributed = distributed
@@ -103,7 +105,6 @@ def __init__(
                 "- input schema for the model"
                 "- output schema for the model"
                 "- the name and comparison method of the metric to optimise"
-                "- the identifier of the LLM that should be used for plan generation"
             ),
             model=LiteLLMModel(model_id=self.ml_researcher_model_id),
             tools=[],
@@ -126,13 +127,12 @@ def __init__(
                 "- the full solution plan that outlines how to solve this problem"
                 "- the split train/validation dataset names"
                 "- the working directory to use for model execution"
-                "- the identifier of the LLM that should be used for code generation"
             ),
             model=LiteLLMModel(model_id=self.ml_engineer_model_id),
             tools=[
-                generate_training_code,
+                get_generate_training_code(self.tool_model_id),
                 validate_training_code,
-                fix_training_code,
+                get_fix_training_code(self.tool_model_id),
                 get_executor_tool(distributed),
                 format_final_mle_agent_response,
             ],
@@ -143,27 +143,25 @@ def __init__(
         )
 
         # Create predictor builder agent - creates inference code
-        self.mlops_engineer = ToolCallingAgent(
+        self.mlops_engineer = CodeAgent(
             name="MLOperationsEngineer",
             description=(
-                "Expert ML operations engineer that writes inference code for ML models to be used in production. "
-                "To work effectively, as part of the 'task' prompt the agent STRICTLY requires:"
+                "Expert ML operations engineer that analyzes training code and creates high-quality production-ready "
+                "inference code for ML models. To work effectively, as part of the 'task' prompt the agent STRICTLY requires:"
                 "- input schema for the model"
                 "- output schema for the model"
                 "- the 'training code id' of the training code produced by the MLEngineer agent"
-                "- the identifier of the LLM that should be used for code generation"
             ),
             model=LiteLLMModel(model_id=self.ml_ops_engineer_model_id),
             tools=[
-                split_datasets,
-                generate_inference_code,
+                get_inference_context_tool(self.tool_model_id),
                 validate_inference_code,
-                fix_inference_code,
                 format_final_mlops_agent_response,
             ],
             add_base_tools=False,
             verbosity_level=self.specialist_verbosity,
-            prompt_templates=get_prompt_templates("toolcalling_agent.yaml", "mlops_prompt_templates.yaml"),
+            additional_authorized_imports=config.code_generation.authorized_agent_imports,
+            prompt_templates=get_prompt_templates("code_agent.yaml", "mlops_prompt_templates.yaml"),
             planning_interval=8,
             step_callbacks=[self.chain_of_thought_callable],
         )
@@ -173,8 +171,8 @@ def __init__(
             name="Orchestrator",
             model=LiteLLMModel(model_id=self.orchestrator_model_id),
             tools=[
-                select_target_metric,
-                review_finalised_model,
+                get_select_target_metric(self.tool_model_id),
+                get_review_finalised_model(self.tool_model_id),
                 split_datasets,
                 create_input_sample,
                 format_final_orchestrator_agent_response,