feature: data preview tool (#114)

marcellodebernardi · web-flow · commit 70696bf61680 · 2025-05-12T22:07:57.000-07:00
* fix: simplify santander example

* refactor: remove 'llm_to_use' from tool signature

* chore: yell at claude about imports

* feat: initial implementation of more agentic predictor production

* feat: extract input sample as dict

* feat: simplify inference generation tools

* chore: bump to 0.18.3

* fix: remove unused agent inputs

* fix: include prompt templates in dumpcode.py

* feat: move predictor generation from tools to agent

* fix: register schemas

* fix: remove unused inference prompts

* fix: allow plexe imports for mlops engineer

* feat: extract artifacts in inference context

* feat: add house prices example

* fix: artifact list extraction defined incorrectly

* fix: incorrect sampling in examples

* fix: add io and plexe to allowed imports

* fix: setting llm for extraction incorrectly

* fix: get schemas from registry at inference validation

* fix: artifact extraction can fail silently

* fix: extra space in house prices example

* fix: only one integration test per module

* feat: add data preview tool for agents

* chore: bump to 0.18.4
diff --git a/.gitignore b/.gitignore
@@ -193,3 +193,5 @@ plexe-full-codebase.txt
 # Example datasets
 examples/datasets/
 examples/datasets/*
+
+**/.claude/settings.local.json
diff --git a/plexe/internal/agents.py b/plexe/internal/agents.py
@@ -23,7 +23,7 @@
 )
 from plexe.internal.models.tools.evaluation import get_review_finalised_model
 from plexe.internal.models.tools.metrics import get_select_target_metric
-from plexe.internal.models.tools.datasets import split_datasets, create_input_sample
+from plexe.internal.models.tools.datasets import split_datasets, create_input_sample, get_dataset_preview
 from plexe.internal.models.tools.execution import get_executor_tool
 from plexe.internal.models.tools.response_formatting import (
     format_final_orchestrator_agent_response,
@@ -107,7 +107,7 @@ def __init__(
                 "- the name and comparison method of the metric to optimise"
             ),
             model=LiteLLMModel(model_id=self.ml_researcher_model_id),
-            tools=[],
+            tools=[get_dataset_preview],
             add_base_tools=False,
             verbosity_level=self.specialist_verbosity,
             prompt_templates=get_prompt_templates("toolcalling_agent.yaml", "mls_prompt_templates.yaml"),
@@ -134,6 +134,7 @@ def __init__(
                 validate_training_code,
                 get_fix_training_code(self.tool_model_id),
                 get_executor_tool(distributed),
+                get_dataset_preview,
                 format_final_mle_agent_response,
             ],
             add_base_tools=False,
@@ -175,6 +176,7 @@ def __init__(
                 get_review_finalised_model(self.tool_model_id),
                 split_datasets,
                 create_input_sample,
+                get_dataset_preview,
                 format_final_orchestrator_agent_response,
             ],
             managed_agents=[self.ml_research_agent, self.mle_agent, self.mlops_engineer],
diff --git a/plexe/internal/models/tools/datasets.py b/plexe/internal/models/tools/datasets.py
@@ -3,11 +3,13 @@
 
 These tools help with dataset operations within the model generation pipeline, including
 splitting datasets into training, validation, and test sets, registering datasets with
-the dataset registry, and creating sample data for validation.
+the dataset registry, creating sample data for validation, and previewing dataset content.
 """
 
 import logging
-from typing import Dict, List
+from typing import Dict, List, Any
+
+import numpy as np
 import pandas as pd
 from smolagents import tool
 
@@ -123,3 +125,66 @@ def create_input_sample(train_dataset_names: List[str], input_schema_fields: Lis
     except Exception as e:
         logger.warning(f"⚠️ Error creating input sample for validation: {str(e)}")
         return False
+
+
+@tool
+def get_dataset_preview(dataset_name: str) -> Dict[str, Any]:
+    """
+    Generate a concise preview of a dataset with statistical information to help agents understand the data.
+
+    Args:
+        dataset_name: Name of the dataset to preview
+
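+    Returns:
+        Dictionary containing dataset information:
+        - dataset_name, shape, columns, dtypes: name, row/column counts, column names and per-column data types
+        - summary_stats: mean, std, min, 25%, median, 75% and max per numeric column (absent if none are numeric)
+        - missing_values: count of missing values, listed only for columns that have at least one
+        - sample_rows: the first 5 rows of the data, as a list of records
+        - on failure: a dictionary holding only 'error' and 'dataset_name'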
+    """
+    object_registry = ObjectRegistry()
+
+    try:
+        # Get dataset from registry
+        dataset = object_registry.get(TabularConvertible, dataset_name)
+        df = dataset.to_pandas()
+
+        # Basic shape, column names, data types and the first few rows
+        result = {
+            "dataset_name": dataset_name,
+            "shape": {"rows": df.shape[0], "columns": df.shape[1]},
+            "columns": list(df.columns),
+            "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
+            "sample_rows": df.head(5).to_dict(orient="records"),
+        }
+
+        # Basic statistics
+        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
+        if numeric_cols:
+            stats = df[numeric_cols].describe().to_dict()
+            result["summary_stats"] = {
+                col: {
+                    "mean": stats[col].get("mean"),
+                    "std": stats[col].get("std"),
+                    "min": stats[col].get("min"),
+                    "25%": stats[col].get("25%"),
+                    "median": stats[col].get("50%"),
+                    "75%": stats[col].get("75%"),
+                    "max": stats[col].get("max"),
+                }
+                for col in numeric_cols
+            }
+
+        # Missing values (columns without any missing values are left out)
+        missing_counts = df.isnull().sum().to_dict()
+        result["missing_values"] = {col: count for col, count in missing_counts.items() if count > 0}
+
+        return result
+
+    except Exception as e:
+        logger.warning(f"⚠️ Error creating dataset preview: {str(e)}")
+        return {
+            "error": f"Failed to generate preview for dataset '{dataset_name}': {str(e)}",
+            "dataset_name": dataset_name,
+        }
diff --git a/plexe/templates/prompts/agent/mle_prompt_templates.yaml b/plexe/templates/prompts/agent/mle_prompt_templates.yaml
@@ -20,7 +20,8 @@ managed_agent:
     - The identifier of the LLM to use for code generation.
     
     If the information above was not provided, you should reject the task and request your manager to provide the
-    required information.
+    required information. You can also use the 'get_dataset_preview' tool to get a better understanding of the data
+    in case it helps.
     
     ## Instructions for You
     If you have the required information: generate Python machine learning training code to train a model that solves 
diff --git a/plexe/templates/prompts/agent/mls_prompt_templates.yaml b/plexe/templates/prompts/agent/mls_prompt_templates.yaml
@@ -14,6 +14,10 @@ managed_agent:
     or the LLM to use for plan generation, you should reject the task and ask your manager to provide the required
     information.
 
+    You can use the get_dataset_preview tool to examine the available datasets before formulating your solution plans.
+    This will help you understand the data characteristics (data types, missing values, basic statistics)
+    and propose more targeted approaches. Use the tool by providing a dataset name.
+
     The solution concepts should be explained in 3-5 sentences each. Do not include implementations of the
     solutions, though you can include small code snippets if absolutely required to explain a plan.
     Do not suggest doing EDA, ensembling, or hyperparameter tuning. The solutions should be feasible using only 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "plexe"
-version = "0.18.3"
+version = "0.18.4"
 description = "An agentic framework for building ML models from natural language"
 authors = [
     "marcellodebernardi <marcello.debernardi@outlook.com>",