Merged
2 changes: 1 addition & 1 deletion AGENTS.md
@@ -88,7 +88,7 @@ mkdir -p .bob && ln -s ../.agents/skills .bob/skills
- Use `...` in `@generative` function bodies
- Prefer primitives over classes
- **Friendly Dependency Errors**: Wraps optional backend imports in `try/except ImportError` with a helpful message (e.g., "Please pip install mellea[hf]"). See `mellea/stdlib/session.py` for examples.
- **Backend telemetry fields**: All backends must populate `mot.usage` (dict with `prompt_tokens`, `completion_tokens`, `total_tokens`), `mot.model` (str), and `mot.provider` (str) in their `post_processing()` method. `mot.streaming` (bool) and `mot.ttfb_ms` (float | None) are set automatically in `astream()` — backends do not need to set them. Metrics are automatically recorded by `TokenMetricsPlugin` and `LatencyMetricsPlugin` — don't add manual `record_token_usage_metrics()` or `record_request_duration()` calls.
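The contract in this bullet can be sketched with a minimal mock. The `FakeBackend` class and the `raw_response` shape below are hypothetical illustrations; only the field names (`usage`, `model`, `provider`, `streaming`, `ttfb_ms`) come from the guideline itself.

```python
# Minimal sketch of the backend telemetry contract described above.
# `FakeBackend` and the `raw_response` dict shape are hypothetical.

class ModelOutputThunk:
    """Stand-in for mellea's ModelOutputThunk, reduced to telemetry fields."""

    def __init__(self) -> None:
        self.usage: dict | None = None
        self.model: str | None = None
        self.provider: str | None = None
        # `streaming` and `ttfb_ms` are set by astream(), not by backends.
        self.streaming: bool = False
        self.ttfb_ms: float | None = None


class FakeBackend:
    def post_processing(self, mot: ModelOutputThunk, raw_response: dict) -> None:
        # A backend populates exactly these three fields...
        mot.usage = {
            "prompt_tokens": raw_response["prompt_tokens"],
            "completion_tokens": raw_response["completion_tokens"],
            "total_tokens": raw_response["prompt_tokens"]
            + raw_response["completion_tokens"],
        }
        mot.model = raw_response["model"]
        mot.provider = "fake"
        # ...and does NOT call record_token_usage_metrics() here;
        # TokenMetricsPlugin picks these fields up automatically.


mot = ModelOutputThunk()
FakeBackend().post_processing(
    mot, {"prompt_tokens": 12, "completion_tokens": 30, "model": "demo-1"}
)
print(mot.usage["total_tokens"])  # 42
```

Note that the mock backend never touches `streaming` or `ttfb_ms`, matching the division of responsibility the bullet describes.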

## 6. Commits & Hooks
[Angular format](https://github.com/angular/angular/blob/main/CONTRIBUTING.md#commit): `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `release:`
81 changes: 72 additions & 9 deletions docs/docs/evaluation-and-observability/metrics.md
@@ -1,15 +1,15 @@
---
title: "Metrics"
description: "Collect token usage and latency metrics, and instrument your own code with OpenTelemetry counters, histograms, and up-down counters."
# diataxis: how-to
---

**Prerequisites:** [Telemetry](../evaluation-and-observability/telemetry)
introduces the environment variables and telemetry architecture. This page
covers metrics collection in detail.

Mellea automatically tracks token consumption and request latency across all
backends using OpenTelemetry metrics. Metrics follow the
[Gen-AI Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)
for standardized observability. The metrics API also lets you create your own
counters, histograms, and up-down counters for application-level instrumentation.
@@ -82,6 +82,57 @@ await mot.astream()
await mot.avalue()
```

## Latency histograms

Mellea tracks request duration and time-to-first-token (TTFB) automatically
after each LLM call. The `LatencyMetricsPlugin` auto-registers when
`MELLEA_METRICS_ENABLED=true` alongside `TokenMetricsPlugin`. No code changes
are required.

### Latency instruments

| Metric Name | Type | Unit | Description |
| ----------- | ---- | ---- | ----------- |
| `mellea.llm.request.duration` | Histogram | `s` | Total request duration, from call to full response |
| `mellea.llm.ttfb` | Histogram | `s` | Time to first token (streaming requests only) |

### Latency attributes

| Attribute | Description | Example Values |
| --------- | ----------- | -------------- |
| `gen_ai.provider.name` | Backend provider name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` |
| `gen_ai.request.model` | Model identifier | `gpt-4`, `llama3.2:7b`, `granite-3.1-8b-instruct` |
| `streaming` | Whether streaming mode was used (duration only) | `True`, `False` |

### Histogram buckets

Custom bucket boundaries are configured for LLM-sized latencies:

- **`mellea.llm.request.duration`**: `0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120` seconds
- **`mellea.llm.ttfb`**: `0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10` seconds
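To see which bucket a measured latency lands in, here is a small sketch using Python's `bisect`. The boundary lists are copied from above; the `bucket_label` helper is illustrative and not part of Mellea or the OpenTelemetry SDK.

```python
import bisect

# Boundaries copied from the documentation above (seconds).
DURATION_BUCKETS = [0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120]
TTFB_BUCKETS = [0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10]


def bucket_label(value_s: float, boundaries: list[float]) -> str:
    """Return the histogram bucket a value falls into, in 'le' notation."""
    # Explicit-bucket histograms count a value in the first bucket whose
    # upper boundary is >= the value; values past the last boundary go
    # into the overflow bucket.
    i = bisect.bisect_left(boundaries, value_s)
    if i == len(boundaries):
        return f"({boundaries[-1]}, +inf)"
    return f"<= {boundaries[i]}"


print(bucket_label(0.8, DURATION_BUCKETS))    # <= 1
print(bucket_label(0.07, TTFB_BUCKETS))       # <= 0.1
print(bucket_label(300.0, DURATION_BUCKETS))  # (120, +inf)
```

This is why the two instruments use different boundary lists: a healthy first token often arrives well under a second, while a full completion can legitimately take minutes.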

### Latency recording timing

- **`mellea.llm.request.duration`**: Recorded for every `generate_from_context` call,
both streaming and non-streaming.
- **`mellea.llm.ttfb`**: Recorded only for streaming requests, measuring elapsed time
from the `generate_from_context` call until the first chunk arrives.
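The TTFB mechanism above can be sketched in a few lines of plain Python. `StreamTimer` is a simplified stand-in for illustration, not Mellea's actual implementation.

```python
import datetime
import time


class StreamTimer:
    """Simplified stand-in for how a streaming call measures TTFB."""

    def __init__(self) -> None:
        # Clock starts when the generate call is made.
        self.start = datetime.datetime.now()
        self.ttfb_ms: float | None = None

    def on_chunk(self) -> None:
        # Only the FIRST chunk sets ttfb_ms; later chunks are ignored.
        if self.ttfb_ms is None:
            elapsed = datetime.datetime.now() - self.start
            self.ttfb_ms = elapsed.total_seconds() * 1000


timer = StreamTimer()
time.sleep(0.05)   # pretend the model takes ~50 ms to produce a chunk
timer.on_chunk()   # first chunk: records TTFB
first = timer.ttfb_ms
time.sleep(0.05)
timer.on_chunk()   # second chunk: no-op
assert timer.ttfb_ms == first
print(f"TTFB: {timer.ttfb_ms:.1f} ms")
```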

Access latency data directly from a `ModelOutputThunk`:

```python
from mellea import start_session
from mellea.backends import ModelOption

with start_session() as m:
result = m.instruct(
"Explain quantum entanglement briefly",
model_options={ModelOption.STREAM: True},
)
if result.streaming and result.ttfb_ms is not None:
print(f"Time to first token: {result.ttfb_ms:.1f} ms")
```

## Metrics export configuration

Mellea supports multiple metrics exporters that can be used independently or
@@ -283,13 +334,14 @@ Check if metrics are enabled:
from mellea.telemetry import is_metrics_enabled

if is_metrics_enabled():
    print("Metrics are being collected")
```

Access token usage and latency data from a `ModelOutputThunk`:

```python
from mellea import start_session
from mellea.backends import ModelOption

with start_session() as m:
result = m.instruct("Write a haiku about programming")
@@ -298,18 +350,29 @@ with start_session() as m:
print(f"Prompt tokens: {result.usage['prompt_tokens']}")
print(f"Completion tokens: {result.usage['completion_tokens']}")
print(f"Total tokens: {result.usage['total_tokens']}")

# Streaming mode also exposes time-to-first-token
streamed = m.instruct(
"Describe the solar system",
model_options={ModelOption.STREAM: True},
)
print(f"Streaming: {streamed.streaming}")
if streamed.ttfb_ms is not None:
print(f"Time to first token: {streamed.ttfb_ms:.1f} ms")
```

The `usage` field is a dictionary with three keys: `prompt_tokens`,
`completion_tokens`, and `total_tokens`. All backends populate this field
consistently. `streaming` and `ttfb_ms` are set automatically based on whether
streaming mode was used.

## Performance

- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default),
  neither `TokenMetricsPlugin` nor `LatencyMetricsPlugin` is registered and all
  instrument calls are no-ops.
- **Minimal overhead when enabled**: Counter increments and histogram recordings
  are extremely fast (~nanoseconds per operation).
- **Async export**: Metrics are batched and exported asynchronously (default:
every 60 seconds).
- **Non-blocking**: Metric recording never blocks LLM calls.
18 changes: 11 additions & 7 deletions docs/docs/evaluation-and-observability/telemetry.md
@@ -116,11 +116,14 @@ exporter configuration (Jaeger, Grafana Tempo, etc.), and debugging guidance.

## Metrics

Mellea automatically tracks token consumption and request latency across all
backends using OpenTelemetry metrics. No code changes are required — two
plugins hook into the generation pipeline and record metrics automatically:

- **`TokenMetricsPlugin`** — records `mellea.llm.tokens.input` and
  `mellea.llm.tokens.output` counters after each LLM call.
- **`LatencyMetricsPlugin`** — records `mellea.llm.request.duration` (every
  request) and `mellea.llm.ttfb` (streaming requests only) histograms.
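The plugin-hook pattern described above can be sketched generically. The `after_llm_call` hook name and the `result` dict shape below are illustrative; only the division of labor (token counters vs. latency histograms) comes from the text.

```python
# Generic sketch of a post-generation plugin hook. The hook signature
# and result shape are hypothetical; the metric names are Mellea's.

class TokenMetricsPlugin:
    def after_llm_call(self, result: dict, metrics: list) -> None:
        usage = result["usage"]
        metrics.append(("mellea.llm.tokens.input", usage["prompt_tokens"]))
        metrics.append(("mellea.llm.tokens.output", usage["completion_tokens"]))


class LatencyMetricsPlugin:
    def after_llm_call(self, result: dict, metrics: list) -> None:
        # Duration is recorded for every request...
        metrics.append(("mellea.llm.request.duration", result["duration_s"]))
        # ...but TTFB only when the request streamed and was measured.
        if result["streaming"] and result.get("ttfb_s") is not None:
            metrics.append(("mellea.llm.ttfb", result["ttfb_s"]))


plugins = [TokenMetricsPlugin(), LatencyMetricsPlugin()]
recorded: list = []
result = {
    "usage": {"prompt_tokens": 10, "completion_tokens": 20},
    "duration_s": 1.4,
    "streaming": True,
    "ttfb_s": 0.2,
}
for p in plugins:
    p.after_llm_call(result, recorded)
print(len(recorded))  # 4
```

Because both plugins read from the same completed result, user code never calls the recording functions directly.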

The metrics API also exposes `create_counter`, `create_histogram`, and
`create_up_down_counter` for instrumenting your own application code.
@@ -131,8 +134,9 @@ Mellea supports three exporters that can run simultaneously:
- **OTLP** — export to production observability platforms
- **Prometheus** — register with `prometheus_client` for scraping

See [Metrics](../evaluation-and-observability/metrics) for token usage and
latency details, backend support matrix, exporter setup, custom instruments,
and troubleshooting.

## Logging

19 changes: 15 additions & 4 deletions docs/examples/telemetry/metrics_example.py
@@ -2,7 +2,7 @@

"""Example demonstrating OpenTelemetry metrics exporters in Mellea.

This example shows how to use metrics with different exporters:
- Console: Print metrics to console for debugging
- OTLP: Export to OpenTelemetry Protocol collectors
- Prometheus: Expose HTTP endpoint for Prometheus scraping
@@ -42,6 +42,7 @@
import os

from mellea import generative, start_session
from mellea.backends import ModelOption
from mellea.stdlib.requirements import req


@@ -58,7 +59,7 @@ def translate_to_spanish(text: str) -> str:
def main():
"""Run example with metrics collection."""
print("=" * 60)
    print("Mellea Metrics Example")
print("=" * 60)

# Check if metrics are enabled
@@ -70,7 +71,7 @@ def main():
print("=" * 60)
return

    print("Metrics enabled")

# When Prometheus is enabled, start an HTTP server to expose metrics
if os.getenv("MELLEA_METRICS_PROMETHEUS", "false").lower() == "true":
@@ -118,8 +119,18 @@ def main():
response = m.chat("What is the capital of France?")
print(f"Response: {str(response)[:100]}...")

# Example 5: Streaming with latency metrics
print("\n5. Streaming request (latency metrics)...")
streamed = m.instruct(
"Name three programming languages in one sentence.",
model_options={ModelOption.STREAM: True},
)
print(f"Response: {str(streamed)[:100]}...")
if streamed.streaming and streamed.ttfb_ms is not None:
print(f" -> Time to first token: {streamed.ttfb_ms:.1f} ms")

print("\n" + "=" * 60)
    print("Example complete! Metrics recorded.")

# When Prometheus is enabled, keep the process running so the endpoint can be scraped
if os.getenv("MELLEA_METRICS_PROMETHEUS", "false").lower() == "true":
37 changes: 37 additions & 0 deletions mellea/core/base.py
@@ -307,6 +307,19 @@ def __init__(
Populated by backends. None if unavailable.
"""

self.ttfb_ms: float | None = None
"""Time to first token in milliseconds (streaming only).

Set when the first chunk is received from the backend.
None for non-streaming requests or when not measured.
"""

self.streaming: bool = False
"""Whether this generation used streaming mode.

Set from model options at the start of astream().
"""

# Used for tracking generation.
self._context: list[Component | CBlock] | None = None
self._action: Component | CBlock | None = None
@@ -329,8 +342,21 @@ def __init__(
self._on_computed: Callable[[ModelOutputThunk], Coroutine] | None = None

self._start: datetime.datetime | None = None
self._first_chunk_received: bool = False
self._generate_log: GenerateLog | None = None

def _record_ttfb(self) -> None:
        """Record time to first token (TTFB) if streaming and not yet recorded."""
if (
self.streaming
and not self._first_chunk_received
and self._start is not None
):
self.ttfb_ms = (
datetime.datetime.now() - self._start
).total_seconds() * 1000
self._first_chunk_received = True

def _copy_from(self, other: ModelOutputThunk) -> None:
"""Copy computed-output fields from *other* into *self*.

@@ -346,6 +372,8 @@ def _copy_from(self, other: ModelOutputThunk) -> None:
self.usage = other.usage
self.model = other.model
self.provider = other.provider
self.ttfb_ms = other.ttfb_ms
self.streaming = other.streaming
self._generate_log = other._generate_log

def is_computed(self) -> bool:
@@ -419,6 +447,9 @@ async def astream(self) -> str:
)

do_set_computed = False
# Use string directly to avoid importing ModelOption from backends into core (circular import).
# ModelOption.STREAM is defined in mellea/backends/model_options.py.
self.streaming = bool((self._model_options or {}).get("@@@stream@@@", False))

if not self._generate_type == GenerateType.ASYNC:
raise RuntimeError(
@@ -435,6 +466,7 @@
try:
item = self._async_queue.get_nowait()
chunks.append(item)
self._record_ttfb()
except asyncio.QueueEmpty:
# We've exhausted the current items in the queue.
break
Expand All @@ -450,6 +482,7 @@ async def astream(self) -> str:

item = await self._async_queue.get()
chunks.append(item)
self._record_ttfb()

# Process the sentinel value if it's there.
if chunks[-1] is None:
@@ -562,6 +595,8 @@ def __copy__(self) -> ModelOutputThunk:
copied.usage = self.usage
copied.model = self.model
copied.provider = self.provider
copied.ttfb_ms = self.ttfb_ms
copied.streaming = self.streaming
return copied

def __deepcopy__(self, memo: dict) -> ModelOutputThunk:
@@ -594,6 +629,8 @@ def __deepcopy__(self, memo: dict) -> ModelOutputThunk:
deepcopied.usage = deepcopy(self.usage) if self.usage else None
deepcopied.model = self.model
deepcopied.provider = self.provider
deepcopied.ttfb_ms = self.ttfb_ms
deepcopied.streaming = self.streaming
return deepcopied


4 changes: 4 additions & 0 deletions mellea/telemetry/__init__.py
@@ -67,7 +67,9 @@ def my_function():
create_histogram,
create_up_down_counter,
is_metrics_enabled,
record_request_duration,
record_token_usage_metrics,
record_ttfb,
)
from .tracing import (
end_backend_span,
@@ -89,7 +91,9 @@
"is_application_tracing_enabled",
"is_backend_tracing_enabled",
"is_metrics_enabled",
"record_request_duration",
"record_token_usage_metrics",
"record_ttfb",
"set_span_attribute",
"set_span_error",
"start_backend_span",