13 changes: 13 additions & 0 deletions livekit-agents/livekit/agents/metrics/base.py
@@ -38,6 +38,19 @@ class STTMetrics(BaseModel):
"""The duration of the pushed audio in seconds."""
streamed: bool
"""Whether the STT is streaming (e.g using websocket)."""

# NEW: Token usage fields
input_tokens: int = 0
"""Total input tokens used (audio + text tokens)."""
output_tokens: int = 0
"""Total output tokens generated."""
total_tokens: int = 0
"""Total tokens used (input + output)."""
audio_tokens: int = 0
"""Number of audio tokens in input."""
text_tokens: int = 0
"""Number of text tokens in input (e.g., from prompt)."""

metadata: Metadata | None = None


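Taken together, the new fields expose per-request token usage to any metrics consumer. A minimal sketch of reading them, assuming the usual livekit-agents import path and a metrics handler wired elsewhere (handler name hypothetical):

from livekit.agents.metrics import STTMetrics

def on_metrics_collected(metrics) -> None:
    # Only STT metrics carry the new token fields; all default to 0.
    if isinstance(metrics, STTMetrics):
        print(
            f"STT {metrics.request_id}: "
            f"{metrics.audio_tokens} audio + {metrics.text_tokens} text tokens in, "
            f"{metrics.output_tokens} out ({metrics.total_tokens} total)"
        )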
12 changes: 12 additions & 0 deletions livekit-agents/livekit/agents/metrics/usage_collector.py
@@ -22,6 +22,13 @@ class UsageSummary:
tts_audio_duration: float = 0.0
stt_audio_duration: float = 0.0

# STT token usage tracking fields
stt_input_tokens: int = 0
stt_output_tokens: int = 0
stt_total_tokens: int = 0
stt_audio_tokens: int = 0
stt_text_tokens: int = 0

# properties for naming consistency: prompt = input, completion = output
@property
def llm_input_tokens(self) -> int:
@@ -87,6 +94,11 @@ def collect(self, metrics: AgentMetrics) -> None:

elif isinstance(metrics, STTMetrics):
self._summary.stt_audio_duration += metrics.audio_duration
self._summary.stt_input_tokens += metrics.input_tokens
self._summary.stt_output_tokens += metrics.output_tokens
self._summary.stt_total_tokens += metrics.total_tokens
self._summary.stt_audio_tokens += metrics.audio_tokens
self._summary.stt_text_tokens += metrics.text_tokens

def get_summary(self) -> UsageSummary:
return deepcopy(self._summary)
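For whole-session accounting the collector simply sums each event's counts. A sketch of end-of-session reporting built on the new summary fields (the wiring that feeds collect() is assumed, not shown in this diff):

from livekit.agents.metrics import UsageCollector

collector = UsageCollector()
# elsewhere: collector.collect(ev.metrics) from a metrics event handler

def log_session_usage() -> None:
    summary = collector.get_summary()
    print(
        f"STT: {summary.stt_audio_duration:.1f}s audio, "
        f"{summary.stt_total_tokens} tokens "
        f"({summary.stt_audio_tokens} audio / {summary.stt_text_tokens} text)"
    )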
42 changes: 42 additions & 0 deletions livekit-agents/livekit/agents/stt/stt.py
@@ -10,6 +10,7 @@
from typing import Generic, Literal, TypeVar, Union

from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import TypedDict

from livekit import rtc
from livekit.agents.metrics.base import Metadata
@@ -66,12 +67,32 @@ class RecognitionUsage:
audio_duration: float


class STTTokenUsage(TypedDict, total=False):
"""Token usage information from STT transcription.

All fields are optional, as not all STT providers support token usage tracking.
"""

input_tokens: int
"""Total input tokens used (audio + text tokens)."""
output_tokens: int
"""Total output tokens generated."""
total_tokens: int
"""Total tokens used (input + output)."""
audio_tokens: int
"""Number of audio tokens in input."""
text_tokens: int
"""Number of text tokens in input (e.g., from prompt)."""


@dataclass
class SpeechEvent:
type: SpeechEventType
request_id: str = ""
alternatives: list[SpeechData] = field(default_factory=list)
recognition_usage: RecognitionUsage | None = None
token_usage: STTTokenUsage | None = None
"""Token usage information from STT transcription, if available."""


@dataclass
@@ -163,13 +184,34 @@ async def recognize(
)
if self._recognize_metrics_needed:
duration = time.perf_counter() - start_time

# Extract token usage if available
input_tokens = 0
output_tokens = 0
total_tokens = 0
audio_tokens = 0
text_tokens = 0

if event.token_usage:
usage = event.token_usage
input_tokens = usage.get("input_tokens", 0)
output_tokens = usage.get("output_tokens", 0)
total_tokens = usage.get("total_tokens", 0)
audio_tokens = usage.get("audio_tokens", 0)
text_tokens = usage.get("text_tokens", 0)

stt_metrics = STTMetrics(
request_id=event.request_id,
timestamp=time.time(),
duration=duration,
label=self._label,
audio_duration=calculate_audio_duration(buffer),
streamed=False,
input_tokens=input_tokens,
output_tokens=output_tokens,
total_tokens=total_tokens,
audio_tokens=audio_tokens,
text_tokens=text_tokens,
metadata=Metadata(
model_name=self.model,
model_provider=self.provider,
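Every lookup above defaults to 0, so a provider that reports no usage (token_usage is None) contributes zeros instead of raising. The five assignments can also be read as one comprehension; an equivalent sketch:

usage = event.token_usage or {}
fields = ("input_tokens", "output_tokens", "total_tokens", "audio_tokens", "text_tokens")
token_counts = {name: usage.get(name, 0) for name in fields}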
21 changes: 21 additions & 0 deletions livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py
@@ -282,6 +282,8 @@ def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
if not detected_lg and self._opts.language:
detected_lg = self._opts.language[0]

audio_duration = evt.result.duration / 10**7 # Convert from 100ns ticks to seconds

# TODO: @chenghao-mou get confidence from NBest with `detailed` output format
final_data = stt.SpeechData(
language=detected_lg,
@@ -298,6 +300,25 @@ def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=[final_data]
),
)
# Emit usage metrics after the final transcript.
# Azure doesn't report token counts, so the STTMetrics token fields
# keep their 0 defaults; only audio duration is tracked for billing.
self._loop.call_soon_threadsafe(
self._emit_recognition_usage,
evt.result.result_id,
audio_duration,
)

def _emit_recognition_usage(self, request_id: str, audio_duration: float) -> None:
"""Emit usage metrics for Azure STT (duration-based, no tokens)"""
self._event_ch.send_nowait(
stt.SpeechEvent(
type=stt.SpeechEventType.RECOGNITION_USAGE,
request_id=request_id,
alternatives=[],
recognition_usage=stt.RecognitionUsage(audio_duration=audio_duration),
),
)

def _on_recognizing(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
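Two details in the Azure change are easy to miss: evt.result.duration is expressed in 100 ns ticks, and the Speech SDK fires callbacks on its own worker thread, so events must be marshalled onto the asyncio loop. A sketch of both in isolation (names illustrative):

import asyncio

TICKS_PER_SECOND = 10**7  # Azure reports durations in 100 ns ticks

def on_sdk_callback(loop: asyncio.AbstractEventLoop, request_id: str, ticks: int) -> None:
    # The loop must not be touched directly from the SDK thread;
    # call_soon_threadsafe is the supported bridge.
    loop.call_soon_threadsafe(emit_usage, request_id, ticks / TICKS_PER_SECOND)

def emit_usage(request_id: str, seconds: float) -> None:
    print(f"{request_id}: {seconds:.2f}s of audio recognized")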
livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/stt.py
@@ -436,10 +436,39 @@ async def _recognize_impl(
if isinstance(resp, TranscriptionVerbose) and resp.language:
sd.language = resp.language

return stt.SpeechEvent(
# Extract token usage if available
input_tokens = 0
output_tokens = 0
total_tokens = 0
audio_tokens = 0
text_tokens = 0
if hasattr(resp, "usage") and resp.usage:
usage = resp.usage
input_tokens = getattr(usage, "input_tokens", 0)
output_tokens = getattr(usage, "output_tokens", 0)
total_tokens = getattr(usage, "total_tokens", 0)

# Extract detailed token breakdown
if hasattr(usage, "input_token_details") and usage.input_token_details:
details = usage.input_token_details
audio_tokens = getattr(details, "audio_tokens", 0)
text_tokens = getattr(details, "text_tokens", 0)

# Create the speech event with token usage
speech_event = stt.SpeechEvent(
type=stt.SpeechEventType.FINAL_TRANSCRIPT,
alternatives=[sd],
token_usage={
"input_tokens": input_tokens,
"output_tokens": output_tokens,
"total_tokens": total_tokens,
"audio_tokens": audio_tokens,
"text_tokens": text_tokens,
}
if (input_tokens > 0 or output_tokens > 0 or total_tokens > 0)
else None,
)
return speech_event
Comment on lines +439 to +471
⚠️ Potential issue | 🟡 Minor

Don’t drop audio/text usage when totals are missing.

If only detailed tokens are present, token_usage becomes None and metrics lose audio/text counts.

✅ Suggested fix
-            speech_event = stt.SpeechEvent(
+            has_usage = any(
+                token > 0
+                for token in (input_tokens, output_tokens, total_tokens, audio_tokens, text_tokens)
+            )
+            speech_event = stt.SpeechEvent(
                 type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                 alternatives=[sd],
                 token_usage={
                     "input_tokens": input_tokens,
                     "output_tokens": output_tokens,
                     "total_tokens": total_tokens,
                     "audio_tokens": audio_tokens,
                     "text_tokens": text_tokens,
                 }
-                if (input_tokens > 0 or output_tokens > 0 or total_tokens > 0)
-                else None,
+                if has_usage
+                else None,
             )
🤖 Prompt for AI Agents
In `@livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/stt.py` around
lines 439 - 471, The current construction of stt.SpeechEvent sets token_usage to
None when input/output/total tokens are zero, which drops
audio_tokens/text_tokens if only detailed counts exist; update the logic in the
block that builds token_usage (around resp/usage handling and the
stt.SpeechEvent creation) so you always populate the token_usage dict with
input_tokens, output_tokens, total_tokens, audio_tokens, and text_tokens and
then set token_usage to that dict if any of those five values is non-zero (e.g.,
use a any(...) check on the dict values) instead of checking only
input/output/total; reference the resp/usage extraction and the
stt.SpeechEvent(...) call to locate where to change the condition.


except openai.APITimeoutError:
raise APITimeoutError() from None
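The reviewer's scenario is easy to reproduce: when only the detailed breakdown is present, the original condition evaluates False and token_usage is dropped, while the suggested any(...) check keeps it. An illustrative check:

input_tokens = output_tokens = total_tokens = 0
audio_tokens, text_tokens = 120, 0  # only detailed counts reported

# Original condition: False, so token_usage would become None.
assert not (input_tokens > 0 or output_tokens > 0 or total_tokens > 0)

# Suggested condition: True, so the detailed counts survive.
assert any(
    t > 0 for t in (input_tokens, output_tokens, total_tokens, audio_tokens, text_tokens)
)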
7 changes: 6 additions & 1 deletion tests/fake_io.py
@@ -80,8 +80,13 @@ def clear_buffer(self) -> None:
self._flush_handle.cancel()

self._flush_handle = None
# Calculate played duration based on real elapsed time, capped at pushed duration
# This matches the behavior of ConsoleAudioOutput and accounts for speed_factor
# in tests (check_timestamp multiplies by speed_factor to convert to test time)
played_duration = time.time() - self._start_time
played_duration = min(max(0, played_duration), self._pushed_duration)
self.on_playback_finished(
playback_position=min(self._pushed_duration, time.time() - self._start_time),
playback_position=played_duration,
interrupted=True,
synchronized_transcript=None,
)
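The clamp keeps the reported position inside [0, pushed_duration] even when wall-clock time overshoots what was actually pushed. The same guard in isolation:

import time

def played_duration(start_time: float, pushed_duration: float) -> float:
    # Clamp elapsed wall-clock time into [0, pushed_duration].
    elapsed = time.time() - start_time
    return min(max(0.0, elapsed), pushed_duration)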