13 changes: 13 additions & 0 deletions livekit-agents/livekit/agents/metrics/base.py
@@ -38,6 +38,19 @@ class STTMetrics(BaseModel):
"""The duration of the pushed audio in seconds."""
streamed: bool
"""Whether the STT is streaming (e.g using websocket)."""

# NEW: Token usage fields
input_tokens: int = 0
"""Total input tokens used (audio + text tokens)."""
output_tokens: int = 0
"""Total output tokens generated."""
total_tokens: int = 0
"""Total tokens used (input + output)."""
audio_tokens: int = 0
"""Number of audio tokens in input."""
text_tokens: int = 0
"""Number of text tokens in input (e.g., from prompt)."""

metadata: Metadata | None = None


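Taken together, the new fields expose per-request token usage to any metrics consumer. A minimal sketch of reading them, assuming the usual livekit-agents import path and a metrics handler wired elsewhere (handler name hypothetical):

from livekit.agents.metrics import STTMetrics

def on_metrics_collected(metrics) -> None:
    # Only STT metrics carry the new token fields; all default to 0.
    if isinstance(metrics, STTMetrics):
        print(
            f"STT {metrics.request_id}: "
            f"{metrics.audio_tokens} audio + {metrics.text_tokens} text tokens in, "
            f"{metrics.output_tokens} out ({metrics.total_tokens} total)"
        )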
12 changes: 12 additions & 0 deletions livekit-agents/livekit/agents/metrics/usage_collector.py
@@ -22,6 +22,13 @@ class UsageSummary:
tts_audio_duration: float = 0.0
stt_audio_duration: float = 0.0

# STT token usage tracking fields
stt_input_tokens: int = 0
stt_output_tokens: int = 0
stt_total_tokens: int = 0
stt_audio_tokens: int = 0
stt_text_tokens: int = 0

# properties for naming consistency: prompt = input, completion = output
@property
def llm_input_tokens(self) -> int:
@@ -87,6 +94,11 @@ def collect(self, metrics: AgentMetrics) -> None:

elif isinstance(metrics, STTMetrics):
self._summary.stt_audio_duration += metrics.audio_duration
self._summary.stt_input_tokens += metrics.input_tokens
self._summary.stt_output_tokens += metrics.output_tokens
self._summary.stt_total_tokens += metrics.total_tokens
self._summary.stt_audio_tokens += metrics.audio_tokens
self._summary.stt_text_tokens += metrics.text_tokens

def get_summary(self) -> UsageSummary:
return deepcopy(self._summary)
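For whole-session accounting the collector simply sums each event's counts. A sketch of end-of-session reporting built on the new summary fields (the wiring that feeds collect() is assumed, not shown in this diff):

from livekit.agents.metrics import UsageCollector

collector = UsageCollector()
# elsewhere: collector.collect(ev.metrics) from a metrics event handler

def log_session_usage() -> None:
    summary = collector.get_summary()
    print(
        f"STT: {summary.stt_audio_duration:.1f}s audio, "
        f"{summary.stt_total_tokens} tokens "
        f"({summary.stt_audio_tokens} audio / {summary.stt_text_tokens} text)"
    )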
42 changes: 42 additions & 0 deletions livekit-agents/livekit/agents/stt/stt.py
@@ -10,6 +10,7 @@
from typing import Generic, Literal, TypeVar, Union

from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import TypedDict

from livekit import rtc
from livekit.agents.metrics.base import Metadata
@@ -66,12 +67,32 @@ class RecognitionUsage:
audio_duration: float


class STTTokenUsage(TypedDict, total=False):
"""Token usage information from STT transcription.

All fields are optional, as not all STT providers support token usage tracking.
"""

input_tokens: int
"""Total input tokens used (audio + text tokens)."""
output_tokens: int
"""Total output tokens generated."""
total_tokens: int
"""Total tokens used (input + output)."""
audio_tokens: int
"""Number of audio tokens in input."""
text_tokens: int
"""Number of text tokens in input (e.g., from prompt)."""


@dataclass
class SpeechEvent:
type: SpeechEventType
request_id: str = ""
alternatives: list[SpeechData] = field(default_factory=list)
recognition_usage: RecognitionUsage | None = None
token_usage: STTTokenUsage | None = None
"""Token usage information from STT transcription, if available."""


@dataclass
@@ -163,13 +184,34 @@ async def recognize(
)
if self._recognize_metrics_needed:
duration = time.perf_counter() - start_time

# Extract token usage if available
input_tokens = 0
output_tokens = 0
total_tokens = 0
audio_tokens = 0
text_tokens = 0

if event.token_usage:
usage = event.token_usage
input_tokens = usage.get("input_tokens", 0)
output_tokens = usage.get("output_tokens", 0)
total_tokens = usage.get("total_tokens", 0)
audio_tokens = usage.get("audio_tokens", 0)
text_tokens = usage.get("text_tokens", 0)

stt_metrics = STTMetrics(
request_id=event.request_id,
timestamp=time.time(),
duration=duration,
label=self._label,
audio_duration=calculate_audio_duration(buffer),
streamed=False,
input_tokens=input_tokens,
output_tokens=output_tokens,
total_tokens=total_tokens,
audio_tokens=audio_tokens,
text_tokens=text_tokens,
metadata=Metadata(
model_name=self.model,
model_provider=self.provider,
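Every lookup above defaults to 0, so a provider that reports no usage (token_usage is None) contributes zeros instead of raising. The five assignments can also be read as one comprehension; an equivalent sketch:

usage = event.token_usage or {}
fields = ("input_tokens", "output_tokens", "total_tokens", "audio_tokens", "text_tokens")
token_counts = {name: usage.get(name, 0) for name in fields}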
21 changes: 21 additions & 0 deletions livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py
@@ -282,6 +282,8 @@ def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
if not detected_lg and self._opts.language:
detected_lg = self._opts.language[0]

audio_duration = evt.result.duration / 10**7 # Convert from 100ns ticks to seconds

# TODO: @chenghao-mou get confidence from NBest with `detailed` output format
final_data = stt.SpeechData(
language=detected_lg,
@@ -298,6 +300,25 @@ def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=[final_data]
),
)
# Emit usage metrics after the final transcript.
# Azure doesn't report token counts, so the STTMetrics token fields
# keep their 0 defaults; only audio duration is tracked for billing.
self._loop.call_soon_threadsafe(
self._emit_recognition_usage,
evt.result.result_id,
audio_duration,
)

def _emit_recognition_usage(self, request_id: str, audio_duration: float) -> None:
"""Emit usage metrics for Azure STT (duration-based, no tokens)"""
self._event_ch.send_nowait(
stt.SpeechEvent(
type=stt.SpeechEventType.RECOGNITION_USAGE,
request_id=request_id,
alternatives=[],
recognition_usage=stt.RecognitionUsage(audio_duration=audio_duration),
),
)

def _on_recognizing(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
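Two details in the Azure change are easy to miss: evt.result.duration is expressed in 100 ns ticks, and the Speech SDK fires callbacks on its own worker thread, so events must be marshalled onto the asyncio loop. A sketch of both in isolation (names illustrative):

import asyncio

TICKS_PER_SECOND = 10**7  # Azure reports durations in 100 ns ticks

def on_sdk_callback(loop: asyncio.AbstractEventLoop, request_id: str, ticks: int) -> None:
    # The loop must not be touched directly from the SDK thread;
    # call_soon_threadsafe is the supported bridge.
    loop.call_soon_threadsafe(emit_usage, request_id, ticks / TICKS_PER_SECOND)

def emit_usage(request_id: str, seconds: float) -> None:
    print(f"{request_id}: {seconds:.2f}s of audio recognized")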
livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/stt.py
@@ -436,10 +436,39 @@ async def _recognize_impl(
if isinstance(resp, TranscriptionVerbose) and resp.language:
sd.language = resp.language

return stt.SpeechEvent(
# Extract token usage if available
input_tokens = 0
output_tokens = 0
total_tokens = 0
audio_tokens = 0
text_tokens = 0
if hasattr(resp, "usage") and resp.usage:
usage = resp.usage
input_tokens = getattr(usage, "input_tokens", 0)
output_tokens = getattr(usage, "output_tokens", 0)
total_tokens = getattr(usage, "total_tokens", 0)

# Extract detailed token breakdown
if hasattr(usage, "input_token_details") and usage.input_token_details:
details = usage.input_token_details
audio_tokens = getattr(details, "audio_tokens", 0)
text_tokens = getattr(details, "text_tokens", 0)

# Create the speech event with token usage
speech_event = stt.SpeechEvent(
type=stt.SpeechEventType.FINAL_TRANSCRIPT,
alternatives=[sd],
token_usage={
"input_tokens": input_tokens,
"output_tokens": output_tokens,
"total_tokens": total_tokens,
"audio_tokens": audio_tokens,
"text_tokens": text_tokens,
}
if (input_tokens > 0 or output_tokens > 0 or total_tokens > 0)
else None,
)
return speech_event
Comment on lines +439 to +471
⚠️ Potential issue | 🟡 Minor

Don’t drop audio/text usage when totals are missing.

If only detailed tokens are present, token_usage becomes None and metrics lose audio/text counts.

✅ Suggested fix
-            speech_event = stt.SpeechEvent(
+            has_usage = any(
+                token > 0
+                for token in (input_tokens, output_tokens, total_tokens, audio_tokens, text_tokens)
+            )
+            speech_event = stt.SpeechEvent(
                 type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                 alternatives=[sd],
                 token_usage={
                     "input_tokens": input_tokens,
                     "output_tokens": output_tokens,
                     "total_tokens": total_tokens,
                     "audio_tokens": audio_tokens,
                     "text_tokens": text_tokens,
                 }
-                if (input_tokens > 0 or output_tokens > 0 or total_tokens > 0)
-                else None,
+                if has_usage
+                else None,
             )
🤖 Prompt for AI Agents
In `@livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/stt.py` around
lines 439 - 471, The current construction of stt.SpeechEvent sets token_usage to
None when input/output/total tokens are zero, which drops
audio_tokens/text_tokens if only detailed counts exist; update the logic in the
block that builds token_usage (around resp/usage handling and the
stt.SpeechEvent creation) so you always populate the token_usage dict with
input_tokens, output_tokens, total_tokens, audio_tokens, and text_tokens and
then set token_usage to that dict if any of those five values is non-zero (e.g.,
use a any(...) check on the dict values) instead of checking only
input/output/total; reference the resp/usage extraction and the
stt.SpeechEvent(...) call to locate where to change the condition.


except openai.APITimeoutError:
raise APITimeoutError() from None
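The reviewer's scenario is easy to reproduce: when only the detailed breakdown is present, the original condition evaluates False and token_usage is dropped, while the suggested any(...) check keeps it. An illustrative check:

input_tokens = output_tokens = total_tokens = 0
audio_tokens, text_tokens = 120, 0  # only detailed counts reported

# Original condition: False, so token_usage would become None.
assert not (input_tokens > 0 or output_tokens > 0 or total_tokens > 0)

# Suggested condition: True, so the detailed counts survive.
assert any(
    t > 0 for t in (input_tokens, output_tokens, total_tokens, audio_tokens, text_tokens)
)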
7 changes: 6 additions & 1 deletion tests/fake_io.py
@@ -80,8 +80,13 @@ def clear_buffer(self) -> None:
self._flush_handle.cancel()

self._flush_handle = None
# Calculate played duration based on real elapsed time, capped at pushed duration
# This matches the behavior of ConsoleAudioOutput and accounts for speed_factor
# in tests (check_timestamp multiplies by speed_factor to convert to test time)
played_duration = time.time() - self._start_time
played_duration = min(max(0, played_duration), self._pushed_duration)
self.on_playback_finished(
playback_position=min(self._pushed_duration, time.time() - self._start_time),
playback_position=played_duration,
interrupted=True,
synchronized_transcript=None,
)
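The clamp keeps the reported position inside [0, pushed_duration] even when wall-clock time overshoots what was actually pushed. The same guard in isolation:

import time

def played_duration(start_time: float, pushed_duration: float) -> float:
    # Clamp elapsed wall-clock time into [0, pushed_duration].
    elapsed = time.time() - start_time
    return min(max(0.0, elapsed), pushed_duration)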