Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion livekit-agents/livekit/agents/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1329,7 +1329,7 @@ def _print_run_event(
)
elif event.type == "function_call_output":
output = event.item.output
display_output = output
display_output = str(output)
is_error = output.lower().startswith("error") or output.lower().startswith("exception")

if not is_error:
Expand Down
4 changes: 4 additions & 0 deletions livekit-agents/livekit/agents/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
ChatItem,
ChatMessage,
ChatRole,
FileContent,
FunctionCall,
FunctionCallOutput,
ImageContent,
MetricsReport,
ToolOutput,
)
from .fallback_adapter import AvailabilityChangedEvent, FallbackAdapter
from .llm import (
Expand Down Expand Up @@ -68,7 +70,9 @@
"FunctionCall",
"FunctionCallOutput",
"AudioContent",
"FileContent",
"ImageContent",
"ToolOutput",
"AgentConfigUpdate",
"AgentHandoff",
"MetricsReport",
Expand Down
43 changes: 36 additions & 7 deletions livekit-agents/livekit/agents/llm/_provider_format/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,21 @@ def to_chat_ctx(
}
)
elif msg.type == "function_call_output":
result_content: list[Any] | str = msg.output
try:
parsed = json.loads(msg.output)
if isinstance(parsed, list):
result_content = parsed
except (json.JSONDecodeError, TypeError):
pass
result_content: list[Any] = []
if msg.output.text_contents:
try:
parsed = json.loads(msg.output.text_contents)
result_content = (
parsed
if isinstance(parsed, list)
else [{"type": "text", "text": msg.output.text_contents}]
)
except (json.JSONDecodeError, TypeError):
result_content = [{"type": "text", "text": msg.output.text_contents}]
for img in msg.output.image_contents:
result_content.append(_to_image_content(img))
for f in msg.output.file_contents:
result_content.append(_to_document_content(f))
content.append(
{
"tool_use_id": msg.call_id,
Expand Down Expand Up @@ -101,6 +109,27 @@ def to_chat_ctx(
return messages, AnthropicFormatData(system_messages=system_messages)


def _to_document_content(file: llm.FileContent) -> dict[str, Any]:
if file.mime_type == "text/plain":
data = file.data if isinstance(file.data, str) else file.data.decode("utf-8")
return {
"type": "document",
"source": {"type": "text", "media_type": "text/plain", "data": data},
"title": file.name,
}
# default to base64 PDF for binary types
raw = file.data if isinstance(file.data, bytes) else file.data.encode("utf-8")
return {
"type": "document",
"source": {
"type": "base64",
"media_type": file.mime_type,
"data": base64.b64encode(raw).decode("utf-8"),
},
"title": file.name,
}


def _to_image_content(image: llm.ImageContent) -> dict[str, Any]:
cache_key = "serialized_image"
if cache_key not in image._cache:
Expand Down
15 changes: 9 additions & 6 deletions livekit-agents/livekit/agents/llm/_provider_format/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,19 @@ def to_chat_ctx(
}
)
elif msg.type == "function_call_output":
tool_content: list[dict[str, Any]] = []
if msg.output.text_contents:
tool_content.append({"text": msg.output.text_contents})
for img in msg.output.image_contents:
tool_content.append(_build_image(img))
if not tool_content:
tool_content.append({"text": ""})
current_content.append(
{
"toolResult": {
"toolUseId": msg.call_id,
"content": [
{"json": msg.output}
if isinstance(msg.output, dict)
else {"text": msg.output}
],
"status": "success",
"content": tool_content,
"status": "error" if msg.is_error else "success",
}
}
)
Expand Down
6 changes: 5 additions & 1 deletion livekit-agents/livekit/agents/llm/_provider_format/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,11 @@ def to_chat_ctx(
fc_part["thought_signature"] = sig
parts.append(fc_part)
elif msg.type == "function_call_output":
response = {"output": msg.output} if not msg.is_error else {"error": msg.output}
response = (
{"output": msg.output.text_contents}
if not msg.is_error
else {"error": msg.output.text_contents}
)
parts.append(
{
"function_response": {
Expand Down
36 changes: 30 additions & 6 deletions livekit-agents/livekit/agents/llm/_provider_format/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,14 @@ def _to_chat_item(msg: llm.ChatItem) -> dict[str, Any]:
}

elif msg.type == "function_call_output":
return {
"role": "tool",
"tool_call_id": msg.call_id,
"content": msg.output,
}
if msg.output.image_contents:
tool_content: list[dict[str, Any]] = []
if msg.output.text_contents:
tool_content.append({"type": "text", "text": msg.output.text_contents})
for img in msg.output.image_contents:
tool_content.append(_to_image_content(img))
return {"role": "tool", "tool_call_id": msg.call_id, "content": tool_content}
return {"role": "tool", "tool_call_id": msg.call_id, "content": msg.output.text_contents}

raise ValueError(f"unsupported message type: {msg.type}")

Expand Down Expand Up @@ -181,15 +184,36 @@ def _to_responses_chat_item(msg: llm.ChatItem) -> dict[str, Any]:
return {"role": msg.role, "content": list_content}

elif msg.type == "function_call_output":
if msg.output.image_contents or msg.output.file_contents:
output: list[dict[str, Any]] = []
if msg.output.text_contents:
output.append({"type": "input_text", "text": msg.output.text_contents})
for img in msg.output.image_contents:
output.append(_to_responses_image_content(img))
for f in msg.output.file_contents:
output.append(_to_responses_file_content(f))
return {"type": "function_call_output", "call_id": msg.call_id, "output": output}
return {
"type": "function_call_output",
"call_id": msg.call_id,
"output": msg.output,
"output": msg.output.text_contents,
}

raise ValueError(f"unsupported message type: {msg.type}")


def _to_responses_file_content(file: llm.FileContent) -> dict[str, Any]:
if isinstance(file.data, bytes):
data = base64.b64encode(file.data).decode("utf-8")
else:
data = base64.b64encode(file.data.encode("utf-8")).decode("utf-8")
return {
"type": "input_file",
"filename": file.name,
"file_data": data,
}


def to_fnc_ctx(tool_ctx: llm.ToolContext, *, strict: bool = True) -> list[dict[str, Any]]:
schemas: list[dict[str, Any]] = []
for tool in tool_ctx.function_tools.values():
Expand Down
148 changes: 146 additions & 2 deletions livekit-agents/livekit/agents/llm/chat_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@

from __future__ import annotations

import base64
import textwrap
import time
from collections.abc import Generator, Sequence
from typing import TYPE_CHECKING, Annotated, Any, Literal, TypeAlias, overload

from pydantic import BaseModel, Field, PrivateAttr, TypeAdapter
from pydantic import BaseModel, ConfigDict, Field, GetCoreSchemaHandler, PrivateAttr, TypeAdapter
from pydantic_core import CoreSchema, core_schema
from typing_extensions import TypedDict

from livekit import rtc
Expand Down Expand Up @@ -218,6 +220,148 @@ class AudioContent(BaseModel):
transcript: str | None = None


class FileContent(BaseModel):
    """File/document content for use in tool outputs.

    ``data`` holds the raw payload: ``str`` for textual files, ``bytes`` for
    binary ones (e.g. PDF).
    """

    # Serialize/validate bytes as base64 in JSON mode. Pydantic's default
    # (`ser_json_bytes='utf8'`) tries to utf-8 decode the bytes, which fails
    # on arbitrary binary data such as PDFs — contradicting the documented
    # "base64-encoded in JSON" contract of `data`.
    model_config = ConfigDict(ser_json_bytes="base64", val_json_bytes="base64")

    type: Literal["file_content"] = Field(default="file_content")
    name: str
    """Display name of the file."""
    data: str | bytes
    """File data — str for text, bytes for binary (e.g. PDF). Bytes are base64-encoded in JSON."""
    mime_type: str
    """MIME type, e.g. 'text/plain', 'application/pdf'."""


class ToolOutput(str):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if this should be a str.
This doesn't feel right

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ideally it isn't, but what about existing user code assuming that the output is a string?

 for item in agent_session.chat_ctx.items:
      if isinstance(item, FunctionCallOutput) and item.name == "book_appointment":
          if  item.output == "confirmed": ...

wouldn't this fail if ToolOutput was a list or BaseModel?

"""
Structured output of a function tool call.

Inherits from ``str`` for full backward compatibility — existing code that
treats ``FunctionCallOutput.output`` as a string continues to work with no
changes. Rich content (images, files) is accessible via ``.content``,
``.image_contents``, and ``.file_contents``.

Function tools can return any of the following, all automatically converted:

- A plain string: ``return "Order found: #12345"``
- An image: ``return ImageContent(image="https://...")``
- A file: ``return FileContent(name="report.pdf", data=b"...", mime_type="application/pdf")``
- A tuple mixing content: ``return "Here is the image:", ImageContent(image="https://...")``
- A ``ToolOutput`` directly for full control.
"""

_content: list[ImageContent | FileContent | str]

def __new__(cls, content: list[ImageContent | FileContent | str]) -> ToolOutput:
    # Build the plain-str value of this object: all text items first
    # (newline-joined), then a short placeholder per rich item, so that
    # str(output) stays human-readable for legacy string-based callers.
    summary: list[str] = []
    joined_text = "\n".join(part for part in content if isinstance(part, str))
    if joined_text:
        summary.append(joined_text)
    for part in content:
        if isinstance(part, ImageContent):
            summary.append(f"[image: {part.mime_type}]" if part.mime_type else "[image]")
        elif isinstance(part, FileContent):
            summary.append(f"[file: {part.name}]")
    return super().__new__(cls, " ".join(summary))

def __init__(self, content: list[ImageContent | FileContent | str]) -> None:
    # __new__ already froze the str display value; only the rich parts
    # need to be stored here.
    super().__init__()
    self._content = content

@classmethod
def __get_pydantic_core_schema__(
    cls, source_type: Any, handler: GetCoreSchemaHandler
) -> CoreSchema:
    # Pydantic v2 hook: route every inbound value through _coerce (plain
    # strings, single rich items, tuples/lists, revived JSON dicts all
    # become a ToolOutput) and every outbound value through _serialize
    # (a bare str when text-only, a list of parts otherwise).
    return core_schema.no_info_plain_validator_function(
        cls._coerce,
        serialization=core_schema.plain_serializer_function_ser_schema(
            lambda v: v._serialize(),
        ),
    )

@classmethod
def _coerce(cls, v: Any) -> ToolOutput:
    """Normalize any function-tool return value into a ``ToolOutput``.

    Accepts an existing ``ToolOutput``, a bare string, a single rich item
    (``ImageContent``/``FileContent``), or a tuple/list mixing text, rich
    items, and — for JSON round-trips — dicts tagged with ``type``.
    Anything else (including ``None``, mapped to ``""``) is stringified.
    """
    if isinstance(v, cls):
        return v
    if isinstance(v, str):
        return cls([v])
    if isinstance(v, (ImageContent, FileContent)):
        return cls([v])
    if isinstance(v, (tuple, list)):
        # Tuples (multi-value tool returns) and lists (deserialized JSON)
        # share one normalization path; previously a tagged dict inside a
        # tuple was stringified while the same dict inside a list was
        # revived into its model — now both revive consistently.
        items: list[ImageContent | FileContent | str] = []
        for item in v:
            if isinstance(item, (ImageContent, FileContent)):
                items.append(item)
            elif isinstance(item, dict):
                tag = item.get("type")
                if tag == "image_content":
                    items.append(ImageContent.model_validate(item))
                elif tag == "file_content":
                    items.append(FileContent.model_validate(item))
                else:
                    items.append(str(item))
            elif isinstance(item, str):
                items.append(item)
            else:
                items.append(str(item) if item is not None else "")
        return cls(items)
    return cls([str(v) if v is not None else ""])

def _serialize(self) -> str | list[Any]:
    """Serialize for pydantic JSON mode: a plain str when text-only,
    otherwise a list of JSON-able parts preserving the original order."""
    # Fast path: no rich content — emit the joined text, which keeps the
    # on-wire format identical to the pre-ToolOutput string output.
    if not self.image_contents and not self.file_contents:
        return self.text_contents

    result: list[Any] = []
    for item in self._content:
        if isinstance(item, str):
            result.append(item)
        elif isinstance(item, ImageContent):
            d = item.model_dump(mode="json")
            # raw VideoFrames are not JSON-serializable; re-encode them as a
            # data URL (or fall back to their external URL) before dumping
            if isinstance(item.image, rtc.VideoFrame):
                from . import utils as llm_utils  # lazy import to avoid circular

                img = llm_utils.serialize_image(item)
                if img.data_bytes:
                    mime = img.mime_type or "image/jpeg"
                    d["image"] = (
                        f"data:{mime};base64,{base64.b64encode(img.data_bytes).decode('utf-8')}"
                    )
                elif img.external_url:
                    d["image"] = img.external_url
            result.append(d)
        elif isinstance(item, FileContent):
            result.append(item.model_dump(mode="json"))
    return result
Comment on lines +272 to +342
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of that will not be needed if we don't use str?


@property
def content(self) -> list[ImageContent | FileContent | str]:
    """Every content item, in original order."""
    return self._content

@property
def text_contents(self) -> str:
    """The string items only, joined by newlines."""
    return "\n".join(part for part in self._content if isinstance(part, str))

@property
def image_contents(self) -> list[ImageContent]:
    """The ImageContent items only."""
    return [part for part in self._content if isinstance(part, ImageContent)]

@property
def file_contents(self) -> list[FileContent]:
    """The FileContent items only."""
    return [part for part in self._content if isinstance(part, FileContent)]


ChatRole: TypeAlias = Literal["developer", "system", "user", "assistant"]


Expand Down Expand Up @@ -313,7 +457,7 @@ class FunctionCallOutput(BaseModel):
type: Literal["function_call_output"] = Field(default="function_call_output")
name: str = Field(default="")
call_id: str
output: str
output: ToolOutput
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't this just be?
ToolOutput = ImageContent | FileContent | str

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if users wanted to pass multiple items from a function tool

is_error: bool
created_at: float = Field(default_factory=time.time)

Expand Down
Loading
Loading