Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion livekit-agents/livekit/agents/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1329,7 +1329,7 @@ def _print_run_event(
)
elif event.type == "function_call_output":
output = event.item.output
display_output = output
display_output = str(output)
is_error = output.lower().startswith("error") or output.lower().startswith("exception")

if not is_error:
Expand Down
4 changes: 4 additions & 0 deletions livekit-agents/livekit/agents/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
ChatItem,
ChatMessage,
ChatRole,
FileContent,
FunctionCall,
FunctionCallOutput,
ImageContent,
MetricsReport,
ToolOutput,
)
from .fallback_adapter import AvailabilityChangedEvent, FallbackAdapter
from .llm import (
Expand Down Expand Up @@ -68,7 +70,9 @@
"FunctionCall",
"FunctionCallOutput",
"AudioContent",
"FileContent",
"ImageContent",
"ToolOutput",
"AgentConfigUpdate",
"AgentHandoff",
"MetricsReport",
Expand Down
43 changes: 36 additions & 7 deletions livekit-agents/livekit/agents/llm/_provider_format/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,21 @@ def to_chat_ctx(
}
)
elif msg.type == "function_call_output":
result_content: list[Any] | str = msg.output
try:
parsed = json.loads(msg.output)
if isinstance(parsed, list):
result_content = parsed
except (json.JSONDecodeError, TypeError):
pass
result_content: list[Any] = []
if msg.output.text_contents:
try:
parsed = json.loads(msg.output.text_contents)
result_content = (
parsed
if isinstance(parsed, list)
else [{"type": "text", "text": msg.output.text_contents}]
)
except (json.JSONDecodeError, TypeError):
result_content = [{"type": "text", "text": msg.output.text_contents}]
for img in msg.output.image_contents:
result_content.append(_to_image_content(img))
for f in msg.output.file_contents:
result_content.append(_to_document_content(f))
content.append(
{
"tool_use_id": msg.call_id,
Expand Down Expand Up @@ -101,6 +109,27 @@ def to_chat_ctx(
return messages, AnthropicFormatData(system_messages=system_messages)


def _to_document_content(file: llm.FileContent) -> dict[str, Any]:
if file.mime_type == "text/plain":
data = file.data if isinstance(file.data, str) else file.data.decode("utf-8")
return {
"type": "document",
"source": {"type": "text", "media_type": "text/plain", "data": data},
"title": file.name,
}
# default to base64 PDF for binary types
raw = file.data if isinstance(file.data, bytes) else file.data.encode("utf-8")
return {
"type": "document",
"source": {
"type": "base64",
"media_type": file.mime_type,
"data": base64.b64encode(raw).decode("utf-8"),
},
"title": file.name,
}


def _to_image_content(image: llm.ImageContent) -> dict[str, Any]:
cache_key = "serialized_image"
if cache_key not in image._cache:
Expand Down
15 changes: 9 additions & 6 deletions livekit-agents/livekit/agents/llm/_provider_format/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,19 @@ def to_chat_ctx(
}
)
elif msg.type == "function_call_output":
tool_content: list[dict[str, Any]] = []
if msg.output.text_contents:
tool_content.append({"text": msg.output.text_contents})
for img in msg.output.image_contents:
tool_content.append(_build_image(img))
if not tool_content:
tool_content.append({"text": ""})
current_content.append(
{
"toolResult": {
"toolUseId": msg.call_id,
"content": [
{"json": msg.output}
if isinstance(msg.output, dict)
else {"text": msg.output}
],
"status": "success",
"content": tool_content,
"status": "error" if msg.is_error else "success",
}
}
)
Expand Down
6 changes: 5 additions & 1 deletion livekit-agents/livekit/agents/llm/_provider_format/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,11 @@ def to_chat_ctx(
fc_part["thought_signature"] = sig
parts.append(fc_part)
elif msg.type == "function_call_output":
response = {"output": msg.output} if not msg.is_error else {"error": msg.output}
response = (
{"output": msg.output.text_contents}
if not msg.is_error
else {"error": msg.output.text_contents}
)
parts.append(
{
"function_response": {
Expand Down
36 changes: 30 additions & 6 deletions livekit-agents/livekit/agents/llm/_provider_format/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,14 @@ def _to_chat_item(msg: llm.ChatItem) -> dict[str, Any]:
}

elif msg.type == "function_call_output":
return {
"role": "tool",
"tool_call_id": msg.call_id,
"content": msg.output,
}
if msg.output.image_contents:
tool_content: list[dict[str, Any]] = []
if msg.output.text_contents:
tool_content.append({"type": "text", "text": msg.output.text_contents})
for img in msg.output.image_contents:
tool_content.append(_to_image_content(img))
return {"role": "tool", "tool_call_id": msg.call_id, "content": tool_content}
return {"role": "tool", "tool_call_id": msg.call_id, "content": msg.output.text_contents}

raise ValueError(f"unsupported message type: {msg.type}")

Expand Down Expand Up @@ -181,15 +184,36 @@ def _to_responses_chat_item(msg: llm.ChatItem) -> dict[str, Any]:
return {"role": msg.role, "content": list_content}

elif msg.type == "function_call_output":
if msg.output.image_contents or msg.output.file_contents:
output: list[dict[str, Any]] = []
if msg.output.text_contents:
output.append({"type": "input_text", "text": msg.output.text_contents})
for img in msg.output.image_contents:
output.append(_to_responses_image_content(img))
for f in msg.output.file_contents:
output.append(_to_responses_file_content(f))
return {"type": "function_call_output", "call_id": msg.call_id, "output": output}
return {
"type": "function_call_output",
"call_id": msg.call_id,
"output": msg.output,
"output": msg.output.text_contents,
}

raise ValueError(f"unsupported message type: {msg.type}")


def _to_responses_file_content(file: llm.FileContent) -> dict[str, Any]:
if isinstance(file.data, bytes):
data = base64.b64encode(file.data).decode("utf-8")
else:
data = base64.b64encode(file.data.encode("utf-8")).decode("utf-8")
return {
"type": "input_file",
"filename": file.name,
"file_data": data,
}


def to_fnc_ctx(tool_ctx: llm.ToolContext, *, strict: bool = True) -> list[dict[str, Any]]:
schemas: list[dict[str, Any]] = []
for tool in tool_ctx.function_tools.values():
Expand Down
148 changes: 146 additions & 2 deletions livekit-agents/livekit/agents/llm/chat_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@

from __future__ import annotations

import base64
import textwrap
import time
from collections.abc import Generator, Sequence
from typing import TYPE_CHECKING, Annotated, Any, Literal, TypeAlias, overload

from pydantic import BaseModel, Field, PrivateAttr, TypeAdapter
from pydantic import BaseModel, ConfigDict, Field, GetCoreSchemaHandler, PrivateAttr, TypeAdapter
from pydantic_core import CoreSchema, core_schema
from typing_extensions import TypedDict

from livekit import rtc
Expand Down Expand Up @@ -218,6 +220,148 @@ class AudioContent(BaseModel):
transcript: str | None = None


class FileContent(BaseModel):
    """File/document content for use in tool outputs.

    ``data`` holds the raw payload: ``str`` for textual files, ``bytes`` for
    binary ones (e.g. PDF).
    """

    # Serialize/validate bytes as base64 in JSON mode. Pydantic's default
    # (`ser_json_bytes='utf8'`) tries to utf-8 decode the bytes, which fails
    # on arbitrary binary data such as PDFs — contradicting the documented
    # "base64-encoded in JSON" contract of `data`.
    model_config = ConfigDict(ser_json_bytes="base64", val_json_bytes="base64")

    type: Literal["file_content"] = Field(default="file_content")
    name: str
    """Display name of the file."""
    data: str | bytes
    """File data — str for text, bytes for binary (e.g. PDF). Bytes are base64-encoded in JSON."""
    mime_type: str
    """MIME type, e.g. 'text/plain', 'application/pdf'."""


class ToolOutput(str):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if this should be a str.
This doesn't feel right

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ideally it isn't, but what about existing user code assuming that the output is a string?

 for item in agent_session.chat_ctx.items:
      if isinstance(item, FunctionCallOutput) and item.name == "book_appointment":
          if  item.output == "confirmed": ...

wouldn't this fail if ToolOutput was a list or BaseModel?

"""
Structured output of a function tool call.

Inherits from ``str`` for full backward compatibility — existing code that
treats ``FunctionCallOutput.output`` as a string continues to work with no
changes. Rich content (images, files) is accessible via ``.content``,
``.image_contents``, and ``.file_contents``.

Function tools can return any of the following, all automatically converted:

- A plain string: ``return "Order found: #12345"``
- An image: ``return ImageContent(image="https://...")``
- A file: ``return FileContent(name="report.pdf", data=b"...", mime_type="application/pdf")``
- A tuple mixing content: ``return "Here is the image:", ImageContent(image="https://...")``
- A ``ToolOutput`` directly for full control.
"""

_content: list[ImageContent | FileContent | str]

def __new__(cls, content: list[ImageContent | FileContent | str]) -> ToolOutput:
    # Build the plain-str value of this object: all text items first
    # (newline-joined), then a short placeholder per rich item, so that
    # str(output) stays human-readable for legacy string-based callers.
    summary: list[str] = []
    joined_text = "\n".join(part for part in content if isinstance(part, str))
    if joined_text:
        summary.append(joined_text)
    for part in content:
        if isinstance(part, ImageContent):
            summary.append(f"[image: {part.mime_type}]" if part.mime_type else "[image]")
        elif isinstance(part, FileContent):
            summary.append(f"[file: {part.name}]")
    return super().__new__(cls, " ".join(summary))

def __init__(self, content: list[ImageContent | FileContent | str]) -> None:
    # __new__ already froze the str display value; only the rich parts
    # need to be stored here.
    super().__init__()
    self._content = content

@classmethod
def __get_pydantic_core_schema__(
    cls, source_type: Any, handler: GetCoreSchemaHandler
) -> CoreSchema:
    # Pydantic v2 hook: route every inbound value through _coerce (plain
    # strings, single rich items, tuples/lists, revived JSON dicts all
    # become a ToolOutput) and every outbound value through _serialize
    # (a bare str when text-only, a list of parts otherwise).
    return core_schema.no_info_plain_validator_function(
        cls._coerce,
        serialization=core_schema.plain_serializer_function_ser_schema(
            lambda v: v._serialize(),
        ),
    )

@classmethod
def _coerce(cls, v: Any) -> ToolOutput:
    """Normalize any function-tool return value into a ``ToolOutput``.

    Accepts an existing ``ToolOutput``, a bare string, a single rich item
    (``ImageContent``/``FileContent``), or a tuple/list mixing text, rich
    items, and — for JSON round-trips — dicts tagged with ``type``.
    Anything else (including ``None``, mapped to ``""``) is stringified.
    """
    if isinstance(v, cls):
        return v
    if isinstance(v, str):
        return cls([v])
    if isinstance(v, (ImageContent, FileContent)):
        return cls([v])
    if isinstance(v, (tuple, list)):
        # Tuples (multi-value tool returns) and lists (deserialized JSON)
        # share one normalization path; previously a tagged dict inside a
        # tuple was stringified while the same dict inside a list was
        # revived into its model — now both revive consistently.
        items: list[ImageContent | FileContent | str] = []
        for item in v:
            if isinstance(item, (ImageContent, FileContent)):
                items.append(item)
            elif isinstance(item, dict):
                tag = item.get("type")
                if tag == "image_content":
                    items.append(ImageContent.model_validate(item))
                elif tag == "file_content":
                    items.append(FileContent.model_validate(item))
                else:
                    items.append(str(item))
            elif isinstance(item, str):
                items.append(item)
            else:
                items.append(str(item) if item is not None else "")
        return cls(items)
    return cls([str(v) if v is not None else ""])

def _serialize(self) -> str | list[Any]:
    """Serialize for pydantic JSON mode: a plain str when text-only,
    otherwise a list of JSON-able parts preserving the original order."""
    # Fast path: no rich content — emit the joined text, which keeps the
    # on-wire format identical to the pre-ToolOutput string output.
    if not self.image_contents and not self.file_contents:
        return self.text_contents

    result: list[Any] = []
    for item in self._content:
        if isinstance(item, str):
            result.append(item)
        elif isinstance(item, ImageContent):
            d = item.model_dump(mode="json")
            # raw VideoFrames are not JSON-serializable; re-encode them as a
            # data URL (or fall back to their external URL) before dumping
            if isinstance(item.image, rtc.VideoFrame):
                from . import utils as llm_utils  # lazy import to avoid circular

                img = llm_utils.serialize_image(item)
                if img.data_bytes:
                    mime = img.mime_type or "image/jpeg"
                    d["image"] = (
                        f"data:{mime};base64,{base64.b64encode(img.data_bytes).decode('utf-8')}"
                    )
                elif img.external_url:
                    d["image"] = img.external_url
            result.append(d)
        elif isinstance(item, FileContent):
            result.append(item.model_dump(mode="json"))
    return result
Comment on lines +272 to +342
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of that will not be needed if we don't use str?


@property
def content(self) -> list[ImageContent | FileContent | str]:
    """Every content item, in original order."""
    return self._content

@property
def text_contents(self) -> str:
    """The string items only, joined by newlines."""
    return "\n".join(part for part in self._content if isinstance(part, str))

@property
def image_contents(self) -> list[ImageContent]:
    """The ImageContent items only."""
    return [part for part in self._content if isinstance(part, ImageContent)]

@property
def file_contents(self) -> list[FileContent]:
    """The FileContent items only."""
    return [part for part in self._content if isinstance(part, FileContent)]


ChatRole: TypeAlias = Literal["developer", "system", "user", "assistant"]


Expand Down Expand Up @@ -313,7 +457,7 @@ class FunctionCallOutput(BaseModel):
type: Literal["function_call_output"] = Field(default="function_call_output")
name: str = Field(default="")
call_id: str
output: str
output: ToolOutput
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't this just be?
ToolOutput = ImageContent | FileContent | str

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if users wanted to pass multiple items from a function tool

is_error: bool
created_at: float = Field(default_factory=time.time)

Expand Down
Loading
Loading