diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..07e3ac6 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,25 @@ +# AGENTS.md + +## Required Validation After Every Change + +All agents working in this repository must run validation after each code change before marking work complete. + +Required steps: + +1. Run full repo build: + - `./build.ps1` +2. Run shared tests: + - `dotnet test ./tests/OpenClaw.Shared.Tests/OpenClaw.Shared.Tests.csproj --no-restore` +3. Run tray tests: + - `dotnet test ./tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj --no-restore` + +If a command fails: + +1. Fix the issue. +2. Re-run the failed command. +3. Re-run all required validation commands before completion. + +Notes: + +- If a build/test is blocked by an environmental lock (for example running executable locking output assemblies), stop/close the locking process and rerun. +- Do not claim completion without reporting validation results. diff --git a/README.md b/README.md index fb998cf..27290b2 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,13 @@ A Windows companion suite for [OpenClaw](https://openclaw.ai) - the AI-powered p ## Projects -This monorepo contains three projects: +This monorepo contains four projects: | Project | Description | |---------|-------------| | **OpenClaw.Tray.WinUI** | System tray application (WinUI 3) for quick access to OpenClaw | | **OpenClaw.Shared** | Shared gateway client library | +| **OpenClaw.Cli** | CLI validator for WebSocket connect/send/probe using tray settings | | **OpenClaw.CommandPalette** | PowerToys Command Palette extension | ## 🚀 Quick Start @@ -67,6 +68,24 @@ dotnet build src/OpenClaw.Tray.WinUI -r win-x64 -p:PackageMsix=true # x64 MSI .\src\OpenClaw.Tray.WinUI\bin\Debug\net10.0-windows10.0.19041.0\win-x64\OpenClaw.Tray.WinUI.exe # x64 ``` +### Run CLI WebSocket Validator + +Use the CLI to validate gateway connectivity and `chat.send` outside the tray UI. 
+ +```powershell +# Show help +dotnet run --project src/OpenClaw.Cli -- --help + +# Use tray settings from %APPDATA%\OpenClawTray\settings.json and send one message +dotnet run --project src/OpenClaw.Cli -- --message "quick send validation" + +# Loop sends and also probe sessions/usage/nodes APIs +dotnet run --project src/OpenClaw.Cli -- --repeat 5 --delay-ms 1000 --probe-read --verbose + +# Override gateway URL/token for isolated testing +dotnet run --project src/OpenClaw.Cli -- --url ws://127.0.0.1:18789 --token "" --message "override test" +``` + ## 📦 OpenClaw.Tray (Molty) Modern Windows 11-style system tray companion that connects to your local OpenClaw gateway. @@ -87,6 +106,20 @@ Modern Windows 11-style system tray companion that connects to your local OpenCl - ⚙️ **Settings** - Full configuration dialog - 🎯 **First-run experience** - Welcome dialog guides new users +#### Quick Send scope requirement + +Quick Send uses the gateway `chat.send` method and requires the operator device to have `operator.write` scope. + +If Quick Send fails with `missing scope: operator.write`, Molty now copies identity + remediation guidance to your clipboard, including: + +- operator role and `client.id` used by the tray app +- gateway-reported operator device id (if provided) +- currently granted scopes (if provided) + +For this specific error (`missing scope: operator.write`), the cause is an **operator token scope issue**. Update the token used by the tray app so it includes `operator.write`, then retry Quick Send. + +If Quick Send fails with `pairing required` / `NOT_PAIRED`, that is a **device approval** issue. Approve the tray device in gateway pairing approvals, reconnect, and retry. 
+ ### Menu Sections - **Status** - Gateway connection status with click-to-view details - **Sessions** - Active agent sessions with preview and per-session controls diff --git a/build.ps1 b/build.ps1 index bd24b54..13cb7e8 100644 --- a/build.ps1 +++ b/build.ps1 @@ -6,7 +6,7 @@ Builds all projects, checks prerequisites, and provides clear guidance. .PARAMETER Project - Which project to build: All, Tray, WinUI, Shared, CommandPalette + Which project to build: All, Tray, WinUI, Shared, CommandPalette, Cli Default: All .PARAMETER Configuration @@ -23,7 +23,7 @@ #> param( - [ValidateSet("All", "Tray", "WinUI", "Shared", "CommandPalette")] + [ValidateSet("All", "Tray", "WinUI", "Shared", "CommandPalette", "Cli")] [string]$Project = "All", [ValidateSet("Debug", "Release")] @@ -187,12 +187,13 @@ function Build-Project($name, $path, $useRid = $false) { $projects = @{ "Shared" = @{ Path = "src/OpenClaw.Shared/OpenClaw.Shared.csproj"; UseRid = $false } + "Cli" = @{ Path = "src/OpenClaw.Cli/OpenClaw.Cli.csproj"; UseRid = $false } "Tray" = @{ Path = "src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj"; UseRid = $true } "WinUI" = @{ Path = "src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj"; UseRid = $true } "CommandPalette" = @{ Path = "src/OpenClaw.CommandPalette/OpenClaw.CommandPalette.csproj"; UseRid = $false } } -$toBuild = if ($Project -eq "All") { @("Shared", "WinUI") } else { @($Project) } +$toBuild = if ($Project -eq "All") { @("Shared", "Cli", "WinUI") } else { @($Project) } # Always build Shared first if building other projects if ($Project -ne "Shared" -and $Project -ne "All" -and $toBuild -notcontains "Shared") { diff --git a/diff.txt b/diff.txt new file mode 100644 index 0000000..12a438d --- /dev/null +++ b/diff.txt @@ -0,0 +1,12609 @@ +diff --git a/.gitignore b/.gitignore +index 6b3d49e..0c4e131 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -62,6 +62,7 @@ BenchmarkDotNet.Artifacts/ + project.lock.json + project.fragment.lock.json + artifacts/ ++.env + + # 
ASP.NET Scaffolding + ScaffoldingReadMe.txt +@@ -344,3 +345,9 @@ MigrationBackup/ + # Fody - auto-generated XML schema + FodyWeavers.xsd + Output/ ++ ++# Repo-local tool caches and workspace metadata ++.claude/ ++.dotnet-cli/ ++.playwright-cli/ ++output/playwright/ +diff --git a/README.md b/README.md +index b0c3e40..1caf343 100644 +--- a/README.md ++++ b/README.md +@@ -84,16 +84,18 @@ Modern Windows 11-style system tray companion that connects to your local OpenCl + - ≡ƒÜÇ **Auto-start** - Launch with Windows + - ΓÜÖ∩╕Å **Settings** - Full configuration dialog + - ≡ƒÄ» **First-run experience** - Welcome dialog guides new users ++- Voice Mode **Voice Mode (new)** - Talk to your Claw via your Windows node + + ### Menu Sections + - **Status** - Gateway connection status with click-to-view details ++- **Voice** - Access to Voice controls + - **Sessions** - Active agent sessions with preview and per-session controls + - **Usage** - Provider/cost summary with quick jump to activity details + - **Channels** - Telegram/WhatsApp status with toggle control + - **Nodes** - Online/offline node inventory and copyable summary + - **Recent Activity** - Timestamped event stream for sessions, usage, nodes, and notifications + - **Actions** - Dashboard, Web Chat, Quick Send, Activity Stream, History +-- **Settings** - Configuration, auto-start, logs ++- **Settings** - Configuration, auto-start, logs, voice + + ### Mac Parity Status + +@@ -113,6 +115,7 @@ Comparing against [openclaw-menubar](https://github.com/magimetal/openclaw-menub + | Refresh | Γ£à | Γ£à | Auto-refresh on menu open | + | Launch at Login | Γ£à | Γ£à | | + | Notifications toggle | Γ£à | Γ£à | | ++| Voice Mode | Γ£à | ≡ƒƒí | Talk Mode implemented (half-duplex), WakeWord, Interrupt, etc. in progress + + ### Windows-Only Features + +@@ -246,6 +249,14 @@ OpenClaw registers the `openclaw://` URL scheme for automation and integration: + + Deep links work even when Molty is already running - they're forwarded via IPC. 
+ ++### Voice Mode ++*contributed by NichUK and his colleagues @codex and @copilot* ++ ++Currently supports Talk Mode - Always on talk to your Claw! Wakeword and PTT modes coming soon ++- Uses internal Windows STT (cloud providers coming soon) ++- Windows/Minimax/Eleven Labs TTS voices ++ - Give your Claw a voice! ++ + ## ≡ƒôª OpenClaw.CommandPalette + + PowerToys Command Palette extension for quick OpenClaw access. +diff --git a/docs/VOICE-MODE.md b/docs/VOICE-MODE.md +new file mode 100644 +index 0000000..87d6011 +--- /dev/null ++++ b/docs/VOICE-MODE.md +@@ -0,0 +1,988 @@ ++# Voice Mode Architecture ++*Author: Nich Overend (NichUK@GitHub) - with @codex and @copilot* ++https://github.com/openclaw/openclaw-windows-node ++ ++ ++This document defines the voice subsystem for the Windows node only. It introduces the command surface, persisted settings schema, and minimum runtime boundaries needed to add Windows voice support without reshaping the existing node architecture. ++ ++## Goals ++ ++- Add a node-local voice mode with two activation modes: `VoiceWake` and `TalkMode` ++- Utilise minimal touch points to the existing app to reduce the potential for screw-ups ++- Use NanoWakeWord for wakeword detection on-device ++- Present the user-facing mode names as `Voice Wake` and `Talk Mode` ++- Keep STT/TTS provider selection configurable, with Windows implementations as the default built-in baseline ++- Implement `MiniMax` TTS and `ElevenLabs` TTS as required non-Windows providers after the Windows baseline ++- Make adding new voice providers an update to a Json catalog, rather than requiring code changes where possible ++- Reuse the existing node capability pattern instead of introducing a parallel control path ++- Ensure that the voice sub-system is extensible ++- Ensure that the voice sub-system is controllable from other applications ++ ++## Non-Goals ++ ++- True full-duplex or chunk-streaming audio transport between node and gateway ++- Subtantial changes to the 
existing project ++ ++## Design Position ++ ++The Windows node should own device-local audio concerns: ++ ++- microphone capture ++- wakeword detection ++- silence detection / utterance segmentation ++- speaker playback ++- device enumeration and persisted local settings ++ ++OpenClaw remains responsible for conversation/session routing and upstream voice orchestration. ++ ++This keeps the Windows node lean for the first implementation and avoids introducing provider-routing settings before they are needed. ++ ++## Visible Mode Names ++ ++The tray app now uses user-facing names (borrowed from the macOS app) rather than exposing the internal enum names directly: ++ ++| Internal Mode | Visible Name | Availability | ++|---|---|---| ++| `Off` | Off | available | ++| `VoiceWake` | Voice Wake | visible but disabled for now | ++| `TalkMode` | Talk Mode | available | ++ ++The contracts and persisted settings now use `VoiceWake` and `TalkMode` as well. ++ ++## Transport Boundary ++ ++`TalkMode` follows the current talk-mode style control flow: ++ ++- the node captures audio locally ++- local speech recognition turns that audio into transcript text on the active STT route ++- interim hypotheses are surfaced live, but only final `Medium` or `High` confidence recognizer results are submitted ++- if speech activity ends without any usable final transcript surviving, Talk Mode now clears the draft and gives a short local repeat prompt instead of silently doing nothing ++- the compact voice repeater window, when open, shows the live transcript draft plus local sent/received turns in a single scrolling surface ++- the tray chat window, when open, mirrors the live transcript draft into the compose box only ++- the finalized transcript is always sent to OpenClaw via direct `chat.send` on the voice mode target session, which is currently hardcoded in the tray app to `agent:main:main` ++- OpenClaw returns the assistant reply as normal chat output ++- the node performs local or remote 
TTS playback of that reply ++- assistant replies are queued locally and spoken sequentially, with a short (500 ms currently) pause between queued replies so overlapping responses are not lost ++- if a reply arrives after the normal 45-second wait timeout, the tray still accepts and speaks that late reply for a short bounded grace window (currently 120s) so slow upstream responses are not silently lost ++- assistant replies are currently accepted from either `agent:main:main` or the `main` alias so the tray can tolerate upstream session-key normalisation differences ++ ++To avoid obvious duplicate sends from the Windows recognizer, exact duplicate final transcripts are suppressed within a short 750 ms window. ++ ++The current Windows implementation uses a voice-local operator connection inside the tray app while node mode is active. That connection carries assistant chat events for `TalkMode`, while the recognized transcript is always sent through the tray app's direct `chat.send` path. ++ ++## Voice APIs ++ ++The Windows tray implementation now has two API layers: ++ ++- shared node-capability commands in `OpenClaw.Shared` ++- in-process tray interfaces used by the windows/forms ++ ++### Shared Capability Commands ++ ++The node capability command surface is: ++ ++- `voice.devices.list` ++- `voice.settings.get` ++- `voice.settings.set` ++- `voice.status.get` ++- `voice.start` ++- `voice.stop` ++- `voice.pause` ++- `voice.resume` ++- `voice.response.skip` ++ ++These commands are defined in [VoiceModeSchema.cs](../src/OpenClaw.Shared/VoiceModeSchema.cs) and handled by [VoiceCapability.cs](../src/OpenClaw.Shared/Capabilities/VoiceCapability.cs). ++ ++`voice.settings.get` / `voice.settings.set` are the configuration API. ++ ++`voice.start` / `voice.stop` / `voice.pause` / `voice.resume` / `voice.response.skip` are the runtime control API. 
++ ++### Status Surface ++ ++`VoiceStatusInfo` now carries the basic state needed by control surfaces: ++ ++- mode ++- runtime state ++- session key ++- input/output device ids ++- last wake / last utterance timestamps ++- pending reply count ++- whether a reply can currently be skipped ++- current reply preview ++- last error ++ ++### In-Process Tray Interfaces ++ ++The tray app also exposes in-process interfaces so its own windows do not need to bind directly to the concrete `VoiceService` implementation: ++ ++- `IVoiceConfigurationApi` ++ - get voice settings ++ - update voice settings ++ - list devices ++ - get provider catalog ++ - get/set provider configuration ++- `IVoiceRuntimeControlApi` ++ - get runtime status ++ - start / stop ++ - pause / resume ++ - skip current reply ++- `IVoiceRuntime` ++ - transcript draft and conversation events for chat integration ++ ++This now powers multiple tray-local voice surfaces, including the compact voice repeater window. ++ ++### Can the Settings Form Use This API? ++ ++Yes. The Settings form can use the configuration API cleanly. ++ ++The current tray implementation now uses the voice configuration interface for: ++ ++- provider catalog loading ++- device enumeration ++- applying updated voice settings / provider configuration on save ++ ++That means the settings UI is no longer hard-wired only to concrete `VoiceService` internals for its voice-specific behavior. ++ ++## Speech Output Implementation ++ ++In order to reduce output latency as much as possible, the current Windows implementation has made the following implementation decisions: ++ ++- the Windows `SpeechSynthesizer` is created once per `TalkMode` runtime and reused for subsequent replies ++ - Frankly, no one will probably use it, but everyone has it, so... 
++- cloud TTS uses a shared static `HttpClient`, so HTTP/TLS connections can be reused across replies ++- cloud requests use `ResponseHeadersRead`, which lets the client observe response-header arrival without waiting for full buffering first ++- the tray app now logs per-reply synthesis timings for both Windows and cloud TTS paths so latency can be measured directly during testing ++ ++The main remaining gap is streaming playback from the first audio chunk. Best practice recommends chunked playback as soon as the first audio arrives, but the current implementation still waits for a complete playable stream before starting output (but not for long...): ++ ++- Windows `SpeechSynthesizer` is used through `SynthesizeTextToStreamAsync`, which returns a complete stream for playback ++- MiniMax now uses the provider catalog's WebSocket TTS contract, but the current player still waits for a complete playable stream before output starts ++- ElevenLabs now uses the provider catalog's `stream-input` WebSocket contract, but the current player still waits for a complete playable stream before output starts ++ ++So the current design minimizes avoidable setup and connection latency, but does not yet implement first-chunk playback streaming. This is however, planned for an early release (I'm working on it next). ++ ++## Tray Chat Integration Decision ++ ++Ideally Voice mode and typed chat should remain part of the same user-visible conversation in the web chat UI, however this proved difficult to achieve, as the gateway treated a message stream from the tray app seperately to that from the WebUI, even with the same session key. ++ ++The only way of achieving this vaguely reliably seemed to be to locally insert messages into the DOM, but as this was a brittle, hacky solution, it was disgarded. 
++ ++### Chosen Approach ++ ++It was therefore decided to create a separate *voice repeater form* to serve as a message window for voice, as well as making the messages available via toasts. ++ ++The tray app keeps a tray-local interim transcript buffer for the current utterance, independent of whether any chat window or voice repeater form is open. ++ ++## Provider Selection ++ ++Voice settings now carry explicit provider ids for both STT and TTS: ++ ++- `Voice.SpeechToTextProviderId` ++- `Voice.TextToSpeechProviderId` ++ ++The built-in default for both is `windows`. ++ ++Runtime behavior in the current phase: ++ ++- `windows` is implemented for both STT and TTS ++- the `windows` STT route is a pure `Windows.Media.SpeechRecognition.SpeechRecognizer` path with no `AudioGraph` dependency ++- `windows` STT is currently treated as `half-duplex, non-streamed` ++- `http/ws` is now catalogued as a visible "coming soon" STT slot for generic streaming HTTP/WebSocket adapters ++- built-in catalog entries exist for both `minimax` and `elevenlabs` TTS ++- `minimax` defaults to `speech-2.8-turbo` and `English_MatureBoss` at present ++- `minimax` now uses a catalog-driven WebSocket contract for synchronous TTS ++- `elevenlabs` defaults to `eleven_multilingual_v2` and voice id `6aDn1KB0hjpdcocrUkmq (Tiffany)` for now ++- only currently usable providers are selectable in Settings ++- `sherpa-onnx` is visible but greyed out as a coming-soon local embedded route ++- unsupported providers fall back to Windows at runtime with a status warning ++ ++### Settings Surface Notes ++ ++The Settings panel now shows short inline descriptions for: ++ ++- the selected voice mode ++- the selected speech-to-text provider ++- the selected text-to-speech provider ++ ++Those provider descriptions are drawn directly from the provider catalog. 
++ ++When `Windows Speech Recognition` is selected for STT, the Settings panel now forces both audio device pickers back to the system defaults and greys them out. That matches the current Windows route limitation and avoids advertising per-device microphone routing that does not exist on this route yet. ++ ++### Provider Catalog ++ ++The provider catalog now ships with the tray app as a bundled asset: ++ ++- `Assets\\voice-providers.json` ++ ++Example: ++ ++```json ++{ ++ "speechToTextProviders": [ ++ { ++ "id": "windows", ++ "name": "Windows Speech Recognition", ++ "runtime": "windows", ++ "enabled": true, ++ "description": "Built-in Windows.Media speech recognition, half-duplex, non-streamed." ++ }, ++ { ++ "id": "http-ws", ++ "name": "http/ws", ++ "runtime": "streaming", ++ "enabled": false, ++ "visibleInSettings": true, ++ "selectable": false, ++ "description": "Will support most cloud and local stand-alone models full or half-duplex, streaming." ++ }, ++ ], ++ "textToSpeechProviders": [ ++ { ++ "id": "windows", ++ "name": "Windows Speech Synthesis", ++ "runtime": "windows", ++ "enabled": true, ++ "description": "Built-in Windows text-to-speech playback." 
++ }, ++ { ++ "id": "minimax", ++ "name": "MiniMax", ++ "runtime": "cloud", ++ "enabled": true, ++ "description": "Cloud TTS using the MiniMax WebSocket text-to-speech API.", ++ "settings": [ ++ { "key": "apiKey", "label": "API key", "secret": true }, ++ { ++ "key": "model", ++ "label": "Model", ++ "defaultValue": "speech-2.8-turbo", ++ "options": [ ++ "speech-2.5-turbo-preview", ++ "speech-02-turbo", ++ "speech-02-hd", ++ "speech-2.6-turbo", ++ "speech-2.6-hd", ++ "speech-2.8-turbo", ++ "speech-2.8-hd" ++ ] ++ }, ++ { "key": "voiceId", "label": "Voice ID", "defaultValue": "English_MatureBoss" }, ++ { ++ "key": "voiceSettingsJson", ++ "label": "Voice settings JSON", ++ "defaultValue": "\"voice_setting\": { \"voice_id\": {{voiceId}}, \"speed\": 1, \"vol\": 1, \"pitch\": 0 }", ++ "placeholder": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }" ++ } ++ ], ++ "textToSpeechWebSocket": { ++ "endpointTemplate": "wss://api.minimax.io/ws/v1/t2a_v2", ++ "authenticationHeaderName": "Authorization", ++ "authenticationScheme": "Bearer", ++ "apiKeySettingKey": "apiKey", ++ "connectSuccessEventName": "connected_success", ++ "startMessageTemplate": "{ \"event\": \"task_start\", \"model\": {{model}}, \"language_boost\": \"English\", {{voiceSettingsJson}}, \"audio_setting\": { \"sample_rate\": 32000, \"bitrate\": 128000, \"format\": \"mp3\", \"channel\": 1 } }", ++ "startSuccessEventName": "task_started", ++ "continueMessageTemplate": "{ \"event\": \"task_continue\", \"text\": {{text}} }", ++ "finishMessageTemplate": "{ \"event\": \"task_finish\" }", ++ "responseAudioMode": "hexJsonString", ++ "responseAudioJsonPath": "data.audio", ++ "responseStatusCodeJsonPath": "base_resp.status_code", ++ "responseStatusMessageJsonPath": "base_resp.status_msg", ++ "finalFlagJsonPath": "is_final", ++ "taskFailedEventName": "task_failed", ++ "successStatusValue": "0", ++ "outputContentType": "audio/mpeg" ++ } ++ }, ++ { ++ "id": "elevenlabs", ++ 
"name": "ElevenLabs", ++ "runtime": "cloud", ++ "enabled": true, ++ "description": "Cloud TTS using the ElevenLabs WebSocket stream-input API.", ++ "settings": [ ++ { "key": "apiKey", "label": "API key", "secret": true }, ++ { ++ "key": "model", ++ "label": "Model", ++ "defaultValue": "eleven_multilingual_v2", ++ "options": [ ++ "eleven_flash_v2_5", ++ "eleven_turbo_v2_5", ++ "eleven_multilingual_v2", ++ "eleven_monolingual_v1" ++ ] ++ }, ++ { "key": "voiceId", "label": "Voice ID", "defaultValue": "6aDn1KB0hjpdcocrUkmq", "placeholder": "Enter an ElevenLabs voice ID" }, ++ { ++ "key": "voiceSettingsJson", ++ "label": "Voice settings JSON", ++ "defaultValue": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }", ++ "placeholder": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }" ++ } ++ ], ++ "textToSpeechWebSocket": { ++ "endpointTemplate": "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true", ++ "authenticationHeaderName": "xi-api-key", ++ "authenticationScheme": "", ++ "apiKeySettingKey": "apiKey", ++ "connectSuccessEventName": "", ++ "startMessageTemplate": "{ \"text\": \" \", {{voiceSettingsJson}}, \"xi_api_key\": {{apiKey}} }", ++ "startSuccessEventName": "", ++ "continueMessageTemplate": "{ \"text\": {{textWithTrailingSpace}}, \"try_trigger_generation\": true }", ++ "finishMessageTemplate": "{ \"text\": \"\" }", ++ "responseAudioMode": "base64JsonString", ++ "responseAudioJsonPath": "audio", ++ "finalFlagJsonPath": "isFinal", ++ "taskFailedEventName": "error", ++ "outputContentType": "audio/mpeg" ++ } ++ } ++ ] ++} ++``` ++ ++For cloud-backed TTS providers, the catalog carries either an HTTP or WebSocket request/response contract. That allows a new provider to be added by shipping an updated catalog file with the app, as long as it follows the same general templated transport approach. 
++ ++This file defines provider metadata and transport contracts. It does not carry API keys, these are stored with the standard config. ++ ++### Local Provider Configuration ++ ++That means the current design is: ++ ++- local tray settings choose the preferred STT/TTS provider ids ++- provider API keys and editable values are stored in `%APPDATA%\\OpenClawTray\\settings.json` under `VoiceProviderConfiguration` ++- OpenClaw remains the conversation endpoint for `chat.send` ++- the shipped provider catalog remains metadata-only and must not contain secrets ++ ++This is an intentional short-term design choice so the Windows tray app can use cloud TTS providers without inventing a second catalog file for secrets. It can be revisited later if provider ownership is split differently. ++ ++Current configuration values are keyed by provider id. The built-in providers use: ++ ++- `apiKey` ++- `model` ++- `voiceId` ++- `voiceSettingsJson` ++ ++When the selected TTS provider in Settings is not `windows`, the tray app shows provider-specific fields in the configuration form so the user can enter or edit: ++ ++- API key ++- model ++- voice id ++- voice settings JSON ++ ++If a provider setting definition includes an `options` list, the settings UI renders that setting as a drop-down instead of a free-text field. That is how built-in cloud providers expose a provider-level choice plus a separate model choice without recompilation. ++ ++If a provider setting definition is marked as JSON, the value is inserted into the provider request template as a raw JSON fragment rather than a quoted string. That allows the provider catalog to define whether the user is entering: ++ ++- a bare object ++- or a full keyed fragment such as `"voice_setting": { ... }` ++ ++without hard-coding provider-specific wrapper keys into the runtime. 
++ ++The current cloud TTS transports are: ++ ++- `MiniMax`: catalog-driven WebSocket synthesis ++- `ElevenLabs`: catalog-driven WebSocket synthesis (`stream-input`) ++ ++For `VoiceWake`, trigger words are gateway-owned global state. The Windows node should eventually consume the same shared trigger list and keep only a local enabled/disabled toggle plus device/runtime settings. ++ ++In-flight voice controls are supported, if supported by the chosen provider and provided in their format, although an abstraction/translation layer is being considered, to accompany support for OpenClaw voice directives in replies records. ++Pronunciation dictionaries are also only currently supported directly on the voice provider, however a centralised dictionary is possible, and a proposal is being considered. ++ ++## Command Surface ++ ++The voice subsystem is introduced as a new node capability category: `voice`. ++ ++### Commands ++ ++| Command | Purpose | Request Payload | Response Payload | ++|---|---|---|---| ++| `voice.devices.list` | Enumerate input/output audio devices | none | `VoiceAudioDeviceInfo[]` | ++| `voice.settings.get` | Return the effective voice configuration | none | `VoiceSettings` | ++| `voice.settings.set` | Update the voice configuration | `VoiceSettingsUpdateArgs` | `VoiceSettings` | ++| `voice.status.get` | Return runtime voice status | none | `VoiceStatusInfo` | ++| `voice.start` | Start the voice runtime with the supplied or persisted mode | `VoiceStartArgs` | `VoiceStatusInfo` | ++| `voice.stop` | Stop the voice runtime | `VoiceStopArgs` | `VoiceStatusInfo` | ++| `voice.pause` | Pause the active voice runtime | `VoicePauseArgs` | `VoiceStatusInfo` | ++| `voice.resume` | Resume a paused voice runtime | `VoiceResumeArgs` | `VoiceStatusInfo` | ++| `voice.response.skip` | Skip the currently spoken reply and advance the queue if another reply is pending | `VoiceSkipArgs` | `VoiceStatusInfo` | ++ ++### Payload Types ++ ++- `VoiceSettings` ++- 
`VoiceWakeSettings` ++- `TalkModeSettings` ++- `VoiceAudioDeviceInfo` ++- `VoiceStatusInfo` ++- `VoiceStartArgs` ++- `VoiceStopArgs` ++- `VoicePauseArgs` ++- `VoiceResumeArgs` ++- `VoiceSkipArgs` ++- `VoiceSettingsUpdateArgs` ++ ++These contracts are defined in [VoiceModeSchema.cs](../src/OpenClaw.Shared/VoiceModeSchema.cs). ++ ++## Settings Schema ++ ++Voice settings are persisted as `SettingsData.Voice` in [SettingsData.cs](../src/OpenClaw.Shared/SettingsData.cs). ++Provider configuration is persisted as `SettingsData.VoiceProviderConfiguration` in the same local settings file. ++The compact repeater window state is persisted as `SettingsData.VoiceRepeaterWindow` in the same settings file. ++ ++The editable voice configuration now lives in the main Settings window. ++The tray `Voice Mode` window is a read-only runtime status/detail surface with a shortcut back into Settings. ++ ++### Voice Repeater Window Settings ++ ++The compact repeater persists its own local UI state in `SettingsData.VoiceRepeaterWindow`: ++ ++| Setting | Type | Default | Meaning | ++|---|---|---|---| ++| `VoiceRepeaterWindow.AutoScroll` | bool | `true` | Automatically scroll the transcript surface to the latest draft/reply | ++| `VoiceRepeaterWindow.FloatingEnabled` | bool | `true` | Keep the repeater floating above other windows | ++| `VoiceRepeaterWindow.TextSize` | double | `13` | Repeater transcript font size | ++| `VoiceRepeaterWindow.HasSavedPlacement` | bool | `false` | Whether a user placement has been persisted yet | ++| `VoiceRepeaterWindow.Width` | int? | `null` | Saved repeater width | ++| `VoiceRepeaterWindow.Height` | int? | `null` | Saved repeater height | ++| `VoiceRepeaterWindow.X` | int? | `null` | Saved repeater screen X coordinate | ++| `VoiceRepeaterWindow.Y` | int? 
| `null` | Saved repeater screen Y coordinate | ++ ++### Effective Schema ++ ++```json ++{ ++ "Voice": { ++ "Mode": "VoiceWake", ++ "Enabled": true, ++ "ShowRepeaterAtStartup": true, ++ "SpeechToTextProviderId": "windows", ++ "TextToSpeechProviderId": "windows", ++ "InputDeviceId": "default-mic", ++ "OutputDeviceId": "default-speaker", ++ "SampleRateHz": 16000, ++ "CaptureChunkMs": 80, ++ "BargeInEnabled": true, ++ "VoiceWake": { ++ "Engine": "NanoWakeWord", ++ "ModelId": "hey_openclaw", ++ "TriggerThreshold": 0.65, ++ "TriggerCooldownMs": 2000, ++ "PreRollMs": 1200, ++ "EndSilenceMs": 900 ++ }, ++ "TalkMode": { ++ "MinSpeechMs": 250, ++ "EndSilenceMs": 900, ++ "MaxUtteranceMs": 15000 ++ } ++ }, ++ "VoiceProviderConfiguration": { ++ "Providers": [ ++ { ++ "ProviderId": "minimax", ++ "Values": { ++ "apiKey": "", ++ "model": "speech-2.8-turbo", ++ "voiceId": "English_MatureBoss", ++ "voiceSettingsJson": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }" ++ } ++ }, ++ { ++ "ProviderId": "elevenlabs", ++ "Values": { ++ "apiKey": "", ++ "model": "eleven_multilingual_v2", ++ "voiceId": "voice-id", ++ "voiceSettingsJson": "\"voice_settings\": { \"stability\": 0.5, \"similarity_boost\": 0.8 }" ++ } ++ } ++ ] ++ } ++} ++``` ++ ++### Field Rationale ++ ++| Field | Purpose | ++|---|---| ++| `Mode` | Top-level activation mode: `Off`, `VoiceWake`, `TalkMode` | ++| `Enabled` | Global feature kill-switch independent of mode | ++| `ShowRepeaterAtStartup` | Opens the compact Voice Mode repeater automatically when the app starts with voice mode active | ++| `SpeechToTextProviderId` | Selected STT provider id from the local provider catalog | ++| `TextToSpeechProviderId` | Selected TTS provider id from the local provider catalog | ++| `InputDeviceId` / `OutputDeviceId` | Preferred audio device binding, with selected-speaker support implemented first | ++| `SampleRateHz` | Shared capture sample rate, fixed to a speech-friendly default 
| ++| `CaptureChunkMs` | Frame size for capture, VAD, and wakeword processing | ++| `BargeInEnabled` | Allows microphone capture while audio playback is active | ++| `VoiceWake.*` | NanoWakeWord and post-trigger utterance capture tuning | ++| `TalkMode.*` | Continuous-listening segmentation tuning | ++ ++### Complete Settings Definition ++ ++| Setting | Type | Default | Applies To | Meaning | ++|---|---|---|---|---| ++| `Voice.Mode` | enum | `Off` | all | Activation mode: `Off`, `VoiceWake`, `TalkMode` | ++| `Voice.Enabled` | bool | `false` | all | Master enable/disable flag for voice mode | ++| `Voice.ShowRepeaterAtStartup` | bool | `true` | all | If `true`, the compact Voice Mode repeater opens automatically when the app starts with voice mode active | ++| `Voice.SpeechToTextProviderId` | string | `windows` | all | Preferred speech-to-text provider id | ++| `Voice.TextToSpeechProviderId` | string | `windows` | all | Preferred text-to-speech provider id | ++| `Voice.InputDeviceId` | string? | `null` | all | Preferred microphone device id; `null` means system default | ++| `Voice.OutputDeviceId` | string? 
| `null` | all | Preferred speaker device id; `null` means system default | ++| `Voice.SampleRateHz` | int | `16000` | all | Internal capture rate used for wakeword, VAD, and utterance assembly | ++| `Voice.CaptureChunkMs` | int | `80` | all | Audio frame duration used by the capture loop | ++| `Voice.BargeInEnabled` | bool | `true` | all | If `true`, microphone capture may continue while response audio is playing | ++| `Voice.VoiceWake.Engine` | string | `NanoWakeWord` | voice wake | Voice Wake engine identifier | ++| `Voice.VoiceWake.ModelId` | string | `hey_openclaw` | voice wake | Voice Wake model/profile identifier | ++| `Voice.VoiceWake.TriggerThreshold` | float | `0.65` | voice wake | Minimum score required to trigger Voice Wake activation | ++| `Voice.VoiceWake.TriggerCooldownMs` | int | `2000` | voice wake | Minimum delay before another Voice Wake trigger is accepted | ++| `Voice.VoiceWake.PreRollMs` | int | `1200` | voice wake | Buffered audio retained before the trigger point | ++| `Voice.VoiceWake.EndSilenceMs` | int | `900` | voice wake | Silence timeout used to finalize the post-trigger utterance | ++| `Voice.TalkMode.MinSpeechMs` | int | `250` | talk mode | Minimum detected speech duration before an utterance is treated as real input | ++| `Voice.TalkMode.EndSilenceMs` | int | `900` | talk mode | Silence timeout used to finalize an utterance | ++| `Voice.TalkMode.MaxUtteranceMs` | int | `15000` | talk mode | Hard cap on utterance length before forced submission/finalization | ++| `VoiceProviderConfiguration.Providers[].ProviderId` | string | none | cloud providers | Provider id matching an `Assets\\voice-providers.json` entry | ++| `VoiceProviderConfiguration.Providers[].Values["apiKey"]` | string? | `null` | cloud providers | API key sent using the provider contract's configured auth header | ++| `VoiceProviderConfiguration.Providers[].Values["model"]` | string? 
| provider default | cloud providers | Model identifier inserted into the configured request template | ++| `VoiceProviderConfiguration.Providers[].Values["voiceId"]` | string? | provider default | cloud providers | Voice id inserted into the configured request template or URL | ++| `VoiceProviderConfiguration.Providers[].Values["voiceSettingsJson"]` | string? | provider default | cloud providers | Raw JSON fragment inserted into the configured request template; may be a keyed fragment like `"voice_setting": { ... }` | ++ ++At runtime today: ++ ++- `Voice.OutputDeviceId` is applied to Talk Mode playback through `MediaPlayer.AudioDevice` ++- `VoiceCaptureService` now runs an `AudioGraph` capture pipeline in parallel with Talk Mode and binds it to the selected or default microphone device ++- `Voice.InputDeviceId` is now used by that `AudioGraph` capture path, but transcript generation still uses the Windows default speech input path until the STT adapter migration is complete ++- Talk Mode only advertises `ListeningContinuously` after the capture graph has produced live frames and the recognizer warm-up window has elapsed, so the status acts as a real “you can start talking now” signal instead of a timer-only guess ++- recognizer recovery is now speech-triggered rather than silence-triggered: the Windows recognizer is only recycled when sustained capture speech is present but no recognition activity follows ++- when a recognizer session ends after real hypothesis activity but before a final result arrives, Talk Mode now promotes the last recent hypothesis and submits it instead of dropping the utterance ++- the speech-mismatch recovery watchdog is single-owner and only armed from capture speech, so a new recognition session does not spawn overlapping recovery loops ++- when the system default capture device changes and Talk Mode is using the default mic, the recognizer is rebuilt so device switches such as AirPods are picked up without a full app restart ++- 
explicit non-default microphone transcript generation is still pending the planned STT adapter migration ++ ++## Current Runtime Architecture ++ ++The current Windows implementation is still centred on `VoiceService`, with a few supporting seams around it: ++ ++- `VoiceCapability` ++ exposes shared `voice.*` commands to the node/gateway surface ++- `VoiceCaptureService` ++ owns the new `AudioGraph` capture backbone, selected/default microphone binding, and live signal detection ++- `VoiceService` ++ owns Talk Mode runtime state, recognizer/TTS integration, reply queuing, timeouts, gateway reply handling, and the transition layer between `AudioGraph` capture and the current recognizer-owned STT path ++- `VoiceChatCoordinator` ++ mirrors interim transcript drafts and conversation turns into attached tray windows without making any window part of the transport path ++- `OpenClawGatewayClient` ++ carries direct `chat.send`, final chat events, and the `sessions.preview` fallback path for bare final markers ++- `WebChatWindow` ++ mirrors live transcript drafts into the WebChat compose box ++- `VoiceRepeaterWindow` ++ is the compact local transcript/reply/control surface for Talk Mode ++ ++### Current End-to-End Talk Mode ++ ++```mermaid ++flowchart LR ++ A["User speech"] --> B["VoiceCaptureService
AudioGraph on selected/default mic"] ++ A --> C["Windows SpeechRecognizer
continuous dictation on current default mic"] ++ ++ B --> D["FrameCaptured / SignalDetected"] ++ D --> E["VoiceService
capture-backed health + device state"] ++ ++ C --> F["HypothesisGenerated
interim text"] ++ F --> G["VoiceService
draft event"] ++ G --> H["VoiceChatCoordinator"] ++ H --> I["WebChatWindow
compose-box mirror only"] ++ H --> I2["VoiceRepeaterWindow
compact local draft surface"] ++ ++ C --> J["ResultGenerated
final Medium/High text"] ++ J --> K["VoiceService
duplicate guard + late hypothesis promotion"] ++ K --> L["Stop recognition session"] ++ L --> M["OpenClawGatewayClient.SendChatMessageAsync
direct chat.send(agent:main:main, transcript)"] ++ M --> N["OpenClaw / session pipeline"] ++ K --> H2["VoiceChatCoordinator
outgoing turn event"] ++ H2 --> I2 ++ N --> O["Chat final event"] ++ O --> P{"assistant text present?"} ++ P -- "yes" --> Q["assistant text"] ++ P -- "no" --> R["sessions.preview fallback
with stale-preview retry guard"] ++ R --> Q ++ Q --> H3["VoiceChatCoordinator
incoming turn event"] ++ H3 --> I2 ++ ++ Q --> S["VoiceService reply queue"] ++ S --> T{"TTS provider"} ++ T -- "windows" --> U["SpeechSynthesizer"] ++ T -- "cloud" --> V["VoiceCloudTextToSpeechClient
MiniMax websocket or other provider"] ++ U --> W["Complete playable stream"] ++ V --> W ++ W --> X["MediaPlayer
selected OutputDeviceId if set"] ++ X --> Y["Speaker / headset output"] ++ Y --> Z["Resume recognition when queue drains"] ++``` ++ ++### Current Processing Stages ++ ++| Stage | Component | Input | Output | ++|---|---|---|---| ++| 1 | `VoiceCaptureService` | selected/default microphone device | continuous frame and signal events from `AudioGraph` | ++| 2 | `SpeechRecognizer` | Windows default speech-input path | interim/final transcript text | ++| 3 | `VoiceService` | capture signal + final transcript text | health/restart decisions, de-duplicated transcript, runtime state changes | ++| 4 | `VoiceChatCoordinator` | draft and conversation-turn events | mirrored draft for WebChat plus compact local transcript/reply updates | ++| 5 | `OpenClawGatewayClient` | transcript text + session key | `chat.send` request + assistant reply events | ++| 6 | `OpenClawGatewayClient` preview fallback | bare final chat marker | assistant preview text, guarded against stale replay | ++| 7 | `VoiceService` reply queue | assistant reply text | ordered reply playback work | ++| 8 | `VoiceCloudTextToSpeechClient` / `SpeechSynthesizer` | assistant reply text | complete playable audio stream | ++| 9 | `MediaPlayer` | complete playable audio stream | rendered audio on default or selected speaker | ++ ++## Planned AudioGraph Input Architecture ++ ++The next input-phase refactor will move microphone ownership away from `SpeechRecognizer` and into an explicit capture pipeline built around `AudioGraph`. ++ ++The purpose of that change is to unlock: ++ ++- true selected non-default microphone support ++- streaming rather than utterance-owned capture ++- a proper ring buffer and VAD pipeline ++- future non-Windows and streaming STT providers ++- future barge-in / full-duplex work ++ ++### Target Input Stack ++ ++```mermaid ++flowchart TD ++ A["Selected microphone device id
or system default mic"] --> B["VoiceCaptureService
AudioGraph input node"] ++ B --> C["PCM frame stream
fixed chunk duration"] ++ C --> D["Ring buffer
bounded pre-roll"] ++ C --> E["VoiceActivityDetector"] ++ C --> F["VoiceWake engine
later"] ++ C --> G["SpeechToText adapter"] ++ E --> H["UtteranceAssembler
for non-streaming STT adapters"] ++ D --> H ++ H --> G ++ G --> I["Transcript events
interim + final"] ++ I --> J["VoiceService / runtime controller"] ++ J --> K["OpenClawGatewayClient
chat.send + reply events"] ++``` ++ ++### Proposed Seams ++ ++The target split should look like this: ++ ++- `VoiceCaptureService` ++ - owns `AudioGraph` ++ - binds to an explicit input device id when one is selected ++ - emits continuous PCM frames ++- `IVoiceActivityDetector` ++ - emits speech / silence transitions from frame data ++- `IUtteranceAssembler` ++ - builds bounded utterances from frames for non-streaming STT backends ++- `ISpeechToTextAdapter` ++ - consumes either live frames or completed utterances ++ - emits interim and final transcript events ++- `VoiceService` ++ - remains the runtime orchestrator rather than the owner of low-level capture ++ ++## Selected-Device Roadmap ++ ++The current selected-device position is now: ++ ++- selected non-default speaker: implemented ++- selected/default microphone binding for `SpeechRecognizer` capture: implemented ++- selected non-default microphone for actual transcript generation: not implemented yet (requires `AudioGraph` support) ++ ++## Control Flow ++ ++```mermaid ++sequenceDiagram ++ participant Gateway as Gateway / Operator ++ participant VoiceCap as VoiceCapability ++ participant Coord as VoiceService ++ participant Store as SettingsData.Voice ++ ++ Gateway->>VoiceCap: voice.settings.get ++ VoiceCap-->>Gateway: VoiceSettings ++ ++ Gateway->>VoiceCap: voice.settings.set(settings, persist=true) ++ VoiceCap->>Store: save VoiceSettings ++ VoiceCap-->>Gateway: VoiceSettings ++ ++Gateway->>VoiceCap: voice.start(mode=TalkMode, sessionKey=...) ++ VoiceCap->>Coord: Start(VoiceStartArgs) ++Coord-->>VoiceCap: VoiceStatusInfo(state=ListeningContinuously) ++ VoiceCap-->>Gateway: VoiceStatusInfo ++ ++ Gateway->>VoiceCap: voice.status.get ++ VoiceCap-->>Gateway: VoiceStatusInfo ++ ++ Gateway->>VoiceCap: voice.pause(reason=...) ++ VoiceCap->>Coord: Pause() ++ Coord-->>VoiceCap: VoiceStatusInfo(state=Paused) ++ VoiceCap-->>Gateway: VoiceStatusInfo ++ ++ Gateway->>VoiceCap: voice.resume(reason=...) 
++ VoiceCap->>Coord: Resume() ++ Coord-->>VoiceCap: VoiceStatusInfo(state=ListeningContinuously) ++ VoiceCap-->>Gateway: VoiceStatusInfo ++ ++ Gateway->>VoiceCap: voice.response.skip(reason=...) ++ VoiceCap->>Coord: SkipCurrentReply() ++ Coord-->>VoiceCap: VoiceStatusInfo ++ VoiceCap-->>Gateway: VoiceStatusInfo ++ ++ Gateway->>VoiceCap: voice.stop(reason=...) ++ VoiceCap->>Coord: Stop() ++ Coord-->>VoiceCap: VoiceStatusInfo(state=Stopped) ++ VoiceCap-->>Gateway: VoiceStatusInfo ++``` ++ ++## Integration Boundaries ++ ++### Existing Components Reused ++ ++- `NodeService` remains the capability registration and lifecycle owner ++- `SettingsData` remains the persisted JSON settings model ++- `WindowsNodeClient` remains the gateway/node transport ++- existing node capability registration remains the integration pattern ++- current request/response transport remains the v1 control plane ++ ++### Supporting Components In Current Use ++ ++- `VoiceCapability` in `OpenClaw.Shared.Capabilities` ++- `VoiceCaptureService` in `OpenClaw.Tray.WinUI.Services` ++- `VoiceChatCoordinator` in `OpenClaw.Tray.WinUI.Services` ++- `VoiceRepeaterWindow` in `OpenClaw.Tray.WinUI.Windows` ++- `WebChatWindow` in `OpenClaw.Tray.WinUI.Windows` ++ ++### Components Still Expected Later ++ ++- `VoiceWakeService` in `OpenClaw.Tray.WinUI.Services` ++- a dedicated `VoicePlaybackService` seam when playback is split out of `VoiceService` ++ ++## Parity with macOS Node ++ ++Status values used below: ++ ++- `Supported` ++- `Partial` ++- `NotSupported (planned)` ++- `Exceeded*` ++ ++| macOS feature | Current Windows state | Notes | ++|---|---|---| ++| Talk Mode continuous loop (`listen -> chat.send(main) -> wait -> speak`) | `Supported` | Windows Talk Mode uses direct `chat.send` on the tray voice target session (`agent:main:main` today, while still accepting the `main` alias on replies) and loops back to listening after reply playback. 
| ++| Talk Mode sends after a short silence window | `Supported` | The current runtime finalizes on recognition pause and uses configurable Talk Mode silence settings. | ++| Talk Mode visible phase transitions (`Listening -> Thinking -> Speaking`) | `Partial` | Runtime states, tray icon changes, and the compact voice repeater window exist, but there is no always-visible overlay yet. | ++| Talk Mode always-on overlay with click-to-stop / click-X controls | `NotSupported (planned)` | Windows currently has a tray icon, a manually-opened compact repeater window, and WebChat draft mirroring, but no always-on overlay surface. | ++| Talk Mode writes replies into WebChat the same way typed chat does | `Partial` | Replies appear in WebChat through normal session updates, but Talk Mode uses direct send rather than a same-as-typing transport path. | ++| Talk Mode interrupt-on-speech / barge-in | `NotSupported (planned)` | Windows is still half-duplex during reply playback. | ++| Talk Mode voice directives in replies | `NotSupported (planned)` | Windows does not yet parse or apply the JSON voice directive line described in the Talk Mode docs. | ++| Talk Mode true streaming TTS playback | `NotSupported (planned)` | MiniMax uses WebSocket transport, but playback still waits for a complete playable stream. | ++| Talk Mode cloud TTS provider flexibility | `Exceeded` | Windows already supports Windows built-in TTS plus catalog-driven cloud providers rather than being limited to a single provider path. This exceeds the documented macOS baseline on provider flexibility, but not yet on true streaming playback latency because incremental playback is still pending. | ++| Voice Wake wake-word runtime | `NotSupported (planned)` | `VoiceWake` remains a documented target mode, but there is no active wake-word runtime yet. | ++| Voice Wake push-to-talk capture | `NotSupported (planned)` | There is no Windows push-to-talk path yet. 
| ++| Voice Wake overlay with committed / volatile transcript states | `NotSupported (planned)` | No Voice Wake overlay exists on Windows yet. | ++| Voice Wake restart invariants when UI is dismissed | `NotSupported (planned)` | The macOS overlay-dismiss resilience behavior has no Windows equivalent yet because the overlay/runtime does not exist. | ++| Voice Wake forwarding to the active gateway / agent | `NotSupported (planned)` | Forwarding semantics are only implemented for Talk Mode today. | ++| Voice Wake machine-hint transcript prefixing | `NotSupported (planned)` | Windows does not currently prepend a machine hint on forwarded wake transcripts. | ++| Voice Wake mic picker, live level meter, trigger-word table, and tester | `NotSupported (planned)` | Windows has general voice settings and device lists, but not the Voice Wake-specific settings surface from macOS. | ++| Voice mic device selection | `Partial` | When `Windows Speech Recognition` is selected, Settings now locks both audio device pickers to the system defaults. Explicit per-device transcription routing remains a future AudioGraph/streaming-route feature. | ++| Voice Wake send / trigger chimes | `NotSupported (planned)` | Windows currently has no configurable trigger/send sounds. | ++ ++## Feature List - Backlog - Not in Order, except maybe the first two ;) ++ ++### Story: Streaming STT Capture Pipeline ++ ++Implement `AudioGraph` to create an extensible streaming speech input pipeline, rather than the current self-contained `Windows.Media.SpeechRecognizer` pipeline. ++ ++This will allow us to mix/match components, and reduce latency. 
++ ++- Will support Cloud or Local http/ws providers (including Microsoft Foundry Local/OpenAI Whisper/etc) ++- Will support Embedded sherpa-onnx engine for user-defined/downloaded models ++- This will enable selection of best of class model for required use/language ++ ++### Story: True streaming TTS playback ++ ++Start speaking assistant replies from the first usable audio chunk instead of waiting for a complete playable stream. ++ ++Notes: ++ ++- the current implementation uses WebSocket transport for MiniMax, but still buffers the entire audio response before playback begins ++- `firstChunk=...ms` in the log is currently provider-chunk arrival time, not actual speech-start time ++- implement a playback path that can consume incremental audio data as it arrives from the provider ++- the provider catalog contract should remain transport-driven and provider-agnostic, so streaming behavior should be expressed through the existing TTS contract model rather than hard-coded for MiniMax ++- preserve the existing queued reply behavior, skip support, and late-reply handling while switching playback to progressive output ++- add timing logs that separate `firstChunk`, `playbackStart`, and `playbackEnd` so latency improvements are measurable ++ ++### Story: True selected-microphone transcription support ++ ++Make actual STT transcription follow the selected microphone device, not just the default device. ++ ++- depends on `AudioGraph` support ++ ++ ++### Story: Talk Mode overlay and visible phase parity ++ ++Add a Talk Mode overlay that makes `Listening`, `Thinking`, and `Speaking` visible to the user in the same way the macOS experience does. Probably via the current voice mode form. I haven't actually seen the macOS version, so not sure how they do it. ++ ++ ++### Story: Talk Mode overlay controls ++ ++Add explicit Talk Mode overlay controls for stopping speech playback and exiting Talk Mode. 
++ ++Notes: ++ ++- macOS exposes click-to-stop and click-to-exit controls directly on the overlay ++- Windows currently requires tray or settings interaction instead ++- this should plug into the shared runtime control API rather than directly manipulating `VoiceService` ++ ++ ++### Story: Voice directives in replies ++ ++Support the Talk Mode reply-prefix JSON directive described in the OpenClaw docs. ++ ++Notes: ++ ++- parse only the first non-empty reply line ++- strip the directive before playback ++- support per-reply `once: true` and persistent default updates ++- supported keys should at least include voice, model, and the documented voice-shaping parameters ++- provider-specific validation should happen through the provider contract layer where possible ++ ++### Story: Foundry Local STT provider ++ ++Implement the AudioGraph-fed streaming STT adapter for Foundry Local. ++ ++Notes: ++ ++- provider metadata now lives in the provider catalog, but it should stay disabled in settings until the runtime adapter exists ++- this route should use the shared streaming STT path rather than the Windows.Media recognizer path ++- endpoint and model selection should come from the provider catalog settings contract ++ ++### Story: OpenAI Whisper STT provider ++ ++Implement the AudioGraph-fed streaming STT adapter for OpenAI Whisper transcription. ++ ++Notes: ++ ++- this should be catalog-driven and disabled in settings until the adapter is production-ready ++- the initial implementation only needs the basic transcription path, not translation or diarization ++- API key and model configuration should come from the provider catalog ++ ++### Story: ElevenLabs Speech to Text provider ++ ++Implement the AudioGraph-fed streaming STT adapter for ElevenLabs speech-to-text. 
++ ++Notes: ++ ++- keep it catalog-driven and disabled in settings until the runtime path is implemented ++- match the same route abstraction used by the other non-Windows STT providers ++- any provider-specific partial/final transcript semantics should be normalized in the adapter layer ++ ++### Story: Azure AI Speech STT provider ++ ++Implement the AudioGraph-fed streaming STT adapter for Azure AI Speech. ++ ++Notes: ++ ++- use the official Azure AI Speech naming in settings and docs rather than an internal "Foundry Azure STT" label ++- keep the provider catalog entry disabled until the adapter is functional end to end ++- endpoint and credential handling should come from the provider settings contract ++ ++### Story: sherpa-onnx embedded STT provider ++ ++Implement the local embedded sherpa-onnx STT route for user-supplied model bundles. ++ ++Notes: ++ ++- keep this visible but greyed out in settings until the embedded runtime is implemented ++- the user should be able to choose their own downloaded model bundle and language-appropriate package ++- model lifecycle, validation, and error reporting should be handled in the embedded adapter rather than in the Windows.Media route ++ ++ ++### Story: Full-duplex / barge-in Talk Mode ++ ++Allow the node to keep listening while it is speaking, so the user can interrupt or interleave speech without waiting for reply playback to finish. 
++ ++Notes: ++ ++- the current Windows implementation is half-duplex: recognition is stopped or ignored while a reply is being spoken ++- practical requirements are likely to include: ++ - microphone capture that can remain active during playback ++ - acoustic echo cancellation / echo suppression ++ - barge-in detection and playback interruption rules ++ - a policy for whether interrupt speech cancels the current reply or queues behind it ++ - additional runtime control/status so the UI can show when barge-in is armed ++- this should be treated as a separate engineering phase, not a small extension of the current Talk Mode runtime ++ ++### Story: Voice Wake wake-word runtime ++ ++Implement the actual Windows Voice Wake runtime. ++ ++Notes: ++ ++- this should cover wake-word listening, trigger detection, post-trigger capture, silence finalization, hard-stop protection, and debounce between sessions ++- the runtime should restart cleanly after send and should remain armed whenever Voice Wake is enabled and permissions are available ++- the implementation should be based on the planned `AudioGraph` capture pipeline rather than a second unrelated microphone stack ++ ++### Story: Voice Wake push-to-talk ++ ++Implement a Windows push-to-talk capture path alongside wake-word activation. ++ ++Notes: ++ ++- this should support press-to-capture, release-to-finalize semantics ++- it should pause the wake runtime while push-to-talk capture is active, then resume it cleanly afterward ++- Windows-specific hotkey and permissions behavior should be documented explicitly once chosen ++ ++### Story: Voice Wake settings parity ++ ++Add the user-facing Voice Wake settings surface that exists on macOS. 
++ ++Notes: ++ ++- include language and mic pickers ++- include a live level meter ++- include trigger-word editing or table management ++- include a local-only tester that does not forward ++- preserve the chosen mic if it disconnects, surface a disconnected hint, and fall back to the system default until it returns ++ ++### Story: Voice Wake sounds and chimes ++ ++Add configurable trigger and send sounds for Voice Wake. ++ ++Notes: ++ ++- trigger and send events should be independently configurable ++- support `No Sound` ++- keep the sound implementation distinct from assistant reply playback ++ ++### Story: Voice Wake forwarding semantics ++ ++Implement the documented Voice Wake forwarding behavior. ++ ++Notes: ++ ++- forwarded transcripts should go to the active gateway / agent path ++- reply delivery and logging behavior should match the rest of the node session model ++- the forwarding path should be resilient even when UI surfaces are closed ++ ++### Story: Voice Wake machine-hint prefixing ++ ++Implement the documented transcript prefixing / machine-hint behavior for forwarded Voice Wake utterances. ++ ++Notes: ++ ++- the prefixing rule should be explicit and testable ++- both wake-word and push-to-talk paths should share the same forwarding helper ++ ++### Story: Voice Wake trigger tuning and pause semantics ++ ++Implement the documented Voice Wake trigger-gap, silence-window, hard-stop, and debounce semantics. 
++ ++Notes: ++ ++- include the wake-word gap behavior before command capture begins ++- support distinct silence windows for trigger-only vs flowing speech cases ++- include a hard maximum capture duration ++- expose the tuning through voice settings rather than hard-coded constants alone ++ +diff --git a/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs b/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs +new file mode 100644 +index 0000000..728b8fd +--- /dev/null ++++ b/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs +@@ -0,0 +1,248 @@ ++using System; ++using System.Collections.Generic; ++using System.Text.Json; ++using System.Threading.Tasks; ++ ++namespace OpenClaw.Shared.Capabilities; ++ ++public class VoiceCapability : NodeCapabilityBase ++{ ++ private const string LegacySkipCommand = "voice.skip"; ++ ++ private static readonly JsonSerializerOptions s_jsonOptions = new() ++ { ++ PropertyNameCaseInsensitive = true ++ }; ++ ++ public override string Category => "voice"; ++ ++ public override IReadOnlyList Commands => VoiceCommands.All; ++ ++ public event Func>? ListDevicesRequested; ++ public event Func>? SettingsRequested; ++ public event Func>? SettingsUpdateRequested; ++ public event Func>? StatusRequested; ++ public event Func>? StartRequested; ++ public event Func>? StopRequested; ++ public event Func>? PauseRequested; ++ public event Func>? ResumeRequested; ++ public event Func>? 
SkipRequested; ++ ++ public VoiceCapability(IOpenClawLogger logger) : base(logger) ++ { ++ } ++ ++ public override async Task ExecuteAsync(NodeInvokeRequest request) ++ { ++ return request.Command switch ++ { ++ VoiceCommands.ListDevices => await HandleListDevicesAsync(), ++ VoiceCommands.GetSettings => await HandleGetSettingsAsync(), ++ VoiceCommands.SetSettings => await HandleSetSettingsAsync(request), ++ VoiceCommands.GetStatus => await HandleGetStatusAsync(), ++ VoiceCommands.Start => await HandleStartAsync(request), ++ VoiceCommands.Stop => await HandleStopAsync(request), ++ VoiceCommands.Pause => await HandlePauseAsync(request), ++ VoiceCommands.Resume => await HandleResumeAsync(request), ++ VoiceCommands.Skip or LegacySkipCommand => await HandleSkipAsync(request), ++ _ => Error($"Unknown command: {request.Command}") ++ }; ++ } ++ ++ private async Task HandleListDevicesAsync() ++ { ++ Logger.Info(VoiceCommands.ListDevices); ++ ++ if (ListDevicesRequested == null) ++ return Error("Voice device enumeration not available"); ++ ++ try ++ { ++ return Success(await ListDevicesRequested()); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice device enumeration failed", ex); ++ return Error($"Device enumeration failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleGetSettingsAsync() ++ { ++ Logger.Info(VoiceCommands.GetSettings); ++ ++ if (SettingsRequested == null) ++ return Error("Voice settings not available"); ++ ++ try ++ { ++ return Success(await SettingsRequested()); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice settings get failed", ex); ++ return Error($"Get settings failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleSetSettingsAsync(NodeInvokeRequest request) ++ { ++ Logger.Info(VoiceCommands.SetSettings); ++ ++ if (SettingsUpdateRequested == null) ++ return Error("Voice settings update not available"); ++ ++ try ++ { ++ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null ++ ? 
"{}" ++ : request.Args.GetRawText(); ++ VoiceSettingsUpdateArgs? update = null; ++ if (request.Args.ValueKind == JsonValueKind.Object && ++ request.Args.TryGetProperty("update", out var updateEl)) ++ { ++ update = JsonSerializer.Deserialize(updateEl.GetRawText(), s_jsonOptions); ++ } ++ ++ update ??= JsonSerializer.Deserialize(rawArgs, s_jsonOptions); ++ ++ if (update == null) ++ return Error("Missing update payload"); ++ ++ return Success(await SettingsUpdateRequested(update)); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice settings update failed", ex); ++ return Error($"Set settings failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleGetStatusAsync() ++ { ++ Logger.Info(VoiceCommands.GetStatus); ++ ++ if (StatusRequested == null) ++ return Error("Voice status not available"); ++ ++ try ++ { ++ return Success(await StatusRequested()); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice status get failed", ex); ++ return Error($"Get status failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleStartAsync(NodeInvokeRequest request) ++ { ++ Logger.Info(VoiceCommands.Start); ++ ++ if (StartRequested == null) ++ return Error("Voice start not available"); ++ ++ try ++ { ++ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null ++ ? "{}" ++ : request.Args.GetRawText(); ++ var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? new VoiceStartArgs(); ++ return Success(await StartRequested(args)); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice start failed", ex); ++ return Error($"Start failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleStopAsync(NodeInvokeRequest request) ++ { ++ Logger.Info(VoiceCommands.Stop); ++ ++ if (StopRequested == null) ++ return Error("Voice stop not available"); ++ ++ try ++ { ++ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null ++ ? 
"{}" ++ : request.Args.GetRawText(); ++ var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? new VoiceStopArgs(); ++ return Success(await StopRequested(args)); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice stop failed", ex); ++ return Error($"Stop failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandlePauseAsync(NodeInvokeRequest request) ++ { ++ Logger.Info(VoiceCommands.Pause); ++ ++ if (PauseRequested == null) ++ return Error("Voice pause not available"); ++ ++ try ++ { ++ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null ++ ? "{}" ++ : request.Args.GetRawText(); ++ var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? new VoicePauseArgs(); ++ return Success(await PauseRequested(args)); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice pause failed", ex); ++ return Error($"Pause failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleResumeAsync(NodeInvokeRequest request) ++ { ++ Logger.Info(VoiceCommands.Resume); ++ ++ if (ResumeRequested == null) ++ return Error("Voice resume not available"); ++ ++ try ++ { ++ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null ++ ? "{}" ++ : request.Args.GetRawText(); ++ var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? new VoiceResumeArgs(); ++ return Success(await ResumeRequested(args)); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice resume failed", ex); ++ return Error($"Resume failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleSkipAsync(NodeInvokeRequest request) ++ { ++ Logger.Info(VoiceCommands.Skip); ++ ++ if (SkipRequested == null) ++ return Error("Voice skip not available"); ++ ++ try ++ { ++ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null ++ ? "{}" ++ : request.Args.GetRawText(); ++ var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? 
new VoiceSkipArgs(); ++ return Success(await SkipRequested(args)); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice skip failed", ex); ++ return Error($"Skip failed: {ex.Message}"); ++ } ++ } ++} +diff --git a/src/OpenClaw.Shared/Models.cs b/src/OpenClaw.Shared/Models.cs +index 725a66b..5b56953 100644 +--- a/src/OpenClaw.Shared/Models.cs ++++ b/src/OpenClaw.Shared/Models.cs +@@ -86,6 +86,14 @@ public class OpenClawNotification + public string[]? Tags { get; set; } // free-form routing tags + } + ++public class ChatMessageEventArgs : EventArgs ++{ ++ public string SessionKey { get; set; } = "main"; ++ public string Role { get; set; } = ""; ++ public string Message { get; set; } = ""; ++ public bool IsFinal { get; set; } ++} ++ + /// + /// A user-defined notification categorization rule. + /// +diff --git a/src/OpenClaw.Shared/OpenClawGatewayClient.cs b/src/OpenClaw.Shared/OpenClawGatewayClient.cs +index 0e21836..80623c7 100644 +--- a/src/OpenClaw.Shared/OpenClawGatewayClient.cs ++++ b/src/OpenClaw.Shared/OpenClawGatewayClient.cs +@@ -15,13 +15,24 @@ public class OpenClawGatewayClient : WebSocketClientBase + private GatewayUsageStatusInfo? _usageStatus; + private GatewayCostUsageInfo? _usageCost; + private readonly Dictionary _pendingRequestMethods = new(); ++ private readonly Dictionary _pendingChatPreviewSessionKeys = new(); ++ private readonly Dictionary _lastAssistantMessagesBySession = new(); + private readonly object _pendingRequestLock = new(); ++ private readonly object _pendingChatPreviewLock = new(); + private readonly object _sessionsLock = new(); + private readonly object _nodesLock = new(); + private bool _usageStatusUnsupported; + private bool _usageCostUnsupported; + private bool _sessionPreviewUnsupported; + private bool _nodeListUnsupported; ++ private string _defaultChatSessionKey = DefaultChatSessionKey; ++ ++ private const string DefaultChatSessionKey = "main"; ++ private sealed class PendingChatPreviewState ++ { ++ public string? 
LastKnownAssistantText { get; init; } ++ public int AttemptCount { get; set; } ++ } + + private void ResetUnsupportedMethodFlags() + { +@@ -49,15 +60,18 @@ protected override Task OnConnectedAsync() + protected override void OnDisconnected() + { + ClearPendingRequests(); ++ ClearPendingChatPreviewSessions(); + } + + protected override void OnDisposing() + { + ClearPendingRequests(); ++ ClearPendingChatPreviewSessions(); + } + + // Events + public event EventHandler? NotificationReceived; ++ public event EventHandler? ChatMessageReceived; + public event EventHandler? ActivityChanged; + public event EventHandler? ChannelHealthUpdated; + public event EventHandler? SessionsUpdated; +@@ -118,19 +132,29 @@ public async Task CheckHealthAsync() + } + } + +- public async Task SendChatMessageAsync(string message) ++ public async Task SendChatMessageAsync(string message, string? sessionKey = null, string? idempotencyKey = null) + { + if (!IsConnected) + throw new InvalidOperationException("Gateway connection is not open"); + +- var req = new ++ var requestId = Guid.NewGuid().ToString(); ++ var resolvedSessionKey = ResolveChatSessionKey(sessionKey); ++ var resolvedIdempotencyKey = string.IsNullOrWhiteSpace(idempotencyKey) ++ ? Guid.NewGuid().ToString() ++ : idempotencyKey; ++ var parameters = BuildChatSendParameters(message, resolvedSessionKey, resolvedIdempotencyKey); ++ ++ TrackPendingRequest(requestId, "chat.send"); ++ try + { +- type = "req", +- id = Guid.NewGuid().ToString(), +- method = "chat.send", +- @params = new { message } +- }; +- await SendRawAsync(JsonSerializer.Serialize(req)); ++ await SendRawAsync(SerializeRequest(requestId, "chat.send", parameters)); ++ } ++ catch ++ { ++ RemovePendingRequest(requestId); ++ throw; ++ } ++ + _logger.Info($"Sent chat message ({message.Length} chars)"); + } + +@@ -281,37 +305,41 @@ public async Task StopChannelAsync(string channelName) + + private async Task SendConnectMessageAsync(string? 
nonce = null) + { +- // Use "cli" client ID for native apps - no browser security checks + var msg = new + { + type = "req", + id = Guid.NewGuid().ToString(), + method = "connect", +- @params = new +- { +- minProtocol = 3, +- maxProtocol = 3, +- client = new +- { +- id = "cli", // Native client ID +- version = "1.0.0", +- platform = "windows", +- mode = "cli", +- displayName = "OpenClaw Windows Tray" +- }, +- role = "operator", +- scopes = new[] { "operator.admin", "operator.approvals", "operator.pairing" }, +- caps = Array.Empty(), +- commands = Array.Empty(), +- permissions = new { }, +- auth = new { token = _token }, +- locale = "en-US", +- userAgent = "openclaw-windows-tray/1.0.0" +- } ++ @params = BuildConnectParameters() + }; + await SendRawAsync(JsonSerializer.Serialize(msg)); + } + ++ private object BuildConnectParameters() ++ { ++ return new ++ { ++ minProtocol = 3, ++ maxProtocol = 3, ++ client = new ++ { ++ id = "cli", ++ version = "1.0.0", ++ platform = "windows", ++ mode = "cli", ++ displayName = "OpenClaw Windows Tray" ++ }, ++ role = "operator", ++ scopes = new[] { "operator.read", "operator.write", "operator.admin", "operator.approvals", "operator.pairing" }, ++ caps = Array.Empty(), ++ commands = Array.Empty(), ++ permissions = new { }, ++ auth = new { token = _token }, ++ locale = "en-US", ++ userAgent = "openclaw-windows-tray/1.0.0" ++ }; ++ } ++ + private async Task SendTrackedRequestAsync(string method, object? 
parameters = null) + { + if (!IsConnected) return; +@@ -456,6 +484,7 @@ private void HandleResponse(JsonElement root) + // Handle hello-ok + if (payload.TryGetProperty("type", out var t) && t.GetString() == "hello-ok") + { ++ UpdateDefaultChatSessionKeyFromHello(payload); + _logger.Info("Handshake complete (hello-ok)"); + RaiseStatusChanged(ConnectionStatus.Connected); + +@@ -799,13 +828,18 @@ private void HandleToolEvent(JsonElement payload, string sessionKey, bool isMain + private void HandleChatEvent(JsonElement root) + { + _logger.Debug($"Chat event received: {root.GetRawText().Substring(0, Math.Min(200, root.GetRawText().Length))}"); +- ++ + if (!root.TryGetProperty("payload", out var payload)) return; ++ var sessionKey = NormalizeChatSessionKey(TryGetSessionKey(root, payload)); ++ var isFinal = !payload.TryGetProperty("state", out var state) || ++ string.Equals(state.GetString(), "final", StringComparison.OrdinalIgnoreCase); ++ var emittedAssistantText = false; + + // Try new format: payload.message.role + payload.message.content[].text + if (payload.TryGetProperty("message", out var message)) + { +- if (message.TryGetProperty("role", out var role) && role.GetString() == "assistant") ++ var roleName = GetString(message, "role"); ++ if (roleName == "assistant") + { + // Extract text from content array + if (message.TryGetProperty("content", out var content) && content.ValueKind == JsonValueKind.Array) +@@ -816,11 +850,11 @@ private void HandleChatEvent(JsonElement root) + item.TryGetProperty("text", out var textProp)) + { + var text = textProp.GetString() ?? ""; +- if (!string.IsNullOrEmpty(text) && +- payload.TryGetProperty("state", out var state) && +- state.GetString() == "final") ++ if (!string.IsNullOrEmpty(text) && isFinal) + { ++ emittedAssistantText = true; + _logger.Info($"Assistant response: {text.Substring(0, Math.Min(100, text.Length))}..."); ++ EmitChatMessage(sessionKey, roleName ?? 
"assistant", text, isFinal); + EmitChatNotification(text); + } + } +@@ -833,14 +867,40 @@ private void HandleChatEvent(JsonElement root) + else if (payload.TryGetProperty("text", out var textProp)) + { + var text = textProp.GetString() ?? ""; +- if (payload.TryGetProperty("role", out var role) && +- role.GetString() == "assistant" && ++ var roleName = GetString(payload, "role"); ++ if (roleName == "assistant" && + !string.IsNullOrEmpty(text)) + { ++ emittedAssistantText = true; + _logger.Info($"Assistant response (legacy): {text.Substring(0, Math.Min(100, text.Length))}"); ++ EmitChatMessage(sessionKey, roleName, text, isFinal: true); + EmitChatNotification(text); + } + } ++ ++ if (isFinal && !emittedAssistantText) ++ { ++ RequestChatPreviewForFinalState(sessionKey); ++ } ++ } ++ ++ private void EmitChatMessage(string sessionKey, string role, string text, bool isFinal) ++ { ++ if (isFinal && string.Equals(role, "assistant", StringComparison.OrdinalIgnoreCase)) ++ { ++ lock (_pendingChatPreviewLock) ++ { ++ _lastAssistantMessagesBySession[NormalizeChatSessionKey(sessionKey)] = text; ++ } ++ } ++ ++ ChatMessageReceived?.Invoke(this, new ChatMessageEventArgs ++ { ++ SessionKey = sessionKey, ++ Role = role, ++ Message = text, ++ IsFinal = isFinal ++ }); + } + + private void EmitChatNotification(string text) +@@ -1053,6 +1113,7 @@ private void ParseSessions(JsonElement sessions) + } + + snapshot = GetSessionListInternal(); ++ UpdateDefaultChatSessionKeyFromSessions(); + } + + SessionsUpdated?.Invoke(this, snapshot); +@@ -1081,6 +1142,205 @@ private void ParseSessionItem(JsonElement item) + PopulateSessionFromObject(session, item); + + _sessions[session.Key] = session; ++ if (session.IsMain) ++ { ++ UpdateDefaultChatSessionKey(session.Key); ++ } ++ } ++ ++ private object BuildChatSendParameters(string message, string sessionKey, string idempotencyKey) ++ { ++ return new ++ { ++ message, ++ sessionKey, ++ idempotencyKey ++ }; ++ } ++ ++ private string 
ResolveChatSessionKey(string? sessionKey) ++ { ++ if (!string.IsNullOrWhiteSpace(sessionKey)) ++ { ++ return NormalizeChatSessionKey(sessionKey); ++ } ++ ++ return string.IsNullOrWhiteSpace(_defaultChatSessionKey) ++ ? DefaultChatSessionKey ++ : _defaultChatSessionKey; ++ } ++ ++ private void UpdateDefaultChatSessionKeyFromHello(JsonElement payload) ++ { ++ if (!payload.TryGetProperty("snapshot", out var snapshot) || ++ snapshot.ValueKind != JsonValueKind.Object || ++ !snapshot.TryGetProperty("sessionDefaults", out var sessionDefaults) || ++ sessionDefaults.ValueKind != JsonValueKind.Object) ++ { ++ return; ++ } ++ ++ var mainSessionKey = GetString(sessionDefaults, "mainKey") ?? ++ GetString(sessionDefaults, "mainSessionKey"); ++ if (!string.IsNullOrWhiteSpace(mainSessionKey)) ++ { ++ UpdateDefaultChatSessionKey(mainSessionKey); ++ } ++ } ++ ++ private void UpdateDefaultChatSessionKeyFromSessions() ++ { ++ var mainSession = _sessions.Values.FirstOrDefault(s => s.IsMain && !string.IsNullOrWhiteSpace(s.Key)); ++ if (!string.IsNullOrWhiteSpace(mainSession?.Key)) ++ { ++ UpdateDefaultChatSessionKey(mainSession.Key); ++ } ++ } ++ ++ private void UpdateDefaultChatSessionKey(string? sessionKey) ++ { ++ if (!string.IsNullOrWhiteSpace(sessionKey)) ++ { ++ _defaultChatSessionKey = NormalizeChatSessionKey(sessionKey); ++ } ++ } ++ ++ private void RequestChatPreviewForFinalState(string sessionKey) ++ { ++ if (string.IsNullOrWhiteSpace(sessionKey) || _sessionPreviewUnsupported) ++ { ++ return; ++ } ++ ++ var normalizedSessionKey = NormalizeChatSessionKey(sessionKey); ++ string? 
lastKnownAssistantText; ++ lock (_pendingChatPreviewLock) ++ { ++ if (_pendingChatPreviewSessionKeys.ContainsKey(normalizedSessionKey)) ++ { ++ return; ++ } ++ ++ _lastAssistantMessagesBySession.TryGetValue(normalizedSessionKey, out lastKnownAssistantText); ++ _pendingChatPreviewSessionKeys[normalizedSessionKey] = new PendingChatPreviewState ++ { ++ LastKnownAssistantText = lastKnownAssistantText, ++ AttemptCount = 0 ++ }; ++ } ++ ++ RequestChatPreviewForFinalStateAsync(normalizedSessionKey, delayMs: 0); ++ } ++ ++ private void RequestChatPreviewForFinalStateAsync(string normalizedSessionKey, int delayMs) ++ { ++ _ = Task.Run(async () => ++ { ++ try ++ { ++ if (delayMs > 0) ++ { ++ await Task.Delay(delayMs); ++ } ++ ++ await RequestSessionPreviewAsync([normalizedSessionKey], limit: 2, maxChars: 4000); ++ } ++ catch (Exception ex) ++ { ++ _logger.Warn($"sessions.preview request failed for {normalizedSessionKey}: {ex.Message}"); ++ lock (_pendingChatPreviewLock) ++ { ++ _pendingChatPreviewSessionKeys.Remove(normalizedSessionKey); ++ } ++ } ++ }); ++ } ++ ++ private void EmitPendingChatPreviewMessages(SessionsPreviewPayloadInfo payload) ++ { ++ foreach (var preview in payload.Previews) ++ { ++ var normalizedSessionKey = NormalizeChatSessionKey(preview.Key); ++ PendingChatPreviewState? pendingState = null; ++ ++ lock (_pendingChatPreviewLock) ++ { ++ _pendingChatPreviewSessionKeys.TryGetValue(normalizedSessionKey, out pendingState); ++ } ++ ++ if (pendingState == null) ++ { ++ continue; ++ } ++ ++ var assistantText = preview.Items ++ .LastOrDefault(item => string.Equals(item.Role, "assistant", StringComparison.OrdinalIgnoreCase))? ++ .Text? 
++ .Trim(); ++ ++ if (string.IsNullOrWhiteSpace(assistantText)) ++ { ++ continue; ++ } ++ ++ if (string.Equals(assistantText, pendingState.LastKnownAssistantText, StringComparison.Ordinal)) ++ { ++ if (pendingState.AttemptCount < 3) ++ { ++ pendingState.AttemptCount++; ++ _logger.Warn( ++ $"sessions.preview returned the previous assistant reply for {normalizedSessionKey}; retrying preview ({pendingState.AttemptCount}/3)"); ++ RequestChatPreviewForFinalStateAsync(normalizedSessionKey, delayMs: 400 * pendingState.AttemptCount); ++ continue; ++ } ++ } ++ ++ lock (_pendingChatPreviewLock) ++ { ++ _pendingChatPreviewSessionKeys.Remove(normalizedSessionKey); ++ } ++ ++ _logger.Info($"Assistant response (preview): {assistantText.Substring(0, Math.Min(100, assistantText.Length))}..."); ++ EmitChatMessage(normalizedSessionKey, "assistant", assistantText, isFinal: true); ++ EmitChatNotification(assistantText); ++ } ++ } ++ ++ private void ClearPendingChatPreviewSessions() ++ { ++ lock (_pendingChatPreviewLock) ++ { ++ _pendingChatPreviewSessionKeys.Clear(); ++ _lastAssistantMessagesBySession.Clear(); ++ } ++ } ++ ++ private static string NormalizeChatSessionKey(string? sessionKey) ++ { ++ if (string.IsNullOrWhiteSpace(sessionKey)) ++ { ++ return DefaultChatSessionKey; ++ } ++ ++ return sessionKey == "main" || sessionKey.Contains(":main:", StringComparison.Ordinal) ++ ? DefaultChatSessionKey ++ : sessionKey; ++ } ++ ++ private static string? 
TryGetSessionKey(JsonElement root, JsonElement payload) ++ { ++ if (root.TryGetProperty("sessionKey", out var rootSessionKey)) ++ { ++ return rootSessionKey.GetString(); ++ } ++ ++ if (payload.ValueKind == JsonValueKind.Object && ++ payload.TryGetProperty("sessionKey", out var payloadSessionKey)) ++ { ++ return payloadSessionKey.GetString(); ++ } ++ ++ return null; + } + + private void PopulateSessionFromObject(SessionInfo session, JsonElement item) +@@ -1394,6 +1654,7 @@ private void ParseSessionsPreview(JsonElement payload) + } + + SessionPreviewUpdated?.Invoke(this, previewPayload); ++ EmitPendingChatPreviewMessages(previewPayload); + } + catch (Exception ex) + { +diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs +index 4c7b075..56d8e4c 100644 +--- a/src/OpenClaw.Shared/SettingsData.cs ++++ b/src/OpenClaw.Shared/SettingsData.cs +@@ -1,3 +1,5 @@ ++using System; ++using System.Text.Json.Serialization; + using System.Text.Json; + + namespace OpenClaw.Shared; +@@ -26,6 +28,11 @@ public class SettingsData + public bool NotifyChatResponses { get; set; } = true; + public bool PreferStructuredCategories { get; set; } = true; + public List? UserRules { get; set; } ++ public VoiceSettings Voice { get; set; } = new(); ++ public VoiceRepeaterWindowSettings VoiceRepeaterWindow { get; set; } = new(); ++ public VoiceProviderConfigurationStore VoiceProviderConfiguration { get; set; } = new(); ++ [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] ++ public VoiceProviderCredentials? 
VoiceProviderCredentials { get; set; } + + private static readonly JsonSerializerOptions s_options = new() { WriteIndented = true }; + +@@ -35,11 +42,39 @@ public class SettingsData + { + try + { +- return JsonSerializer.Deserialize(json); ++ return JsonSerializer.Deserialize(MigrateLegacyVoiceJson(json)); + } + catch + { + return null; + } + } ++ ++ private static string MigrateLegacyVoiceJson(string json) ++ { ++ return json ++ .Replace("\"WakeWord\":", "\"VoiceWake\":", StringComparison.Ordinal) ++ .Replace("\"AlwaysOn\":", "\"TalkMode\":", StringComparison.Ordinal) ++ .Replace("\"WakeWordModelId\":", "\"VoiceWakeModelId\":", StringComparison.Ordinal) ++ .Replace("\"WakeWordLoaded\":", "\"VoiceWakeLoaded\":", StringComparison.Ordinal) ++ .Replace("\"LastWakeWordUtc\":", "\"LastVoiceWakeUtc\":", StringComparison.Ordinal) ++ .Replace("\"Mode\":\"WakeWord\"", "\"Mode\":\"VoiceWake\"", StringComparison.Ordinal) ++ .Replace("\"Mode\": \"WakeWord\"", "\"Mode\": \"VoiceWake\"", StringComparison.Ordinal) ++ .Replace("\"Mode\":\"AlwaysOn\"", "\"Mode\":\"TalkMode\"", StringComparison.Ordinal) ++ .Replace("\"Mode\": \"AlwaysOn\"", "\"Mode\": \"TalkMode\"", StringComparison.Ordinal) ++ .Replace("\"State\":\"ListeningForWakeWord\"", "\"State\":\"ListeningForVoiceWake\"", StringComparison.Ordinal) ++ .Replace("\"State\": \"ListeningForWakeWord\"", "\"State\": \"ListeningForVoiceWake\"", StringComparison.Ordinal); ++ } ++} ++ ++public sealed class VoiceRepeaterWindowSettings ++{ ++ public bool AutoScroll { get; set; } = true; ++ public bool FloatingEnabled { get; set; } = true; ++ public bool HasSavedPlacement { get; set; } ++ public double TextSize { get; set; } = 13; ++ public int? Width { get; set; } ++ public int? Height { get; set; } ++ public int? X { get; set; } ++ public int? 
Y { get; set; } + } +diff --git a/src/OpenClaw.Shared/VoiceModeSchema.cs b/src/OpenClaw.Shared/VoiceModeSchema.cs +new file mode 100644 +index 0000000..e47af8c +--- /dev/null ++++ b/src/OpenClaw.Shared/VoiceModeSchema.cs +@@ -0,0 +1,354 @@ ++using System; ++using System.Collections.ObjectModel; ++using System.Text.Json; ++using System.Text.Json.Serialization; ++ ++namespace OpenClaw.Shared; ++ ++public static class VoiceCommands ++{ ++ public const string ListDevices = "voice.devices.list"; ++ public const string GetSettings = "voice.settings.get"; ++ public const string SetSettings = "voice.settings.set"; ++ public const string GetStatus = "voice.status.get"; ++ public const string Start = "voice.start"; ++ public const string Stop = "voice.stop"; ++ public const string Pause = "voice.pause"; ++ public const string Resume = "voice.resume"; ++ public const string Skip = "voice.response.skip"; ++ ++ private static readonly ReadOnlyCollection s_all = Array.AsReadOnly( ++ [ ++ ListDevices, ++ GetSettings, ++ SetSettings, ++ GetStatus, ++ Start, ++ Stop, ++ Pause, ++ Resume, ++ Skip ++ ]); ++ ++ public static IReadOnlyList All => s_all; ++} ++ ++[JsonConverter(typeof(VoiceActivationModeJsonConverter))] ++public enum VoiceActivationMode ++{ ++ Off, ++ VoiceWake, ++ TalkMode ++} ++ ++[JsonConverter(typeof(VoiceRuntimeStateJsonConverter))] ++public enum VoiceRuntimeState ++{ ++ Stopped, ++ Paused, ++ Idle, ++ Arming, ++ ListeningForVoiceWake, ++ ListeningContinuously, ++ RecordingUtterance, ++ SubmittingAudio, ++ AwaitingResponse, ++ PlayingResponse, ++ Error ++} ++ ++public sealed class VoiceSettings ++{ ++ public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off; ++ public bool Enabled { get; set; } ++ public bool ShowRepeaterAtStartup { get; set; } = true; ++ public bool ShowConversationToasts { get; set; } ++ public string SpeechToTextProviderId { get; set; } = VoiceProviderIds.Windows; ++ public string TextToSpeechProviderId { get; set; } = 
VoiceProviderIds.Windows; ++ public string? InputDeviceId { get; set; } ++ public string? OutputDeviceId { get; set; } ++ public int SampleRateHz { get; set; } = 16000; ++ public int CaptureChunkMs { get; set; } = 80; ++ public bool BargeInEnabled { get; set; } = true; ++ public VoiceWakeSettings VoiceWake { get; set; } = new(); ++ public TalkModeSettings TalkMode { get; set; } = new(); ++} ++ ++public sealed class VoiceWakeSettings ++{ ++ public string Engine { get; set; } = "NanoWakeWord"; ++ public string ModelId { get; set; } = "hey_openclaw"; ++ public float TriggerThreshold { get; set; } = 0.65f; ++ public int TriggerCooldownMs { get; set; } = 2000; ++ public int PreRollMs { get; set; } = 1200; ++ public int EndSilenceMs { get; set; } = 900; ++} ++ ++public sealed class TalkModeSettings ++{ ++ public int MinSpeechMs { get; set; } = 250; ++ public int EndSilenceMs { get; set; } = 900; ++ public int MaxUtteranceMs { get; set; } = 15000; ++} ++ ++public sealed class VoiceAudioDeviceInfo ++{ ++ public string DeviceId { get; set; } = ""; ++ public string Name { get; set; } = ""; ++ public bool IsDefault { get; set; } ++ public bool IsInput { get; set; } ++ public bool IsOutput { get; set; } ++} ++ ++public sealed class VoiceStatusInfo ++{ ++ public bool Available { get; set; } ++ public bool Running { get; set; } ++ public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off; ++ public VoiceRuntimeState State { get; set; } = VoiceRuntimeState.Stopped; ++ public string? SessionKey { get; set; } ++ public string? InputDeviceId { get; set; } ++ public string? OutputDeviceId { get; set; } ++ public string? VoiceWakeModelId { get; set; } ++ public bool VoiceWakeLoaded { get; set; } ++ public DateTime? LastVoiceWakeUtc { get; set; } ++ public DateTime? LastUtteranceUtc { get; set; } ++ public int PendingReplyCount { get; set; } ++ public bool CanSkipReply { get; set; } ++ public string? CurrentReplyPreview { get; set; } ++ public string? 
LastError { get; set; } ++} ++ ++public sealed class VoiceStartArgs ++{ ++ public VoiceActivationMode? Mode { get; set; } ++ public string? SessionKey { get; set; } ++} ++ ++public sealed class VoiceStopArgs ++{ ++ public string? Reason { get; set; } ++} ++ ++public sealed class VoicePauseArgs ++{ ++ public string? Reason { get; set; } ++} ++ ++public sealed class VoiceResumeArgs ++{ ++ public string? Reason { get; set; } ++} ++ ++public sealed class VoiceSkipArgs ++{ ++ public string? Reason { get; set; } ++} ++ ++public sealed class VoiceSettingsUpdateArgs ++{ ++ public VoiceSettings Settings { get; set; } = new(); ++ public bool Persist { get; set; } = true; ++} ++ ++public static class VoiceProviderIds ++{ ++ public const string Windows = "windows"; ++ public const string HttpWs = "http-ws"; ++ public const string FoundryLocal = "foundry-local"; ++ public const string OpenAiWhisper = "openai-whisper"; ++ public const string ElevenLabsSpeechToText = "elevenlabs-stt"; ++ public const string AzureAiSpeech = "azure-ai-speech"; ++ public const string SherpaOnnx = "sherpa-onnx"; ++ public const string MiniMax = "minimax"; ++ public const string ElevenLabs = "elevenlabs"; ++} ++ ++public static class VoiceProviderRuntimeIds ++{ ++ public const string Windows = "windows"; ++ public const string Streaming = "streaming"; ++ public const string Embedded = "embedded"; ++ public const string Cloud = "cloud"; ++} ++ ++public static class VoiceProviderSettingKeys ++{ ++ public const string ApiKey = "apiKey"; ++ public const string Endpoint = "endpoint"; ++ public const string Model = "model"; ++ public const string ModelPath = "modelPath"; ++ public const string VoiceId = "voiceId"; ++ public const string VoiceSettingsJson = "voiceSettingsJson"; ++} ++ ++public static class VoiceTextToSpeechResponseModes ++{ ++ public const string Binary = "binary"; ++ public const string HexJsonString = "hexJsonString"; ++ public const string Base64JsonString = "base64JsonString"; ++} ++ 
++public sealed class VoiceProviderCredentials ++{ ++ public string? MiniMaxApiKey { get; set; } ++ public string MiniMaxModel { get; set; } = "speech-2.8-turbo"; ++ public string MiniMaxVoiceId { get; set; } = "English_MatureBoss"; ++ public string? ElevenLabsApiKey { get; set; } ++ public string? ElevenLabsModel { get; set; } ++ public string? ElevenLabsVoiceId { get; set; } ++} ++ ++public sealed class VoiceProviderConfigurationStore ++{ ++ public List Providers { get; set; } = []; ++} ++ ++public sealed class VoiceProviderConfiguration ++{ ++ public string ProviderId { get; set; } = ""; ++ public Dictionary Values { get; set; } = []; ++} ++ ++public sealed class VoiceProviderSettingDefinition ++{ ++ public string Key { get; set; } = ""; ++ public string Label { get; set; } = ""; ++ public bool Secret { get; set; } ++ public bool Required { get; set; } = true; ++ public bool JsonValue { get; set; } ++ public string? DefaultValue { get; set; } ++ public string? Placeholder { get; set; } ++ public string? Description { get; set; } ++ public List Options { get; set; } = []; ++} ++ ++public sealed class VoiceTextToSpeechHttpContract ++{ ++ public string EndpointTemplate { get; set; } = ""; ++ public string HttpMethod { get; set; } = "POST"; ++ public string AuthenticationHeaderName { get; set; } = "Authorization"; ++ public string? AuthenticationScheme { get; set; } = "Bearer"; ++ public string ApiKeySettingKey { get; set; } = VoiceProviderSettingKeys.ApiKey; ++ public string RequestContentType { get; set; } = "application/json"; ++ public string RequestBodyTemplate { get; set; } = ""; ++ public string ResponseAudioMode { get; set; } = VoiceTextToSpeechResponseModes.Binary; ++ public string? ResponseAudioJsonPath { get; set; } ++ public string? ResponseStatusCodeJsonPath { get; set; } ++ public string? ResponseStatusMessageJsonPath { get; set; } ++ public string? 
SuccessStatusValue { get; set; } ++ public string OutputContentType { get; set; } = "audio/mpeg"; ++} ++ ++public sealed class VoiceTextToSpeechWebSocketContract ++{ ++ public string EndpointTemplate { get; set; } = ""; ++ public string AuthenticationHeaderName { get; set; } = "Authorization"; ++ public string? AuthenticationScheme { get; set; } = "Bearer"; ++ public string ApiKeySettingKey { get; set; } = VoiceProviderSettingKeys.ApiKey; ++ public string ConnectSuccessEventName { get; set; } = "connected_success"; ++ public string StartMessageTemplate { get; set; } = ""; ++ public string StartSuccessEventName { get; set; } = "task_started"; ++ public string ContinueMessageTemplate { get; set; } = ""; ++ public string FinishMessageTemplate { get; set; } = "{ \"event\": \"task_finish\" }"; ++ public string ResponseAudioMode { get; set; } = VoiceTextToSpeechResponseModes.Binary; ++ public string? ResponseAudioJsonPath { get; set; } = "data.audio"; ++ public string? ResponseStatusCodeJsonPath { get; set; } = "base_resp.status_code"; ++ public string? ResponseStatusMessageJsonPath { get; set; } = "base_resp.status_msg"; ++ public string? FinalFlagJsonPath { get; set; } = "is_final"; ++ public string TaskFailedEventName { get; set; } = "task_failed"; ++ public string? SuccessStatusValue { get; set; } = "0"; ++ public string OutputContentType { get; set; } = "audio/mpeg"; ++} ++ ++public sealed class VoiceProviderOption ++{ ++ public string Id { get; set; } = ""; ++ public string Name { get; set; } = ""; ++ public string Runtime { get; set; } = VoiceProviderRuntimeIds.Windows; ++ public bool Enabled { get; set; } = true; ++ public bool VisibleInSettings { get; set; } = true; ++ public bool Selectable { get; set; } = true; ++ public string? Description { get; set; } ++ public List Settings { get; set; } = []; ++ public VoiceTextToSpeechHttpContract? TextToSpeechHttp { get; set; } ++ public VoiceTextToSpeechWebSocketContract? 
TextToSpeechWebSocket { get; set; } ++ ++ [JsonIgnore] ++ public string DisplayName => Selectable ? Name : $"{Name} (coming soon)"; ++ ++ [JsonIgnore] ++ public double DisplayOpacity => Selectable ? 1.0 : 0.55; ++} ++ ++public sealed class VoiceProviderCatalog ++{ ++ public List SpeechToTextProviders { get; set; } = []; ++ public List TextToSpeechProviders { get; set; } = []; ++} ++ ++public sealed class VoiceActivationModeJsonConverter : JsonConverter ++{ ++ public override VoiceActivationMode Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) ++ { ++ var value = reader.GetString(); ++ return value switch ++ { ++ "VoiceWake" or "WakeWord" => VoiceActivationMode.VoiceWake, ++ "TalkMode" or "AlwaysOn" => VoiceActivationMode.TalkMode, ++ _ => VoiceActivationMode.Off ++ }; ++ } ++ ++ public override void Write(Utf8JsonWriter writer, VoiceActivationMode value, JsonSerializerOptions options) ++ { ++ writer.WriteStringValue(value switch ++ { ++ VoiceActivationMode.VoiceWake => "VoiceWake", ++ VoiceActivationMode.TalkMode => "TalkMode", ++ _ => "Off" ++ }); ++ } ++} ++ ++public sealed class VoiceRuntimeStateJsonConverter : JsonConverter ++{ ++ public override VoiceRuntimeState Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) ++ { ++ var value = reader.GetString(); ++ return value switch ++ { ++ "ListeningForVoiceWake" or "ListeningForWakeWord" => VoiceRuntimeState.ListeningForVoiceWake, ++ "Stopped" => VoiceRuntimeState.Stopped, ++ "Paused" => VoiceRuntimeState.Paused, ++ "Idle" => VoiceRuntimeState.Idle, ++ "Arming" => VoiceRuntimeState.Arming, ++ "ListeningContinuously" => VoiceRuntimeState.ListeningContinuously, ++ "RecordingUtterance" => VoiceRuntimeState.RecordingUtterance, ++ "SubmittingAudio" => VoiceRuntimeState.SubmittingAudio, ++ "AwaitingResponse" => VoiceRuntimeState.AwaitingResponse, ++ "PlayingResponse" => VoiceRuntimeState.PlayingResponse, ++ "Error" => VoiceRuntimeState.Error, ++ _ => 
VoiceRuntimeState.Stopped ++ }; ++ } ++ ++ public override void Write(Utf8JsonWriter writer, VoiceRuntimeState value, JsonSerializerOptions options) ++ { ++ writer.WriteStringValue(value switch ++ { ++ VoiceRuntimeState.ListeningForVoiceWake => "ListeningForVoiceWake", ++ VoiceRuntimeState.Stopped => "Stopped", ++ VoiceRuntimeState.Paused => "Paused", ++ VoiceRuntimeState.Idle => "Idle", ++ VoiceRuntimeState.Arming => "Arming", ++ VoiceRuntimeState.ListeningContinuously => "ListeningContinuously", ++ VoiceRuntimeState.RecordingUtterance => "RecordingUtterance", ++ VoiceRuntimeState.SubmittingAudio => "SubmittingAudio", ++ VoiceRuntimeState.AwaitingResponse => "AwaitingResponse", ++ VoiceRuntimeState.PlayingResponse => "PlayingResponse", ++ VoiceRuntimeState.Error => "Error", ++ _ => "Stopped" ++ }); ++ } ++} +diff --git a/src/OpenClaw.Shared/VoiceProviderConfigurationStoreExtensions.cs b/src/OpenClaw.Shared/VoiceProviderConfigurationStoreExtensions.cs +new file mode 100644 +index 0000000..b1dfa41 +--- /dev/null ++++ b/src/OpenClaw.Shared/VoiceProviderConfigurationStoreExtensions.cs +@@ -0,0 +1,161 @@ ++using System; ++using System.Collections.Generic; ++using System.Linq; ++ ++namespace OpenClaw.Shared; ++ ++public static class VoiceProviderConfigurationStoreExtensions ++{ ++ public static VoiceProviderConfiguration GetOrAddProvider( ++ this VoiceProviderConfigurationStore store, ++ string providerId) ++ { ++ ArgumentNullException.ThrowIfNull(store); ++ ++ var existing = store.Providers.FirstOrDefault(p => ++ string.Equals(p.ProviderId, providerId, StringComparison.OrdinalIgnoreCase)); ++ if (existing != null) ++ { ++ return existing; ++ } ++ ++ var created = new VoiceProviderConfiguration ++ { ++ ProviderId = providerId ++ }; ++ store.Providers.Add(created); ++ return created; ++ } ++ ++ public static VoiceProviderConfiguration? FindProvider( ++ this VoiceProviderConfigurationStore store, ++ string? 
providerId) ++ { ++ ArgumentNullException.ThrowIfNull(store); ++ ++ if (string.IsNullOrWhiteSpace(providerId)) ++ { ++ return null; ++ } ++ ++ return store.Providers.FirstOrDefault(p => ++ string.Equals(p.ProviderId, providerId, StringComparison.OrdinalIgnoreCase)); ++ } ++ ++ public static string? GetValue( ++ this VoiceProviderConfigurationStore store, ++ string? providerId, ++ string settingKey) ++ { ++ return store.FindProvider(providerId)?.GetValue(settingKey); ++ } ++ ++ public static string? GetValue(this VoiceProviderConfiguration configuration, string settingKey) ++ { ++ ArgumentNullException.ThrowIfNull(configuration); ++ ++ if (string.IsNullOrWhiteSpace(settingKey)) ++ { ++ return null; ++ } ++ ++ return configuration.Values.FirstOrDefault(entry => ++ string.Equals(entry.Key, settingKey, StringComparison.OrdinalIgnoreCase)).Value; ++ } ++ ++ public static void SetValue( ++ this VoiceProviderConfigurationStore store, ++ string providerId, ++ string settingKey, ++ string? value) ++ { ++ ArgumentNullException.ThrowIfNull(store); ++ ++ var provider = store.GetOrAddProvider(providerId); ++ provider.SetValue(settingKey, value); ++ } ++ ++ public static void SetValue( ++ this VoiceProviderConfiguration configuration, ++ string settingKey, ++ string? value) ++ { ++ ArgumentNullException.ThrowIfNull(configuration); ++ ++ if (string.IsNullOrWhiteSpace(settingKey)) ++ { ++ return; ++ } ++ ++ var existingKey = configuration.Values.Keys.FirstOrDefault(key => ++ string.Equals(key, settingKey, StringComparison.OrdinalIgnoreCase)); ++ ++ if (string.IsNullOrWhiteSpace(value)) ++ { ++ if (existingKey != null) ++ { ++ configuration.Values.Remove(existingKey); ++ } ++ ++ return; ++ } ++ ++ if (existingKey != null) ++ { ++ configuration.Values[existingKey] = value.Trim(); ++ return; ++ } ++ ++ configuration.Values[settingKey] = value.Trim(); ++ } ++ ++ public static void MigrateLegacyCredentials( ++ this VoiceProviderConfigurationStore store, ++ VoiceProviderCredentials? 
legacy) ++ { ++ ArgumentNullException.ThrowIfNull(store); ++ ++ if (legacy == null) ++ { ++ return; ++ } ++ ++ var hasMiniMaxValues = ++ !string.IsNullOrWhiteSpace(legacy.MiniMaxApiKey) || ++ !string.IsNullOrWhiteSpace(legacy.MiniMaxModel) || ++ !string.IsNullOrWhiteSpace(legacy.MiniMaxVoiceId); ++ if (hasMiniMaxValues) ++ { ++ store.SetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.ApiKey, legacy.MiniMaxApiKey); ++ store.SetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.Model, legacy.MiniMaxModel); ++ store.SetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceId, legacy.MiniMaxVoiceId); ++ } ++ ++ var hasElevenLabsValues = ++ !string.IsNullOrWhiteSpace(legacy.ElevenLabsApiKey) || ++ !string.IsNullOrWhiteSpace(legacy.ElevenLabsModel) || ++ !string.IsNullOrWhiteSpace(legacy.ElevenLabsVoiceId); ++ if (hasElevenLabsValues) ++ { ++ store.SetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.ApiKey, legacy.ElevenLabsApiKey); ++ store.SetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.Model, legacy.ElevenLabsModel); ++ store.SetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.VoiceId, legacy.ElevenLabsVoiceId); ++ } ++ } ++ ++ public static VoiceProviderConfigurationStore Clone(this VoiceProviderConfigurationStore source) ++ { ++ ArgumentNullException.ThrowIfNull(source); ++ ++ return new VoiceProviderConfigurationStore ++ { ++ Providers = source.Providers ++ .Select(provider => new VoiceProviderConfiguration ++ { ++ ProviderId = provider.ProviderId, ++ Values = new Dictionary(provider.Values, StringComparer.OrdinalIgnoreCase) ++ }) ++ .ToList() ++ }; ++ } ++} +diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs +index de0780f..5b7fcc8 100644 +--- a/src/OpenClaw.Tray.WinUI/App.xaml.cs ++++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs +@@ -6,6 +6,7 @@ + using OpenClawTray.Dialogs; + using OpenClawTray.Helpers; + using OpenClawTray.Services; ++using OpenClawTray.Services.Voice; + 
using OpenClawTray.Windows; + using System; + using System.Collections.Generic; +@@ -37,6 +38,7 @@ public partial class App : Application + private GlobalHotkeyService? _globalHotkey; + private System.Timers.Timer? _healthCheckTimer; + private System.Timers.Timer? _sessionPollTimer; ++ private Microsoft.UI.Dispatching.DispatcherQueueTimer? _voiceTrayIconTimer; + private Mutex? _mutex; + private Microsoft.UI.Dispatching.DispatcherQueue? _dispatcherQueue; + private CancellationTokenSource? _deepLinkCts; +@@ -54,6 +56,7 @@ public partial class App : Application + private GatewayCostUsageInfo? _lastUsageCost; + private DateTime _lastCheckTime = DateTime.Now; + private DateTime _lastUsageActivityLogUtc = DateTime.MinValue; ++ private string? _lastTrayIconPath; + + // Session-aware activity tracking + private readonly Dictionary _sessionActivities = new(); +@@ -63,6 +66,8 @@ public partial class App : Application + + // Windows (created on demand) + private SettingsWindow? _settingsWindow; ++ private VoiceRepeaterWindow? _voiceRepeaterWindow; ++ private VoiceModeWindow? _voiceModeWindow; + private WebChatWindow? _webChatWindow; + private StatusDetailWindow? _statusDetailWindow; + private NotificationHistoryWindow? _notificationHistoryWindow; +@@ -72,6 +77,8 @@ public partial class App : Application + + // Node service (optional, enabled in settings) + private NodeService? _nodeService; ++ private VoiceService? _voiceService; ++ private VoiceChatCoordinator? _voiceChatCoordinator; + + // Keep-alive window to anchor WinUI runtime (prevents GC/threading issues) + private Window? 
_keepAliveWindow; +@@ -250,6 +257,11 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) + + // Initialize settings + _settings = new SettingsManager(); ++ _voiceService = new VoiceService(new AppLogger(), _settings); ++ _voiceChatCoordinator = new VoiceChatCoordinator( ++ _voiceService, ++ new DispatcherQueueAdapter(_dispatcherQueue!)); ++ _voiceChatCoordinator.ConversationTurnAvailable += OnVoiceConversationTurnAvailable; + + // First-run check + if (string.IsNullOrWhiteSpace(_settings.Token)) +@@ -276,6 +288,7 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) + + // Start health check timer + StartHealthCheckTimer(); ++ StartVoiceTrayIconTimer(); + + // Start deep link server + StartDeepLinkServer(); +@@ -284,7 +297,8 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) + if (_settings.GlobalHotkeyEnabled) + { + _globalHotkey = new GlobalHotkeyService(); +- _globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed; ++ _globalHotkey.QuickSendHotkeyPressed += OnGlobalQuickSendHotkeyPressed; ++ _globalHotkey.VoiceToggleHotkeyPressed += OnGlobalVoiceToggleHotkeyPressed; + _globalHotkey.Register(); + } + +@@ -297,6 +311,11 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) + HandleDeepLink(startupDeepLink); + } + ++ if (ShouldShowVoiceRepeaterAtStartup()) ++ { ++ _dispatcherQueue?.TryEnqueue(ShowVoiceModeSettings); ++ } ++ + Logger.Info("Application started (WinUI 3)"); + } + +@@ -322,11 +341,26 @@ private void InitializeTrayIcon() + + var iconPath = IconHelper.GetStatusIconPath(ConnectionStatus.Disconnected); + _trayIcon = new TrayIcon(1, iconPath, "OpenClaw Tray — Disconnected"); ++ _lastTrayIconPath = iconPath; + _trayIcon.IsVisible = true; + _trayIcon.Selected += OnTrayIconSelected; + _trayIcon.ContextMenu += OnTrayContextMenu; + } + ++ private void StartVoiceTrayIconTimer() ++ { ++ if (_dispatcherQueue == null || _voiceTrayIconTimer != null) ++ { ++ return; ++ } ++ ++ 
_voiceTrayIconTimer = _dispatcherQueue.CreateTimer(); ++ _voiceTrayIconTimer.Interval = TimeSpan.FromMilliseconds(250); ++ _voiceTrayIconTimer.IsRepeating = true; ++ _voiceTrayIconTimer.Tick += (s, e) => UpdateTrayIcon(); ++ _voiceTrayIconTimer.Start(); ++ } ++ + private void InitializeTrayMenuWindow() + { + // Pre-create menu window once - reuse to avoid crash on window creation after idle +@@ -514,6 +548,8 @@ private void OnTrayMenuItemClicked(object? sender, string action) + switch (action) + { + case "status": ShowStatusDetail(); break; ++ case "voice-settings": ShowVoiceModeSettings(); break; ++ case "voice-toggle-pause": _ = ToggleVoiceQuickPauseAsync(); break; + case "dashboard": OpenDashboard(); break; + case "webchat": ShowWebChat(); break; + case "quicksend": ShowQuickSend(); break; +@@ -725,6 +761,60 @@ private List GetRecentActivity(int maxItems) + .ToList(); + } + ++ private string GetRunningVoiceModeLabel() ++ { ++ var status = _voiceService?.CurrentStatus; ++ if (status == null) ++ { ++ return "Off"; ++ } ++ ++ return VoiceDisplayHelper.GetRuntimeLabel(status); ++ } ++ ++ private bool CanQuickToggleVoiceMode() ++ { ++ if (_settings?.EnableNodeMode != true || _voiceService == null) ++ { ++ return false; ++ } ++ ++ var status = _voiceService.CurrentStatus; ++ if (status.State == VoiceRuntimeState.Paused) ++ { ++ return true; ++ } ++ ++ return _settings.Voice.Enabled && _settings.Voice.Mode != VoiceActivationMode.Off; ++ } ++ ++ private bool ShouldShowVoiceRepeaterAtStartup() ++ { ++ return _settings?.EnableNodeMode == true && ++ _settings.Voice.Enabled && ++ _settings.Voice.Mode != VoiceActivationMode.Off && ++ _settings.Voice.ShowRepeaterAtStartup; ++ } ++ ++ private string GetVoiceQuickToggleLabel() ++ { ++ var status = _voiceService?.CurrentStatus; ++ return status?.State == VoiceRuntimeState.Paused ++ ? 
"Resume Voice" ++ : "Pause Voice"; ++ } ++ ++ private string GetVoiceDeviceSummary() ++ { ++ var voice = _settings?.Voice; ++ if (voice == null) ++ return "Talk: system default · Listen: system default"; ++ ++ var talk = string.IsNullOrWhiteSpace(voice.OutputDeviceId) ? "system default" : "selected speaker"; ++ var listen = string.IsNullOrWhiteSpace(voice.InputDeviceId) ? "system default" : "selected microphone"; ++ return $"Talk: {talk} · Listen: {listen}"; ++ } ++ + private void BuildTrayMenuPopup(TrayMenuWindow menu) + { + // Brand header +@@ -741,6 +831,14 @@ private void BuildTrayMenuPopup(TrayMenuWindow menu) + menu.AddMenuItem(_currentActivity.DisplayText, _currentActivity.Glyph, "", isEnabled: false); + } + ++ menu.AddMenuItem($"Voice Mode: {GetRunningVoiceModeLabel()}", "🎙️", "voice-settings"); ++ menu.AddMenuItem($"↳ {GetVoiceDeviceSummary()}", "", "", isEnabled: false, indent: true); ++ menu.AddMenuItem($"↳ {GetVoiceQuickToggleLabel()} (Ctrl+Alt+Shift+V)", "", "voice-toggle-pause", isEnabled: CanQuickToggleVoiceMode(), indent: true); ++ if (_settings?.EnableNodeMode != true) ++ { ++ menu.AddMenuItem("↳ Enable Node Mode to activate voice runtime", "", "", isEnabled: false, indent: true); ++ } ++ + // Usage + if (_lastUsage != null || _lastUsageStatus != null || _lastUsageCost != null) + { +@@ -1126,7 +1224,7 @@ private void InitializeNodeService() + { + Logger.Info("Initializing Windows Node service..."); + +- _nodeService = new NodeService(new AppLogger(), _dispatcherQueue, DataPath); ++ _nodeService = new NodeService(new AppLogger(), _dispatcherQueue, _voiceService!, DataPath); + _nodeService.StatusChanged += OnNodeStatusChanged; + _nodeService.NotificationRequested += OnNodeNotificationRequested; + _nodeService.PairingStatusChanged += OnPairingStatusChanged; +@@ -1555,13 +1653,7 @@ private void UpdateTrayIcon() + { + if (_trayIcon == null) return; + +- var status = _currentStatus; +- if (_currentActivity != null && _currentActivity.Kind != 
OpenClaw.Shared.ActivityKind.Idle) +- { +- status = ConnectionStatus.Connecting; // Use connecting icon for activity +- } +- +- var iconPath = IconHelper.GetStatusIconPath(status); ++ var iconPath = GetTrayIconPathForCurrentState(); + var tooltip = $"OpenClaw Tray — {_currentStatus}"; + + if (_currentActivity != null && !string.IsNullOrEmpty(_currentActivity.DisplayText)) +@@ -1573,7 +1665,11 @@ private void UpdateTrayIcon() + + try + { +- _trayIcon.SetIcon(iconPath); ++ if (!string.Equals(_lastTrayIconPath, iconPath, StringComparison.OrdinalIgnoreCase)) ++ { ++ _trayIcon.SetIcon(iconPath); ++ _lastTrayIconPath = iconPath; ++ } + _trayIcon.Tooltip = tooltip; + } + catch (Exception ex) +@@ -1582,15 +1678,60 @@ private void UpdateTrayIcon() + } + } + ++ private string GetTrayIconPathForCurrentState() ++ { ++ var voiceIconState = GetVoiceTrayIconState(); ++ if (voiceIconState != VoiceTrayIconState.Off) ++ { ++ return IconHelper.GetVoiceTrayIconPath(voiceIconState); ++ } ++ ++ if (_voiceService?.CurrentStatus.State == VoiceRuntimeState.Paused) ++ { ++ return IconHelper.GetVoiceTrayIconPath(VoiceTrayIconState.Off); ++ } ++ ++ var status = _currentStatus; ++ if (_currentActivity != null && _currentActivity.Kind != OpenClaw.Shared.ActivityKind.Idle) ++ { ++ status = ConnectionStatus.Connecting; ++ } ++ ++ return IconHelper.GetStatusIconPath(status); ++ } ++ ++ private VoiceTrayIconState GetVoiceTrayIconState() ++ { ++ var voiceStatus = _voiceService?.CurrentStatus; ++ if (voiceStatus == null || !voiceStatus.Running) ++ { ++ return VoiceTrayIconState.Off; ++ } ++ ++ return voiceStatus.State switch ++ { ++ VoiceRuntimeState.PlayingResponse => VoiceTrayIconState.Speaking, ++ VoiceRuntimeState.ListeningForVoiceWake => VoiceTrayIconState.Listening, ++ VoiceRuntimeState.ListeningContinuously => VoiceTrayIconState.Listening, ++ VoiceRuntimeState.RecordingUtterance => VoiceTrayIconState.Listening, ++ VoiceRuntimeState.Paused => VoiceTrayIconState.Off, ++ _ when voiceStatus.Mode 
== VoiceActivationMode.Off => VoiceTrayIconState.Off, ++ _ => VoiceTrayIconState.Off ++ }; ++ } ++ + #endregion + + #region Window Management + + private void ShowSettings() + { ++ if (_settings == null || _voiceService == null) ++ return; ++ + if (_settingsWindow == null || _settingsWindow.IsClosed) + { +- _settingsWindow = new SettingsWindow(_settings!); ++ _settingsWindow = new SettingsWindow(_settings, _voiceService); + _settingsWindow.Closed += (s, e) => + { + _settingsWindow.SettingsSaved -= OnSettingsSaved; +@@ -1601,30 +1742,134 @@ private void ShowSettings() + _settingsWindow.Activate(); + } + +- private void OnSettingsSaved(object? sender, EventArgs e) ++ private void ShowVoiceModeSettings() ++ { ++ if (_settings == null || _voiceService == null) ++ return; ++ ++ if (_voiceRepeaterWindow == null || _voiceRepeaterWindow.IsClosed) ++ { ++ _voiceRepeaterWindow = new VoiceRepeaterWindow(_settings, _voiceService); ++ _voiceRepeaterWindow.OpenVoiceStatusRequested += OnOpenVoiceStatusRequested; ++ _voiceRepeaterWindow.Closed += (s, e) => ++ { ++ _voiceChatCoordinator?.DetachWindow(_voiceRepeaterWindow); ++ _voiceRepeaterWindow.OpenVoiceStatusRequested -= OnOpenVoiceStatusRequested; ++ _voiceRepeaterWindow = null; ++ }; ++ _voiceChatCoordinator?.AttachWindow(_voiceRepeaterWindow); ++ } ++ ++ _voiceRepeaterWindow.RefreshStatus(); ++ _voiceRepeaterWindow.Activate(); ++ } ++ ++ private void ShowVoiceStatusWindow() ++ { ++ if (_settings == null || _voiceService == null) ++ { ++ return; ++ } ++ ++ if (_voiceModeWindow == null || _voiceModeWindow.IsClosed) ++ { ++ _voiceModeWindow = new VoiceModeWindow(_settings, _voiceService, _voiceService); ++ _voiceModeWindow.OpenSettingsRequested += OnVoiceModeOpenSettingsRequested; ++ _voiceModeWindow.Closed += (s, e) => ++ { ++ if (_voiceModeWindow != null) ++ { ++ _voiceModeWindow.OpenSettingsRequested -= OnVoiceModeOpenSettingsRequested; ++ } ++ ++ _voiceModeWindow = null; ++ }; ++ } ++ ++ _voiceModeWindow.RefreshStatus(); ++ 
_voiceModeWindow.Activate(); ++ } ++ ++ private void OnOpenVoiceStatusRequested(object? sender, EventArgs e) ++ { ++ ShowVoiceStatusWindow(); ++ } ++ ++ private void OnVoiceModeOpenSettingsRequested(object? sender, EventArgs e) ++ { ++ ShowSettings(); ++ } ++ ++ private async void OnSettingsSaved(object? sender, EventArgs e) + { + // Reconnect with new settings — mirror the startup if/else pattern + // to avoid dual connections that cause gateway conflicts. +- _gatewayClient?.Dispose(); +- var oldNodeService = _nodeService; +- _nodeService = null; +- try { oldNodeService?.Dispose(); } catch (Exception ex) { Logger.Warn($"Node dispose error: {ex.Message}"); } +- ++ try ++ { ++ if (_gatewayClient != null) ++ { ++ try ++ { ++ await _gatewayClient.DisconnectAsync(); ++ } ++ catch (Exception ex) ++ { ++ Logger.Warn($"Gateway disconnect error: {ex.Message}"); ++ } ++ ++ _gatewayClient.Dispose(); ++ _gatewayClient = null; ++ } ++ ++ var oldNodeService = _nodeService; ++ _nodeService = null; ++ if (oldNodeService != null) ++ { ++ try ++ { ++ await oldNodeService.DisconnectAsync(); ++ } ++ catch (Exception ex) ++ { ++ Logger.Warn($"Node disconnect error: {ex.Message}"); ++ } ++ ++ try ++ { ++ oldNodeService.Dispose(); ++ } ++ catch (Exception ex) ++ { ++ Logger.Warn($"Node dispose error: {ex.Message}"); ++ } ++ } ++ + if (_settings?.EnableNodeMode == true) + { + InitializeNodeService(); + } +- else ++ else ++ { ++ InitializeGatewayClient(); ++ if (_voiceService != null) ++ { ++ await _voiceService.StopAsync(new VoiceStopArgs { Reason = "Node mode disabled" }); ++ } ++ } ++ } ++ catch (Exception ex) + { +- InitializeGatewayClient(); ++ Logger.Warn($"Settings reconnect failed: {ex.Message}"); + } + + // Update global hotkey + if (_settings!.GlobalHotkeyEnabled) + { + _globalHotkey ??= new GlobalHotkeyService(); +- _globalHotkey.HotkeyPressed -= OnGlobalHotkeyPressed; +- _globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed; ++ _globalHotkey.QuickSendHotkeyPressed -= 
OnGlobalQuickSendHotkeyPressed; ++ _globalHotkey.QuickSendHotkeyPressed += OnGlobalQuickSendHotkeyPressed; ++ _globalHotkey.VoiceToggleHotkeyPressed -= OnGlobalVoiceToggleHotkeyPressed; ++ _globalHotkey.VoiceToggleHotkeyPressed += OnGlobalVoiceToggleHotkeyPressed; + _globalHotkey.Register(); + } + else +@@ -1632,6 +1877,9 @@ private void OnSettingsSaved(object? sender, EventArgs e) + _globalHotkey?.Unregister(); + } + ++ _voiceRepeaterWindow?.RefreshStatus(); ++ _voiceModeWindow?.RefreshStatus(); ++ + // Update auto-start + AutoStartManager.SetAutoStart(_settings.AutoStart); + } +@@ -1640,8 +1888,15 @@ private void ShowWebChat() + { + if (_webChatWindow == null || _webChatWindow.IsClosed) + { +- _webChatWindow = new WebChatWindow(_settings!.GatewayUrl, _settings.Token); +- _webChatWindow.Closed += (s, e) => _webChatWindow = null; ++ _webChatWindow = new WebChatWindow( ++ _settings!.GatewayUrl, ++ _settings.Token); ++ _webChatWindow.Closed += (s, e) => ++ { ++ _voiceChatCoordinator?.DetachWindow(_webChatWindow); ++ _webChatWindow = null; ++ }; ++ _voiceChatCoordinator?.AttachWindow(_webChatWindow); + } + _webChatWindow.Activate(); + } +@@ -1844,7 +2099,7 @@ private void OpenLogFile() + } + } + +- private void OnGlobalHotkeyPressed(object? sender, EventArgs e) ++ private void OnGlobalQuickSendHotkeyPressed(object? sender, EventArgs e) + { + // Hotkey events are raised from a dedicated Win32 message-loop thread. + // Creating/activating WinUI windows must happen on the app's UI thread. +@@ -1861,6 +2116,137 @@ private void OnGlobalHotkeyPressed(object? sender, EventArgs e) + } + } + ++ private void OnGlobalVoiceToggleHotkeyPressed(object? 
sender, EventArgs e) ++ { ++ if (_dispatcherQueue == null) ++ { ++ Logger.Warn("Voice hotkey pressed but DispatcherQueue is null"); ++ return; ++ } ++ ++ var enqueued = _dispatcherQueue.TryEnqueue(async () => await ToggleVoiceQuickPauseAsync()); ++ if (!enqueued) ++ { ++ Logger.Warn("Voice hotkey pressed but failed to enqueue Voice quick pause on UI thread"); ++ } ++ } ++ ++ private async Task ToggleVoiceQuickPauseAsync() ++ { ++ if (_voiceService == null) ++ { ++ return; ++ } ++ ++ if (_settings?.EnableNodeMode != true) ++ { ++ Logger.Warn("Voice quick pause blocked: Node Mode is disabled"); ++ return; ++ } ++ ++ if (!CanQuickToggleVoiceMode()) ++ { ++ Logger.Warn("Voice quick pause blocked: Voice Mode is off"); ++ return; ++ } ++ ++ try ++ { ++ var status = await _voiceService.ToggleQuickPauseAsync(); ++ _voiceRepeaterWindow?.RefreshStatus(); ++ _voiceModeWindow?.RefreshStatus(); ++ ShowVoiceQuickToggleToast(status); ++ } ++ catch (Exception ex) ++ { ++ Logger.Warn($"Voice quick pause failed: {ex.Message}"); ++ } ++ } ++ ++ private static void ShowVoiceQuickToggleToast(VoiceStatusInfo status) ++ { ++ try ++ { ++ var title = status.State == VoiceRuntimeState.Paused ++ ? "Voice paused" ++ : "Voice resumed"; ++ var detail = status.State == VoiceRuntimeState.Paused ++ ? $"{status.Mode} is paused. Press Ctrl+Alt+Shift+V to resume." ++ : $"{status.Mode} is active again."; ++ ++ new ToastContentBuilder() ++ .AddText(title) ++ .AddText(detail) ++ .Show(); ++ } ++ catch (Exception ex) ++ { ++ Logger.Warn($"Failed to show voice pause toast: {ex.Message}"); ++ } ++ } ++ ++ private void OnVoiceConversationTurnAvailable(object? 
sender, VoiceConversationTurnEventArgs args) ++ { ++ if (_dispatcherQueue == null) ++ { ++ return; ++ } ++ ++ _dispatcherQueue.TryEnqueue(() => ShowVoiceConversationToast(args)); ++ } ++ ++ private void ShowVoiceConversationToast(VoiceConversationTurnEventArgs args) ++ { ++ if (_settings?.Voice.ShowConversationToasts != true) ++ { ++ return; ++ } ++ ++ var title = args.Direction == VoiceConversationDirection.Outgoing ++ ? "Voice heard" ++ : "Voice reply"; ++ ++ AddRecentActivity( ++ $"voice: {title}", ++ category: "voice", ++ details: args.Message, ++ dashboardPath: "chat", ++ sessionKey: args.SessionKey); ++ ++ NotificationHistoryService.AddNotification(new Services.GatewayNotification ++ { ++ Title = title, ++ Message = args.Message, ++ Category = "voice" ++ }); ++ ++ if (_settings.ShowNotifications != true) ++ { ++ return; ++ } ++ ++ try ++ { ++ var builder = new ToastContentBuilder() ++ .AddText(title) ++ .AddText(args.Message); ++ ++ if (args.Direction == VoiceConversationDirection.Incoming) ++ { ++ builder.AddArgument("action", "open_chat") ++ .AddButton(new ToastButton() ++ .SetContent("Open Chat") ++ .AddArgument("action", "open_chat")); ++ } ++ ++ builder.Show(); ++ } ++ catch (Exception ex) ++ { ++ Logger.Warn($"Failed to show voice conversation toast: {ex.Message}"); ++ } ++ } ++ + #endregion + + #region Updates +@@ -2047,6 +2433,15 @@ private void OnToastActivated(ToastNotificationActivatedEventArgsCompat args) + private void ExitApplication() + { + Logger.Info("Application exiting"); ++ ++ TryCloseWindow(_voiceRepeaterWindow); ++ TryCloseWindow(_voiceModeWindow); ++ TryCloseWindow(_webChatWindow); ++ TryCloseWindow(_settingsWindow); ++ TryCloseWindow(_statusDetailWindow); ++ TryCloseWindow(_notificationHistoryWindow); ++ TryCloseWindow(_activityStreamWindow); ++ TryCloseWindow(_quickSendDialog); + + // Cancel background tasks + _deepLinkCts?.Cancel(); +@@ -2056,6 +2451,7 @@ private void ExitApplication() + _healthCheckTimer?.Dispose(); + 
_sessionPollTimer?.Stop(); + _sessionPollTimer?.Dispose(); ++ _voiceTrayIconTimer?.Stop(); + + // Cleanup hotkey + _globalHotkey?.Dispose(); +@@ -2070,10 +2466,32 @@ private void ExitApplication() + + // Dispose cancellation token source + _deepLinkCts?.Dispose(); ++ if (_voiceChatCoordinator != null) ++ { ++ _voiceChatCoordinator.ConversationTurnAvailable -= OnVoiceConversationTurnAvailable; ++ _voiceChatCoordinator.Dispose(); ++ } ++ _voiceService?.Dispose(); + + Exit(); + } + ++ private static void TryCloseWindow(Window? window) ++ { ++ if (window == null) ++ { ++ return; ++ } ++ ++ try ++ { ++ window.Close(); ++ } ++ catch ++ { ++ } ++ } ++ + #endregion + + private Microsoft.UI.Dispatching.DispatcherQueue? AppDispatcherQueue => +diff --git a/src/OpenClaw.Tray.WinUI/Assets/voice-mode-feature.png b/src/OpenClaw.Tray.WinUI/Assets/voice-mode-feature.png +new file mode 100644 +index 0000000..04c239b +Binary files /dev/null and b/src/OpenClaw.Tray.WinUI/Assets/voice-mode-feature.png differ +diff --git a/src/OpenClaw.Tray.WinUI/Assets/voice-providers.json b/src/OpenClaw.Tray.WinUI/Assets/voice-providers.json +new file mode 100644 +index 0000000..3ffcc0b +--- /dev/null ++++ b/src/OpenClaw.Tray.WinUI/Assets/voice-providers.json +@@ -0,0 +1,274 @@ ++{ ++ "speechToTextProviders": [ ++ { ++ "id": "windows", ++ "name": "Windows Speech Recognition", ++ "runtime": "windows", ++ "enabled": true, ++ "description": "Built-in Windows.Media speech recognition, half-duplex, non-streamed." ++ }, ++ { ++ "id": "http-ws", ++ "name": "http/ws", ++ "runtime": "streaming", ++ "enabled": false, ++ "visibleInSettings": true, ++ "selectable": false, ++ "description": "Will support most cloud and local stand-alone models full or half-duplex, streaming." 
++ }, ++ { ++ "id": "foundry-local", ++ "name": "Foundry Local", ++ "runtime": "streaming", ++ "enabled": false, ++ "visibleInSettings": false, ++ "selectable": false, ++ "description": "AudioGraph-fed streaming STT route for Foundry Local or compatible streaming adapters.", ++ "settings": [ ++ { ++ "key": "endpoint", ++ "label": "Endpoint", ++ "required": false, ++ "defaultValue": "http://localhost:5273", ++ "placeholder": "http://localhost:5273", ++ "description": "Local Foundry-compatible transcription endpoint for the AudioGraph streaming STT route." ++ }, ++ { ++ "key": "model", ++ "label": "Model", ++ "required": false, ++ "defaultValue": "whisper-tiny", ++ "placeholder": "whisper-tiny", ++ "description": "Transcription model identifier for the streaming STT adapter." ++ } ++ ] ++ }, ++ { ++ "id": "openai-whisper", ++ "name": "OpenAI Whisper", ++ "runtime": "streaming", ++ "enabled": false, ++ "visibleInSettings": false, ++ "selectable": false, ++ "description": "AudioGraph-fed cloud STT route for the OpenAI Whisper transcription API.", ++ "settings": [ ++ { ++ "key": "apiKey", ++ "label": "API key", ++ "secret": true ++ }, ++ { ++ "key": "model", ++ "label": "Model", ++ "required": false, ++ "defaultValue": "whisper-1", ++ "placeholder": "whisper-1", ++ "description": "Transcription model identifier for the OpenAI speech-to-text adapter." ++ } ++ ] ++ }, ++ { ++ "id": "elevenlabs-stt", ++ "name": "ElevenLabs Speech to Text", ++ "runtime": "streaming", ++ "enabled": false, ++ "visibleInSettings": false, ++ "selectable": false, ++ "description": "AudioGraph-fed cloud STT route for the ElevenLabs speech-to-text API.", ++ "settings": [ ++ { ++ "key": "apiKey", ++ "label": "API key", ++ "secret": true ++ }, ++ { ++ "key": "model", ++ "label": "Model", ++ "required": false, ++ "defaultValue": "scribe_v1", ++ "placeholder": "scribe_v1", ++ "description": "Transcription model identifier for the ElevenLabs speech-to-text adapter." 
++ } ++ ] ++ }, ++ { ++ "id": "azure-ai-speech", ++ "name": "Azure AI Speech", ++ "runtime": "streaming", ++ "enabled": false, ++ "visibleInSettings": false, ++ "selectable": false, ++ "description": "AudioGraph-fed cloud STT route for Azure AI Speech real-time transcription.", ++ "settings": [ ++ { ++ "key": "apiKey", ++ "label": "API key", ++ "secret": true ++ }, ++ { ++ "key": "endpoint", ++ "label": "Endpoint", ++ "required": false, ++ "defaultValue": "", ++ "placeholder": "https://your-speech-resource.cognitiveservices.azure.com", ++ "description": "Azure AI Speech endpoint for the streaming STT adapter." ++ } ++ ] ++ }, ++ { ++ "id": "sherpa-onnx", ++ "name": "sherpa-onnx", ++ "runtime": "embedded", ++ "enabled": false, ++ "visibleInSettings": true, ++ "selectable": false, ++ "description": "Can load a variety of models including OpenAI/Whisper, full-duplex, streaming.", ++ "settings": [ ++ { ++ "key": "modelPath", ++ "label": "Model path", ++ "required": false, ++ "defaultValue": "", ++ "placeholder": "C:\\models\\sherpa-onnx\\model.onnx", ++ "description": "Path to the downloaded sherpa-onnx model bundle the embedded STT route should use." ++ }, ++ { ++ "key": "model", ++ "label": "Model preset", ++ "required": false, ++ "defaultValue": "", ++ "placeholder": "tiny / base / small / medium", ++ "description": "Optional human-readable model preset to help track which local bundle is selected." ++ } ++ ] ++ } ++ ], ++ "textToSpeechProviders": [ ++ { ++ "id": "windows", ++ "name": "Windows Speech Synthesis", ++ "runtime": "windows", ++ "enabled": true, ++ "description": "Built-in Windows text-to-speech playback." 
++ }, ++ { ++ "id": "minimax", ++ "name": "MiniMax", ++ "runtime": "cloud", ++ "enabled": true, ++ "description": "Cloud TTS using the MiniMax HTTP text-to-speech API.", ++ "settings": [ ++ { ++ "key": "apiKey", ++ "label": "API key", ++ "secret": true ++ }, ++ { ++ "key": "model", ++ "label": "Model", ++ "defaultValue": "speech-2.8-turbo", ++ "options": [ ++ "speech-2.5-turbo-preview", ++ "speech-02-turbo", ++ "speech-02-hd", ++ "speech-2.6-turbo", ++ "speech-2.6-hd", ++ "speech-2.8-turbo", ++ "speech-2.8-hd" ++ ] ++ }, ++ { ++ "key": "voiceId", ++ "label": "Voice ID", ++ "required": false, ++ "defaultValue": "English_MatureBoss" ++ }, ++ { ++ "key": "voiceSettingsJson", ++ "label": "Voice settings JSON", ++ "required": false, ++ "jsonValue": true, ++ "defaultValue": "\"voice_setting\": { \"voice_id\": {{voiceId}}, \"speed\": 1, \"vol\": 1, \"pitch\": 0 }", ++ "placeholder": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }", ++ "description": "Optional full MiniMax request fragment. If present, it controls the full voice_setting payload." 
++ } ++ ], ++ "textToSpeechWebSocket": { ++ "endpointTemplate": "wss://api.minimax.io/ws/v1/t2a_v2", ++ "authenticationHeaderName": "Authorization", ++ "authenticationScheme": "Bearer", ++ "apiKeySettingKey": "apiKey", ++ "connectSuccessEventName": "connected_success", ++ "startMessageTemplate": "{ \"event\": \"task_start\", \"model\": {{model}}, \"language_boost\": \"English\", {{voiceSettingsJson}}, \"audio_setting\": { \"sample_rate\": 32000, \"bitrate\": 128000, \"format\": \"mp3\", \"channel\": 1 } }", ++ "startSuccessEventName": "task_started", ++ "continueMessageTemplate": "{ \"event\": \"task_continue\", \"text\": {{text}} }", ++ "finishMessageTemplate": "{ \"event\": \"task_finish\" }", ++ "responseAudioMode": "hexJsonString", ++ "responseAudioJsonPath": "data.audio", ++ "responseStatusCodeJsonPath": "base_resp.status_code", ++ "responseStatusMessageJsonPath": "base_resp.status_msg", ++ "finalFlagJsonPath": "is_final", ++ "taskFailedEventName": "task_failed", ++ "successStatusValue": "0", ++ "outputContentType": "audio/mpeg" ++ } ++ }, ++ { ++ "id": "elevenlabs", ++ "name": "ElevenLabs", ++ "runtime": "cloud", ++ "enabled": true, ++ "description": "Cloud TTS using the ElevenLabs WebSocket stream-input API.", ++ "settings": [ ++ { ++ "key": "apiKey", ++ "label": "API key", ++ "secret": true ++ }, ++ { ++ "key": "model", ++ "label": "Model", ++ "defaultValue": "eleven_multilingual_v2", ++ "options": [ ++ "eleven_flash_v2_5", ++ "eleven_turbo_v2_5", ++ "eleven_multilingual_v2", ++ "eleven_monolingual_v1" ++ ] ++ }, ++ { ++ "key": "voiceId", ++ "label": "Voice ID", ++ "required": false, ++ "defaultValue": "6aDn1KB0hjpdcocrUkmq", ++ "placeholder": "Enter an ElevenLabs voice ID" ++ }, ++ { ++ "key": "voiceSettingsJson", ++ "label": "Voice settings JSON", ++ "required": false, ++ "jsonValue": true, ++ "defaultValue": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }", ++ "placeholder": "\"voice_settings\": { \"speed\": 0.9, 
\"stability\": 0.5, \"similarity_boost\": 0.75 }", ++ "description": "Optional full ElevenLabs request fragment. If present, it controls the full voice_settings payload." ++ } ++ ], ++ "textToSpeechWebSocket": { ++ "endpointTemplate": "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true", ++ "authenticationHeaderName": "xi-api-key", ++ "authenticationScheme": "", ++ "apiKeySettingKey": "apiKey", ++ "connectSuccessEventName": "", ++ "startMessageTemplate": "{ \"text\": \" \", {{voiceSettingsJson}}, \"xi_api_key\": {{apiKey}} }", ++ "startSuccessEventName": "", ++ "continueMessageTemplate": "{ \"text\": {{textWithTrailingSpace}}, \"try_trigger_generation\": true }", ++ "finishMessageTemplate": "{ \"text\": \"\" }", ++ "responseAudioMode": "base64JsonString", ++ "responseAudioJsonPath": "audio", ++ "finalFlagJsonPath": "isFinal", ++ "taskFailedEventName": "error", ++ "outputContentType": "audio/mpeg" ++ } ++ } ++ ] ++} +diff --git a/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml b/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml +new file mode 100644 +index 0000000..cadffd4 +--- /dev/null ++++ b/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml +@@ -0,0 +1,111 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml.cs +new file mode 100644 +index 0000000..017a9cf +--- /dev/null ++++ b/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml.cs +@@ -0,0 +1,563 @@ ++using Microsoft.UI.Windowing; ++using Microsoft.UI.Dispatching; ++using Microsoft.UI.Xaml; ++using Microsoft.UI.Xaml.Controls; ++using OpenClaw.Shared; ++using OpenClawTray.Helpers; ++using OpenClawTray.Services; ++using 
OpenClawTray.Services.Voice; ++using System; ++using System.Collections.ObjectModel; ++using System.ComponentModel; ++using System.Runtime.CompilerServices; ++using System.Threading.Tasks; ++using Windows.Graphics; ++using WinUIEx; ++ ++namespace OpenClawTray.Windows; ++ ++public sealed partial class VoiceRepeaterWindow : WindowEx, IVoiceChatWindow ++{ ++ private const int MaxConversationItems = 24; ++ private const int DefaultWidth = 360; ++ private const int DefaultHeight = 170; ++ private const int DefaultMargin = 12; ++ private const double DefaultTextSize = 13; ++ private const double DefaultCaptionSize = 10; ++ ++ private readonly SettingsManager _settings; ++ private readonly IVoiceRuntimeControlApi _voiceRuntimeControlApi; ++ private readonly ObservableCollection _conversationItems = []; ++ private readonly DispatcherQueueTimer? _refreshTimer; ++ private readonly DispatcherQueueTimer? _layoutSaveTimer; ++ ++ private bool _controlActionInFlight; ++ private bool _suppressSettingsEvents; ++ private bool _suppressPlacementSave = true; ++ private bool _initialPlacementPending = true; ++ private bool _placementDirty; ++ private bool _autoScrollEnabled; ++ private double _messageFontSize = DefaultTextSize; ++ private double _captionFontSize = DefaultCaptionSize; ++ ++ public bool IsClosed { get; private set; } ++ ++ public event EventHandler? 
OpenVoiceStatusRequested; ++ ++ public VoiceRepeaterWindow( ++ SettingsManager settings, ++ IVoiceRuntimeControlApi voiceRuntimeControlApi) ++ { ++ _settings = settings; ++ _voiceRuntimeControlApi = voiceRuntimeControlApi; ++ _autoScrollEnabled = _settings.VoiceRepeaterWindow.AutoScroll; ++ ++ InitializeComponent(); ++ ++ Title = "Voice Mode"; ++ ApplyStoredWindowPlacement(); ++ this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected)); ++ ++ ConversationItemsControl.ItemsSource = _conversationItems; ++ ++ Closed += OnWindowClosed; ++ Activated += OnWindowActivated; ++ ++ var dispatcherQueue = DispatcherQueue.GetForCurrentThread(); ++ if (dispatcherQueue != null) ++ { ++ _refreshTimer = dispatcherQueue.CreateTimer(); ++ _refreshTimer.Interval = TimeSpan.FromMilliseconds(400); ++ _refreshTimer.Tick += (_, _) => RefreshStatus(); ++ _refreshTimer.Start(); ++ ++ _layoutSaveTimer = dispatcherQueue.CreateTimer(); ++ _layoutSaveTimer.Interval = TimeSpan.FromMilliseconds(600); ++ _layoutSaveTimer.IsRepeating = false; ++ _layoutSaveTimer.Tick += (_, _) => ++ { ++ _layoutSaveTimer.Stop(); ++ SaveWindowPlacement(); ++ }; ++ } ++ ++ if (AppWindow is not null) ++ { ++ AppWindow.Changed += OnAppWindowChanged; ++ } ++ ++ ApplyViewSettings(); ++ RefreshStatus(); ++ UpdateConversationPlaceholder(); ++ } ++ ++ public void RefreshStatus() ++ { ++ var status = _voiceRuntimeControlApi.CurrentStatus; ++ ApplyStatus(status); ++ } ++ ++ public Task UpdateVoiceTranscriptDraftAsync(string text, bool clear) ++ { ++ var draftText = clear ? string.Empty : (text ?? string.Empty); ++ DraftTextBlock.Text = draftText; ++ DraftPanel.Visibility = string.IsNullOrWhiteSpace(draftText) ++ ? 
Visibility.Collapsed ++ : Visibility.Visible; ++ ++ UpdateConversationPlaceholder(); ++ ScrollConversationToEnd(); ++ return Task.CompletedTask; ++ } ++ ++ public Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args) ++ { ++ if (args == null || string.IsNullOrWhiteSpace(args.Message)) ++ { ++ return Task.CompletedTask; ++ } ++ ++ var item = new ConversationItem( ++ args.Direction == VoiceConversationDirection.Outgoing ? "You" : "Assistant", ++ DateTime.Now.ToString("HH:mm:ss"), ++ args.Message, ++ _messageFontSize, ++ _captionFontSize); ++ ++ _conversationItems.Add(item); ++ while (_conversationItems.Count > MaxConversationItems) ++ { ++ _conversationItems.RemoveAt(0); ++ } ++ ++ UpdateConversationPlaceholder(); ++ ScrollConversationToEnd(); ++ return Task.CompletedTask; ++ } ++ ++ private async void OnPauseResumeClick(object sender, RoutedEventArgs e) ++ { ++ if (_controlActionInFlight) ++ { ++ return; ++ } ++ ++ _controlActionInFlight = true; ++ ApplyStatus(_voiceRuntimeControlApi.CurrentStatus); ++ ++ try ++ { ++ var status = _voiceRuntimeControlApi.CurrentStatus; ++ if (status.State == VoiceRuntimeState.Paused) ++ { ++ await _voiceRuntimeControlApi.ResumeAsync(new VoiceResumeArgs { Reason = "Voice repeater resume button" }); ++ } ++ else ++ { ++ await _voiceRuntimeControlApi.PauseAsync(new VoicePauseArgs { Reason = "Voice repeater pause button" }); ++ } ++ } ++ finally ++ { ++ _controlActionInFlight = false; ++ RefreshStatus(); ++ } ++ } ++ ++ private async void OnSkipReplyClick(object sender, RoutedEventArgs e) ++ { ++ if (_controlActionInFlight || !_voiceRuntimeControlApi.CurrentStatus.CanSkipReply) ++ { ++ return; ++ } ++ ++ _controlActionInFlight = true; ++ ApplyStatus(_voiceRuntimeControlApi.CurrentStatus); ++ ++ try ++ { ++ await _voiceRuntimeControlApi.SkipCurrentReplyAsync(new VoiceSkipArgs ++ { ++ Reason = "Voice repeater skip button" ++ }); ++ } ++ finally ++ { ++ _controlActionInFlight = false; ++ RefreshStatus(); ++ } ++ } ++ ++ 
private void OnAutoScrollChanged(object sender, RoutedEventArgs e) ++ { ++ if (_suppressSettingsEvents) ++ { ++ return; ++ } ++ ++ _autoScrollEnabled = AutoScrollCheckBox.IsChecked == true; ++ _settings.VoiceRepeaterWindow.AutoScroll = _autoScrollEnabled; ++ _settings.Save(logSuccess: false); ++ ++ if (_autoScrollEnabled) ++ { ++ ScrollConversationToEnd(); ++ } ++ } ++ ++ private void OnTextSizeSelectionChanged(object sender, SelectionChangedEventArgs e) ++ { ++ if (_suppressSettingsEvents || TextSizeComboBox.SelectedItem is not ComboBoxItem item) ++ { ++ return; ++ } ++ ++ if (!double.TryParse(item.Tag?.ToString(), out var size)) ++ { ++ return; ++ } ++ ++ _settings.VoiceRepeaterWindow.TextSize = size; ++ ApplyViewSettings(); ++ _settings.Save(logSuccess: false); ++ } ++ ++ private void OnFloatingEnabledChanged(object sender, RoutedEventArgs e) ++ { ++ if (_suppressSettingsEvents) ++ { ++ return; ++ } ++ ++ var enabled = FloatingEnabledCheckBox.IsChecked == true; ++ _settings.VoiceRepeaterWindow.FloatingEnabled = enabled; ++ IsAlwaysOnTop = enabled; ++ _settings.Save(logSuccess: false); ++ } ++ ++ private void OnOpenVoiceStatusClick(object sender, RoutedEventArgs e) ++ { ++ OpenVoiceStatusRequested?.Invoke(this, EventArgs.Empty); ++ } ++ ++ private void OnWindowClosed(object sender, WindowEventArgs e) ++ { ++ if (_refreshTimer != null) ++ { ++ _refreshTimer.Stop(); ++ } ++ ++ if (_layoutSaveTimer != null) ++ { ++ _layoutSaveTimer.Stop(); ++ } ++ ++ if (AppWindow is not null) ++ { ++ AppWindow.Changed -= OnAppWindowChanged; ++ } ++ ++ Activated -= OnWindowActivated; ++ FlushWindowPlacement(); ++ IsClosed = true; ++ } ++ ++ private void OnWindowActivated(object sender, WindowActivatedEventArgs args) ++ { ++ if (!_initialPlacementPending) ++ { ++ return; ++ } ++ ++ _initialPlacementPending = false; ++ ApplyStoredWindowPlacement(); ++ ++ var dispatcherQueue = DispatcherQueue.GetForCurrentThread(); ++ _ = dispatcherQueue?.TryEnqueue(() => _suppressPlacementSave = 
false); ++ } ++ ++ private void OnAppWindowChanged(AppWindow sender, AppWindowChangedEventArgs args) ++ { ++ if (_suppressPlacementSave) ++ { ++ return; ++ } ++ ++ if (args.DidPositionChange || args.DidSizeChange) ++ { ++ _placementDirty = true; ++ _layoutSaveTimer?.Stop(); ++ _layoutSaveTimer?.Start(); ++ } ++ } ++ ++ private void ApplyStatus(VoiceStatusInfo status) ++ { ++ Title = $"Voice Mode ({GetWindowStateLabel(status)})"; ++ DraftCaptionTextBlock.Text = status.State == VoiceRuntimeState.RecordingUtterance ++ ? "You (speaking)" ++ : "You (draft)"; ++ ++ if (string.IsNullOrWhiteSpace(status.LastError)) ++ { ++ TroubleshootingTextBlock.Visibility = Visibility.Collapsed; ++ TroubleshootingTextBlock.Text = string.Empty; ++ } ++ else ++ { ++ TroubleshootingTextBlock.Visibility = Visibility.Visible; ++ TroubleshootingTextBlock.Text = status.LastError; ++ } ++ ++ var paused = status.State == VoiceRuntimeState.Paused; ++ PauseResumeButton.IsEnabled = !_controlActionInFlight && status.Mode != VoiceActivationMode.Off; ++ PauseResumeIcon.Symbol = paused ? Symbol.Play : Symbol.Pause; ++ ToolTipService.SetToolTip( ++ PauseResumeButton, ++ paused ? "Resume voice mode" : "Pause voice mode"); ++ ++ SkipReplyButton.IsEnabled = !_controlActionInFlight && status.CanSkipReply; ++ } ++ ++ private void ApplyStoredWindowPlacement() ++ { ++ if (AppWindow is null) ++ { ++ return; ++ } ++ ++ var prefs = _settings.VoiceRepeaterWindow; ++ var width = prefs.HasSavedPlacement ++ ? prefs.Width.GetValueOrDefault(DefaultWidth) ++ : DefaultWidth; ++ var height = prefs.HasSavedPlacement ++ ? prefs.Height.GetValueOrDefault(DefaultHeight) ++ : DefaultHeight; ++ var clampedWidth = Math.Max(width, 320); ++ var clampedHeight = Math.Max(height, 150); ++ ++ IsAlwaysOnTop = prefs.FloatingEnabled; ++ ++ var targetRect = prefs.HasSavedPlacement && prefs.X.HasValue && prefs.Y.HasValue ++ ? 
new RectInt32(prefs.X.Value, prefs.Y.Value, clampedWidth, clampedHeight) ++ : GetDefaultAnchorRect(clampedWidth, clampedHeight); ++ ++ if (!IsPlacementVisible(targetRect)) ++ { ++ targetRect = GetDefaultAnchorRect(clampedWidth, clampedHeight); ++ } ++ ++ try ++ { ++ AppWindow.MoveAndResize(targetRect); ++ } ++ catch ++ { ++ this.SetWindowSize(targetRect.Width, targetRect.Height); ++ AppWindow.Move(new PointInt32(targetRect.X, targetRect.Y)); ++ } ++ } ++ ++ private void ApplyViewSettings() ++ { ++ _suppressSettingsEvents = true; ++ try ++ { ++ _autoScrollEnabled = _settings.VoiceRepeaterWindow.AutoScroll; ++ _messageFontSize = Math.Clamp( ++ _settings.VoiceRepeaterWindow.TextSize > 0 ? _settings.VoiceRepeaterWindow.TextSize : DefaultTextSize, ++ 11, ++ 15); ++ _captionFontSize = Math.Max(9, _messageFontSize - 3); ++ ++ DraftTextBlock.FontSize = _messageFontSize; ++ DraftCaptionTextBlock.FontSize = _captionFontSize; ++ TroubleshootingTextBlock.FontSize = _captionFontSize; ++ ++ foreach (var item in _conversationItems) ++ { ++ item.MessageFontSize = _messageFontSize; ++ item.CaptionFontSize = _captionFontSize; ++ } ++ ++ AutoScrollCheckBox.IsChecked = _autoScrollEnabled; ++ FloatingEnabledCheckBox.IsChecked = _settings.VoiceRepeaterWindow.FloatingEnabled; ++ SelectTextSizeItem(_messageFontSize); ++ } ++ finally ++ { ++ _suppressSettingsEvents = false; ++ } ++ } ++ ++ private void SaveWindowPlacement() ++ { ++ if (IsClosed || AppWindow is null || _suppressPlacementSave) ++ { ++ return; ++ } ++ ++ var size = AppWindow.Size; ++ var position = AppWindow.Position; ++ _settings.VoiceRepeaterWindow.Width = size.Width; ++ _settings.VoiceRepeaterWindow.Height = size.Height; ++ _settings.VoiceRepeaterWindow.X = position.X; ++ _settings.VoiceRepeaterWindow.Y = position.Y; ++ _settings.VoiceRepeaterWindow.HasSavedPlacement = true; ++ _settings.Save(logSuccess: false); ++ _placementDirty = false; ++ } ++ ++ private void FlushWindowPlacement() ++ { ++ if (_placementDirty || 
!IsClosed) ++ { ++ SaveWindowPlacement(); ++ } ++ } ++ ++ private RectInt32 GetDefaultAnchorRect(int width, int height) ++ { ++ var displayArea = DisplayArea.Primary; ++ var x = displayArea.WorkArea.X + DefaultMargin; ++ var y = displayArea.WorkArea.Y + Math.Max(DefaultMargin, displayArea.WorkArea.Height - height - DefaultMargin); ++ return new RectInt32(x, y, width, height); ++ } ++ ++ private static bool IsPlacementVisible(RectInt32 rect) ++ { ++ try ++ { ++ var displayArea = DisplayArea.GetFromRect(rect, DisplayAreaFallback.Nearest); ++ var workArea = displayArea.WorkArea; ++ return rect.Width > 0 && ++ rect.Height > 0 && ++ rect.X < workArea.X + workArea.Width && ++ rect.X + rect.Width > workArea.X && ++ rect.Y < workArea.Y + workArea.Height && ++ rect.Y + rect.Height > workArea.Y; ++ } ++ catch ++ { ++ return false; ++ } ++ } ++ ++ private void SelectTextSizeItem(double size) ++ { ++ var sizeTag = ((int)Math.Round(size)).ToString(); ++ foreach (var entry in TextSizeComboBox.Items) ++ { ++ if (entry is ComboBoxItem item && string.Equals(item.Tag?.ToString(), sizeTag, StringComparison.Ordinal)) ++ { ++ TextSizeComboBox.SelectedItem = item; ++ return; ++ } ++ } ++ ++ TextSizeComboBox.SelectedIndex = 2; ++ } ++ ++ private void UpdateConversationPlaceholder() ++ { ++ EmptyConversationTextBlock.Visibility = _conversationItems.Count == 0 && DraftPanel.Visibility != Visibility.Visible ++ ? 
Visibility.Visible ++ : Visibility.Collapsed; ++ } ++ ++ private void ScrollConversationToEnd() ++ { ++ if (!_autoScrollEnabled) ++ { ++ return; ++ } ++ ++ var dispatcherQueue = DispatcherQueue.GetForCurrentThread(); ++ _ = dispatcherQueue?.TryEnqueue(() => ++ { ++ ConversationScrollViewer.UpdateLayout(); ++ ConversationScrollViewer.ChangeView(null, ConversationScrollViewer.ScrollableHeight, null, true); ++ _ = dispatcherQueue.TryEnqueue(() => ++ ConversationScrollViewer.ChangeView(null, ConversationScrollViewer.ScrollableHeight, null, true)); ++ }); ++ } ++ ++ private static string GetWindowStateLabel(VoiceStatusInfo status) ++ { ++ return status.State switch ++ { ++ VoiceRuntimeState.ListeningForVoiceWake => "listening", ++ VoiceRuntimeState.ListeningContinuously => "listening", ++ VoiceRuntimeState.RecordingUtterance => "hearing you", ++ VoiceRuntimeState.AwaitingResponse => "waiting", ++ VoiceRuntimeState.PlayingResponse => "speaking", ++ VoiceRuntimeState.Paused => "paused", ++ VoiceRuntimeState.Arming => "starting", ++ VoiceRuntimeState.Error => "error", ++ _ when status.Mode == VoiceActivationMode.Off => "off", ++ _ => "idle" ++ }; ++ } ++ ++ private sealed class ConversationItem : INotifyPropertyChanged ++ { ++ private double _messageFontSize; ++ private double _captionFontSize; ++ ++ public ConversationItem( ++ string speaker, ++ string timestamp, ++ string message, ++ double messageFontSize, ++ double captionFontSize) ++ { ++ Speaker = speaker; ++ Timestamp = timestamp; ++ Message = message; ++ _messageFontSize = messageFontSize; ++ _captionFontSize = captionFontSize; ++ } ++ ++ public string Speaker { get; } ++ public string Timestamp { get; } ++ public string Message { get; } ++ public string Caption => $"{Speaker} · {Timestamp}"; ++ ++ public double MessageFontSize ++ { ++ get => _messageFontSize; ++ set ++ { ++ if (Math.Abs(_messageFontSize - value) < 0.01) ++ { ++ return; ++ } ++ ++ _messageFontSize = value; ++ OnPropertyChanged(); ++ } ++ } ++ ++ 
public double CaptionFontSize ++ { ++ get => _captionFontSize; ++ set ++ { ++ if (Math.Abs(_captionFontSize - value) < 0.01) ++ { ++ return; ++ } ++ ++ _captionFontSize = value; ++ OnPropertyChanged(); ++ } ++ } ++ ++ public event PropertyChangedEventHandler? PropertyChanged; ++ ++ private void OnPropertyChanged([CallerMemberName] string? propertyName = null) ++ { ++ PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(propertyName)); ++ } ++ } ++} +diff --git a/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomBridge.cs b/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomBridge.cs +new file mode 100644 +index 0000000..fbe5845 +--- /dev/null ++++ b/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomBridge.cs +@@ -0,0 +1,95 @@ ++using System.Text.Json; ++ ++namespace OpenClawTray.Windows; ++ ++internal static class WebChatVoiceDomBridge ++{ ++ public const string DocumentCreatedScript = """ ++(() => { ++ const isVisible = (el) => !!el && !(el.disabled === true) && el.getClientRects().length > 0; ++ let desiredDraft = ''; ++ ++ const findComposer = () => { ++ const candidates = Array.from(document.querySelectorAll('textarea, input[type="text"], [contenteditable="true"], [contenteditable="plaintext-only"]')); ++ return candidates.find(isVisible) || null; ++ }; ++ ++ const setElementValue = (el, value) => { ++ const text = typeof value === 'string' ? value : ''; ++ if ('value' in el) { ++ const proto = el.tagName === 'TEXTAREA' ? 
HTMLTextAreaElement.prototype : HTMLInputElement.prototype; ++ const descriptor = Object.getOwnPropertyDescriptor(proto, 'value'); ++ if (descriptor && descriptor.set) { ++ descriptor.set.call(el, text); ++ } else { ++ el.value = text; ++ } ++ el.dispatchEvent(new InputEvent('input', { bubbles: true, data: text, inputType: 'insertText' })); ++ el.dispatchEvent(new Event('change', { bubbles: true })); ++ return; ++ } ++ ++ if (el.isContentEditable) { ++ el.textContent = text; ++ el.dispatchEvent(new InputEvent('input', { bubbles: true, data: text, inputType: 'insertText' })); ++ el.dispatchEvent(new Event('change', { bubbles: true })); ++ } ++ }; ++ ++ const applyDraftIfPossible = () => { ++ const composer = findComposer(); ++ if (!composer) return false; ++ setElementValue(composer, desiredDraft); ++ return true; ++ }; ++ ++ const clearLegacyTurnsHost = () => { ++ const host = document.getElementById('openclaw-tray-voice-turns'); ++ if (host) { ++ host.remove(); ++ } ++ }; ++ ++ const observer = new MutationObserver(() => applyDraftIfPossible()); ++ const start = () => { ++ if (!document.body) return; ++ observer.observe(document.body, { childList: true, subtree: true }); ++ applyDraftIfPossible(); ++ clearLegacyTurnsHost(); ++ }; ++ ++ if (document.readyState === 'loading') { ++ document.addEventListener('DOMContentLoaded', start, { once: true }); ++ } else { ++ start(); ++ } ++ ++ window.__openClawTrayVoice = { ++ setDraft(text) { ++ desiredDraft = text || ''; ++ return applyDraftIfPossible(); ++ }, ++ clearDraft() { ++ desiredDraft = ''; ++ return applyDraftIfPossible(); ++ }, ++ setTurns() { ++ clearLegacyTurnsHost(); ++ return true; ++ } ++ }; ++})(); ++"""; ++ ++ public static string BuildSetDraftScript(string? 
text) ++ { ++ if (string.IsNullOrWhiteSpace(text)) ++ { ++ return "window.__openClawTrayVoice?.clearDraft?.();"; ++ } ++ ++ return $"window.__openClawTrayVoice?.setDraft?.({JsonSerializer.Serialize(text)});"; ++ } ++ ++ public const string ClearLegacyTurnsScript = "window.__openClawTrayVoice?.setTurns?.([]);"; ++} +diff --git a/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomState.cs b/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomState.cs +new file mode 100644 +index 0000000..59cdcfe +--- /dev/null ++++ b/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomState.cs +@@ -0,0 +1,15 @@ ++namespace OpenClawTray.Windows; ++ ++internal sealed class WebChatVoiceDomState ++{ ++ public WebChatVoiceDomState() ++ { ++ } ++ ++ public string PendingDraft { get; private set; } = string.Empty; ++ ++ public void SetDraft(string? text, bool clear) ++ { ++ PendingDraft = clear ? string.Empty : (text ?? string.Empty); ++ } ++} +diff --git a/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs +index 8a6bc4b..1b0f000 100644 +--- a/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs ++++ b/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs +@@ -3,6 +3,7 @@ + using OpenClaw.Shared; + using OpenClawTray.Helpers; + using OpenClawTray.Services; ++using OpenClawTray.Services.Voice; + using System; + using System.Diagnostics; + using System.IO; +@@ -14,14 +15,16 @@ + namespace OpenClawTray.Windows; + + public sealed partial class WebChatWindow : WindowEx ++ , IVoiceChatWindow + { + private readonly string _gatewayUrl; + private readonly string _token; +- +- // Store event handlers for cleanup ++ private readonly WebChatVoiceDomState _voiceDomState; ++ private bool _voiceDomReady; ++ + private TypedEventHandler? _navigationCompletedHandler; + private TypedEventHandler? 
_navigationStartingHandler; +- ++ + public bool IsClosed { get; private set; } + + public WebChatWindow(string gatewayUrl, string token) +@@ -29,18 +32,18 @@ public WebChatWindow(string gatewayUrl, string token) + Logger.Info($"WebChatWindow: Constructor called, gateway={gatewayUrl}"); + _gatewayUrl = gatewayUrl; + _token = token; +- ++ _voiceDomState = new WebChatVoiceDomState(); ++ + InitializeComponent(); +- +- // Window configuration ++ + this.SetWindowSize(520, 750); + this.MinWidth = 380; + this.MinHeight = 450; + this.CenterOnScreen(); + this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected)); +- ++ + Closed += OnWindowClosed; +- ++ + Logger.Info("WebChatWindow: Starting InitializeWebViewAsync"); + _ = InitializeWebViewAsync(); + } +@@ -48,8 +51,8 @@ public WebChatWindow(string gatewayUrl, string token) + private void OnWindowClosed(object sender, WindowEventArgs e) + { + IsClosed = true; +- +- // Cleanup WebView2 event handlers ++ _voiceDomReady = false; ++ + if (WebView.CoreWebView2 != null) + { + if (_navigationCompletedHandler != null) +@@ -64,35 +67,39 @@ private async Task InitializeWebViewAsync() + try + { + Logger.Info("WebChatWindow: Initializing WebView2..."); +- +- // Set up user data folder for WebView2 ++ + var userDataFolder = Path.Combine( + Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), + "OpenClawTray", "WebView2"); +- ++ + Directory.CreateDirectory(userDataFolder); + Logger.Info($"WebChatWindow: User data folder: {userDataFolder}"); + +- // Set environment variable for user data folder + Environment.SetEnvironmentVariable("WEBVIEW2_USER_DATA_FOLDER", userDataFolder); +- ++ + Logger.Info("WebChatWindow: Calling EnsureCoreWebView2Async..."); + await WebView.EnsureCoreWebView2Async(); + Logger.Info("WebChatWindow: CoreWebView2 initialized successfully"); +- +- // Configure WebView2 ++ + WebView.CoreWebView2.Settings.IsStatusBarEnabled = false; + 
WebView.CoreWebView2.Settings.AreDefaultContextMenusEnabled = true; + WebView.CoreWebView2.Settings.IsZoomControlEnabled = true; ++ await WebView.CoreWebView2.AddScriptToExecuteOnDocumentCreatedAsync(WebChatVoiceDomBridge.DocumentCreatedScript); ++ ++ _voiceDomReady = false; + +- // Handle navigation events (store for cleanup) + _navigationCompletedHandler = (s, e) => + { + Logger.Info($"WebChatWindow: Navigation completed, success={e.IsSuccess}, status={e.WebErrorStatus}"); + LoadingRing.IsActive = false; + LoadingRing.Visibility = Visibility.Collapsed; +- +- // Show friendly error if connection failed ++ _voiceDomReady = e.IsSuccess; ++ ++ if (e.IsSuccess) ++ { ++ _ = RefreshTrayVoiceDomStateAsync(); ++ } ++ + if (!e.IsSuccess && (e.WebErrorStatus == CoreWebView2WebErrorStatus.ConnectionAborted || + e.WebErrorStatus == CoreWebView2WebErrorStatus.CannotConnect || + e.WebErrorStatus == CoreWebView2WebErrorStatus.ConnectionReset || +@@ -115,15 +122,14 @@ private async Task InitializeWebViewAsync() + + _navigationStartingHandler = (s, e) => + { +- // Strip query params to avoid logging tokens + var safeUri = e.Uri?.Split('?')[0] ?? 
"unknown"; + Logger.Info($"WebChatWindow: Navigation starting to {safeUri}"); ++ _voiceDomReady = false; + LoadingRing.IsActive = true; + LoadingRing.Visibility = Visibility.Visible; + }; + WebView.CoreWebView2.NavigationStarting += _navigationStartingHandler; + +- // Navigate to chat + NavigateToChat(); + } + catch (Exception ex) +@@ -135,13 +141,12 @@ private async Task InitializeWebViewAsync() + Logger.Error($"WebView2 inner exception: {ex.InnerException.GetType().FullName}: {ex.InnerException.Message}"); + } + Logger.Error($"WebView2 stack trace: {ex.StackTrace}"); +- +- // Show error in the dialog instead of falling back to browser ++ + LoadingRing.IsActive = false; + LoadingRing.Visibility = Visibility.Collapsed; + WebView.Visibility = Visibility.Collapsed; + ErrorPanel.Visibility = Visibility.Visible; +- ++ + var errorDetails = $"Exception: {ex.GetType().FullName}\n" + + $"HResult: 0x{ex.HResult:X8}\n" + + $"Message: {ex.Message}\n\n" + +@@ -149,17 +154,16 @@ private async Task InitializeWebViewAsync() + $"Architecture: {RuntimeInformation.ProcessArchitecture}\n" + + $"OS: {RuntimeInformation.OSDescription}\n\n" + + $"Stack Trace:\n{ex.StackTrace}"; +- ++ + if (ex.InnerException != null) + { + errorDetails += $"\n\nInner Exception: {ex.InnerException.GetType().FullName}\n{ex.InnerException.Message}"; + } +- ++ + ErrorText.Text = errorDetails; + } + } + +- // Set to a test URL to bypass gateway (e.g., "https://www.bing.com"), or null for normal operation + private const string? 
DEBUG_TEST_URL = null; + + private static bool IsLocalHost(Uri uri) +@@ -208,12 +212,11 @@ private void ShowErrorMessage(string message) + ErrorPanel.Visibility = Visibility.Visible; + ErrorText.Text = message; + } +- ++ + private void NavigateToChat() + { + if (WebView.CoreWebView2 == null) return; + +- // If debug URL is set, use it instead of gateway + if (!string.IsNullOrEmpty(DEBUG_TEST_URL)) + { + Logger.Info($"WebChatWindow: DEBUG MODE - Navigating to test URL: {DEBUG_TEST_URL}"); +@@ -251,7 +254,7 @@ private void OnPopout(object sender, RoutedEventArgs e) + ShowErrorMessage(errorMessage); + return; + } +- ++ + try + { + Process.Start(new ProcessStartInfo(url) { UseShellExecute = true }); +@@ -266,4 +269,34 @@ private void OnDevTools(object sender, RoutedEventArgs e) + { + WebView.CoreWebView2?.OpenDevToolsWindow(); + } ++ ++ public async Task UpdateVoiceTranscriptDraftAsync(string text, bool clear) ++ { ++ _voiceDomState.SetDraft(text, clear); ++ await RefreshTrayVoiceDomStateAsync(); ++ } ++ ++ public async Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args) ++ { ++ await Task.CompletedTask; ++ } ++ ++ private async Task RefreshTrayVoiceDomStateAsync() ++ { ++ if (WebView.CoreWebView2 == null || !_voiceDomReady || IsClosed) ++ { ++ return; ++ } ++ ++ try ++ { ++ await WebView.CoreWebView2.ExecuteScriptAsync( ++ WebChatVoiceDomBridge.BuildSetDraftScript(_voiceDomState.PendingDraft)); ++ await WebView.CoreWebView2.ExecuteScriptAsync(WebChatVoiceDomBridge.ClearLegacyTurnsScript); ++ } ++ catch (Exception ex) ++ { ++ Logger.Warn($"WebChatWindow: Failed to apply voice DOM state: {ex.Message}"); ++ } ++ } + } +diff --git a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs +index 350cb8d..1bc23ce 100644 +--- a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs ++++ b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs +@@ -931,3 +931,234 @@ public async Task Snap_ReturnsError_WhenHandlerThrows() + 
Assert.Contains("Camera access blocked", res.Error); + } + } ++ ++public class VoiceCapabilityTests ++{ ++ private static JsonElement Parse(string json) ++ { ++ using var doc = JsonDocument.Parse(json); ++ return doc.RootElement.Clone(); ++ } ++ ++ [Fact] ++ public void CanHandle_VoiceCommands() ++ { ++ var cap = new VoiceCapability(NullLogger.Instance); ++ Assert.True(cap.CanHandle(VoiceCommands.ListDevices)); ++ Assert.True(cap.CanHandle(VoiceCommands.GetSettings)); ++ Assert.True(cap.CanHandle(VoiceCommands.SetSettings)); ++ Assert.True(cap.CanHandle(VoiceCommands.GetStatus)); ++ Assert.True(cap.CanHandle(VoiceCommands.Start)); ++ Assert.True(cap.CanHandle(VoiceCommands.Stop)); ++ Assert.False(cap.CanHandle("voice.unknown")); ++ Assert.Equal("voice", cap.Category); ++ } ++ ++ [Fact] ++ public async Task ListDevices_ReturnsArrayFromHandler() ++ { ++ var cap = new VoiceCapability(NullLogger.Instance); ++ cap.ListDevicesRequested += () => Task.FromResult( ++ [ ++ new VoiceAudioDeviceInfo ++ { ++ DeviceId = "default-input", ++ Name = "System default microphone", ++ IsDefault = true, ++ IsInput = true ++ } ++ ]); ++ ++ var res = await cap.ExecuteAsync(new NodeInvokeRequest ++ { ++ Id = "voice1", ++ Command = VoiceCommands.ListDevices, ++ Args = Parse("""{}""") ++ }); ++ ++ Assert.True(res.Ok); ++ var json = JsonSerializer.Serialize(res.Payload); ++ using var doc = JsonDocument.Parse(json); ++ Assert.Equal(JsonValueKind.Array, doc.RootElement.ValueKind); ++ Assert.Equal("default-input", doc.RootElement[0].GetProperty("DeviceId").GetString()); ++ } ++ ++ [Fact] ++ public async Task GetSettings_ReturnsSettingsFromHandler() ++ { ++ var cap = new VoiceCapability(NullLogger.Instance); ++ cap.SettingsRequested += () => Task.FromResult(new VoiceSettings ++ { ++ Enabled = true, ++ Mode = VoiceActivationMode.VoiceWake ++ }); ++ ++ var res = await cap.ExecuteAsync(new NodeInvokeRequest ++ { ++ Id = "voice2", ++ Command = VoiceCommands.GetSettings, ++ Args = Parse("""{}""") ++ 
}); ++ ++ Assert.True(res.Ok); ++ var json = JsonSerializer.Serialize(res.Payload); ++ using var doc = JsonDocument.Parse(json); ++ Assert.True(doc.RootElement.GetProperty("Enabled").GetBoolean()); ++ Assert.Equal("VoiceWake", doc.RootElement.GetProperty("Mode").GetString()); ++ } ++ ++ [Fact] ++ public async Task SetSettings_UsesUpdateEnvelope_WhenPresent() ++ { ++ var cap = new VoiceCapability(NullLogger.Instance); ++ VoiceSettingsUpdateArgs? received = null; ++ cap.SettingsUpdateRequested += update => ++ { ++ received = update; ++ return Task.FromResult(update.Settings); ++ }; ++ ++ var res = await cap.ExecuteAsync(new NodeInvokeRequest ++ { ++ Id = "voice3", ++ Command = VoiceCommands.SetSettings, ++ Args = Parse("""{"update":{"persist":false,"settings":{"enabled":true,"mode":"TalkMode"}}}""") ++ }); ++ ++ Assert.True(res.Ok); ++ Assert.NotNull(received); ++ Assert.False(received!.Persist); ++ Assert.Equal(VoiceActivationMode.TalkMode, received.Settings.Mode); ++ } ++ ++ [Fact] ++ public async Task GetStatus_ReturnsStatusFromHandler() ++ { ++ var cap = new VoiceCapability(NullLogger.Instance); ++ cap.StatusRequested += () => Task.FromResult(new VoiceStatusInfo ++ { ++ Available = true, ++ Running = true, ++ Mode = VoiceActivationMode.TalkMode, ++ State = VoiceRuntimeState.ListeningContinuously ++ }); ++ ++ var res = await cap.ExecuteAsync(new NodeInvokeRequest ++ { ++ Id = "voice4", ++ Command = VoiceCommands.GetStatus, ++ Args = Parse("""{}""") ++ }); ++ ++ Assert.True(res.Ok); ++ var json = JsonSerializer.Serialize(res.Payload); ++ using var doc = JsonDocument.Parse(json); ++ Assert.True(doc.RootElement.GetProperty("Running").GetBoolean()); ++ Assert.Equal("ListeningContinuously", doc.RootElement.GetProperty("State").GetString()); ++ } ++ ++ [Fact] ++ public async Task Start_PassesArgsToHandler() ++ { ++ var cap = new VoiceCapability(NullLogger.Instance); ++ VoiceStartArgs? 
received = null; ++ cap.StartRequested += args => ++ { ++ received = args; ++ return Task.FromResult(new VoiceStatusInfo ++ { ++ Available = true, ++ Running = true, ++ Mode = args.Mode ?? VoiceActivationMode.Off, ++ State = VoiceRuntimeState.ListeningForVoiceWake, ++ SessionKey = args.SessionKey ++ }); ++ }; ++ ++ var res = await cap.ExecuteAsync(new NodeInvokeRequest ++ { ++ Id = "voice5", ++ Command = VoiceCommands.Start, ++ Args = Parse("""{"mode":"VoiceWake","sessionKey":"session-123"}""") ++ }); ++ ++ Assert.True(res.Ok); ++ Assert.NotNull(received); ++ Assert.Equal(VoiceActivationMode.VoiceWake, received!.Mode); ++ Assert.Equal("session-123", received.SessionKey); ++ } ++ ++ [Fact] ++ public async Task Stop_PassesReasonToHandler() ++ { ++ var cap = new VoiceCapability(NullLogger.Instance); ++ VoiceStopArgs? received = null; ++ cap.StopRequested += args => ++ { ++ received = args; ++ return Task.FromResult(new VoiceStatusInfo ++ { ++ Available = true, ++ Running = false, ++ Mode = VoiceActivationMode.Off, ++ State = VoiceRuntimeState.Stopped, ++ LastError = args.Reason ++ }); ++ }; ++ ++ var res = await cap.ExecuteAsync(new NodeInvokeRequest ++ { ++ Id = "voice6", ++ Command = VoiceCommands.Stop, ++ Args = Parse("""{"reason":"user requested"}""") ++ }); ++ ++ Assert.True(res.Ok); ++ Assert.NotNull(received); ++ Assert.Equal("user requested", received!.Reason); ++ } ++ ++ [Fact] ++ public async Task Start_ReturnsError_WhenHandlerMissing() ++ { ++ var cap = new VoiceCapability(NullLogger.Instance); ++ var res = await cap.ExecuteAsync(new NodeInvokeRequest ++ { ++ Id = "voice7", ++ Command = VoiceCommands.Start, ++ Args = Parse("""{}""") ++ }); ++ ++ Assert.False(res.Ok); ++ Assert.Contains("not available", res.Error, StringComparison.OrdinalIgnoreCase); ++ } ++ ++ [Fact] ++ public async Task LegacyVoiceSkipCommand_RemainsAccepted() ++ { ++ var cap = new VoiceCapability(NullLogger.Instance); ++ VoiceSkipArgs? 
received = null; ++ cap.SkipRequested += args => ++ { ++ received = args; ++ return Task.FromResult(new VoiceStatusInfo ++ { ++ Available = true, ++ Running = true, ++ Mode = VoiceActivationMode.TalkMode, ++ State = VoiceRuntimeState.PlayingResponse ++ }); ++ }; ++ ++ var res = await cap.ExecuteAsync(new NodeInvokeRequest ++ { ++ Id = "voice8", ++ Command = "voice.skip", ++ Args = Parse("""{"reason":"legacy caller"}""") ++ }); ++ ++ Assert.True(res.Ok); ++ Assert.NotNull(received); ++ Assert.Equal("legacy caller", received!.Reason); ++ } ++} +diff --git a/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs b/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs +index 424182d..87f162b 100644 +--- a/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs ++++ b/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs +@@ -1,6 +1,8 @@ + using System; ++using System.Collections; + using System.Collections.Generic; + using System.Linq; ++using System.Reflection; + using System.Text.Json; + using Xunit; + using OpenClaw.Shared; +@@ -66,6 +68,54 @@ public SessionInfo[] GetSessionList() + return _client.GetSessionList(); + } + ++ public string GetDefaultChatSessionKey() ++ { ++ return GetPrivateField("_defaultChatSessionKey"); ++ } ++ ++ public void UpdateDefaultChatSessionKeyFromHello(string payloadJson) ++ { ++ using var doc = JsonDocument.Parse(payloadJson); ++ var method = typeof(OpenClawGatewayClient).GetMethod( ++ "UpdateDefaultChatSessionKeyFromHello", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); ++ method!.Invoke(_client, new object[] { doc.RootElement.Clone() }); ++ } ++ ++ public string SerializeChatSendRequest(string message, string sessionKey, string idempotencyKey) ++ { ++ var parametersMethod = typeof(OpenClawGatewayClient).GetMethod( ++ "BuildChatSendParameters", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); ++ var parameters = parametersMethod!.Invoke(_client, 
new object[] { message, sessionKey, idempotencyKey }); ++ ++ var serializeMethod = typeof(OpenClawGatewayClient).GetMethod( ++ "SerializeRequest", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); ++ return (string)serializeMethod!.Invoke(null, new object[] { "request-123", "chat.send", parameters! })!; ++ } ++ ++ public string SerializeConnectRequest() ++ { ++ var parametersMethod = typeof(OpenClawGatewayClient).GetMethod( ++ "BuildConnectParameters", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); ++ var parameters = parametersMethod!.Invoke(_client, Array.Empty()); ++ ++ var serializeMethod = typeof(OpenClawGatewayClient).GetMethod( ++ "SerializeRequest", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); ++ return (string)serializeMethod!.Invoke(null, new object[] { "request-456", "connect", parameters! })!; ++ } ++ ++ public string NormalizeChatSessionKey(string? sessionKey) ++ { ++ var method = typeof(OpenClawGatewayClient).GetMethod( ++ "NormalizeChatSessionKey", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); ++ return (string)method!.Invoke(null, new object?[] { sessionKey })!; ++ } ++ + public void SetUnsupportedMethodFlags(bool usageStatus, bool usageCost, bool sessionPreview, bool nodeList) + { + SetPrivateField("_usageStatusUnsupported", usageStatus); +@@ -122,6 +172,70 @@ public SessionsPreviewPayloadInfo ParseSessionsPreviewPayload(string payloadJson + return parsed ?? new SessionsPreviewPayloadInfo(); + } + ++ public ChatMessageEventArgs? HandleChatEventAndCaptureMessage(string payloadJson) ++ { ++ ChatMessageEventArgs? 
captured = null; ++ EventHandler handler = (_, args) => captured = args; ++ _client.ChatMessageReceived += handler; ++ ++ try ++ { ++ using var doc = JsonDocument.Parse(payloadJson); ++ var method = typeof(OpenClawGatewayClient).GetMethod( ++ "HandleChatEvent", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); ++ method!.Invoke(_client, new object[] { doc.RootElement.Clone() }); ++ } ++ finally ++ { ++ _client.ChatMessageReceived -= handler; ++ } ++ ++ return captured; ++ } ++ ++ public int GetPendingChatPreviewSessionCount() ++ { ++ var pending = GetPrivateField("_pendingChatPreviewSessionKeys"); ++ return pending.Count; ++ } ++ ++ public void AddPendingChatPreviewSession(string sessionKey, string? lastKnownAssistantText = null, int attemptCount = 0) ++ { ++ var pending = GetPrivateField("_pendingChatPreviewSessionKeys"); ++ var stateType = typeof(OpenClawGatewayClient).GetNestedType( ++ "PendingChatPreviewState", ++ BindingFlags.NonPublic)!; ++ var state = Activator.CreateInstance(stateType)!; ++ stateType.GetProperty("LastKnownAssistantText")!.SetValue(state, lastKnownAssistantText); ++ stateType.GetProperty("AttemptCount")!.SetValue(state, attemptCount); ++ pending[sessionKey] = state; ++ } ++ ++ public void SetLastAssistantMessage(string sessionKey, string text) ++ { ++ var lastMessages = GetPrivateField("_lastAssistantMessagesBySession"); ++ lastMessages[sessionKey] = text; ++ } ++ ++ public ChatMessageEventArgs? ParseSessionsPreviewPayloadAndCaptureMessage(string payloadJson) ++ { ++ ChatMessageEventArgs? 
captured = null; ++ EventHandler handler = (_, args) => captured = args; ++ _client.ChatMessageReceived += handler; ++ ++ try ++ { ++ InvokePrivatePayloadParser("ParseSessionsPreview", payloadJson); ++ } ++ finally ++ { ++ _client.ChatMessageReceived -= handler; ++ } ++ ++ return captured; ++ } ++ + public GatewayNodeInfo[] ParseNodeListPayload(string payloadJson) + { + GatewayNodeInfo[] parsed = Array.Empty(); +@@ -670,4 +784,162 @@ public void ParseChannelHealth_StatusField_TakesPriorityOverDerivedStatus() + Assert.Single(channels); + Assert.Equal("degraded", channels[0].Status); + } ++ ++ [Fact] ++ public void UpdateDefaultChatSessionKeyFromHello_UsesSnapshotMainSessionKey() ++ { ++ var helper = new GatewayClientTestHelper(); ++ ++ helper.UpdateDefaultChatSessionKeyFromHello(""" ++ { ++ "type": "hello-ok", ++ "snapshot": { ++ "sessionDefaults": { ++ "mainSessionKey": "agent:main:main" ++ } ++ } ++ } ++ """); ++ ++ Assert.Equal("main", helper.GetDefaultChatSessionKey()); ++ } ++ ++ [Fact] ++ public void ParseSessions_MainSession_UpdatesDefaultChatSessionKey() ++ { ++ var helper = new GatewayClientTestHelper(); ++ ++ helper.ParseSessionsPayload(""" ++ { ++ "agent:main:main": { ++ "status": "active", ++ "displayName": "Main", ++ "isMain": true ++ }, ++ "agent:other:test": { ++ "status": "active" ++ } ++ } ++ """); ++ ++ Assert.Equal("main", helper.GetDefaultChatSessionKey()); ++ } ++ ++ [Fact] ++ public void SerializeChatSendRequest_IncludesSessionKeyAndIdempotencyKey() ++ { ++ var helper = new GatewayClientTestHelper(); ++ ++ var json = helper.SerializeChatSendRequest("hello", "main", "idem-123"); ++ using var doc = JsonDocument.Parse(json); ++ var parameters = doc.RootElement.GetProperty("params"); ++ ++ Assert.Equal("hello", parameters.GetProperty("message").GetString()); ++ Assert.Equal("main", parameters.GetProperty("sessionKey").GetString()); ++ Assert.Equal("idem-123", parameters.GetProperty("idempotencyKey").GetString()); ++ } ++ ++ [Fact] ++ public void 
NormalizeChatSessionKey_CollapsesExpandedMainKey() ++ { ++ var helper = new GatewayClientTestHelper(); ++ ++ Assert.Equal("main", helper.NormalizeChatSessionKey("agent:main:main")); ++ Assert.Equal("main", helper.NormalizeChatSessionKey("main")); ++ Assert.Equal("agent:sub:test", helper.NormalizeChatSessionKey("agent:sub:test")); ++ } ++ ++ [Fact] ++ public void HandleChatEvent_FinalWithoutMessage_QueuesPreviewLookup() ++ { ++ var helper = new GatewayClientTestHelper(); ++ ++ var captured = helper.HandleChatEventAndCaptureMessage(""" ++ { ++ "type": "event", ++ "event": "chat", ++ "payload": { ++ "sessionKey": "agent:main:main", ++ "state": "final" ++ } ++ } ++ """); ++ ++ Assert.Null(captured); ++ Assert.Equal(1, helper.GetPendingChatPreviewSessionCount()); ++ } ++ ++ [Fact] ++ public void ParseSessionsPreview_EmitsAssistantMessage_ForQueuedFinalPreview() ++ { ++ var helper = new GatewayClientTestHelper(); ++ helper.AddPendingChatPreviewSession("main"); ++ ++ var captured = helper.ParseSessionsPreviewPayloadAndCaptureMessage(""" ++ { ++ "ts": 1739760000000, ++ "previews": [ ++ { ++ "key": "agent:main:main", ++ "status": "ok", ++ "items": [ ++ { "role": "user", "text": "hello" }, ++ { "role": "assistant", "text": "world" } ++ ] ++ } ++ ] ++ } ++ """); ++ ++ Assert.NotNull(captured); ++ Assert.Equal("main", captured!.SessionKey); ++ Assert.Equal("assistant", captured.Role); ++ Assert.Equal("world", captured.Message); ++ Assert.True(captured.IsFinal); ++ Assert.Equal(0, helper.GetPendingChatPreviewSessionCount()); ++ } ++ ++ [Fact] ++ public void ParseSessionsPreview_DoesNotEmitStaleAssistantMessage_ForQueuedFinalPreview() ++ { ++ var helper = new GatewayClientTestHelper(); ++ helper.SetUnsupportedMethodFlags(usageStatus: false, usageCost: false, sessionPreview: true, nodeList: false); ++ helper.SetLastAssistantMessage("main", "world"); ++ helper.AddPendingChatPreviewSession("main", lastKnownAssistantText: "world"); ++ ++ var captured = 
helper.ParseSessionsPreviewPayloadAndCaptureMessage(""" ++ { ++ "ts": 1739760000000, ++ "previews": [ ++ { ++ "key": "agent:main:main", ++ "status": "ok", ++ "items": [ ++ { "role": "user", "text": "hello again" }, ++ { "role": "assistant", "text": "world" } ++ ] ++ } ++ ] ++ } ++ """); ++ ++ Assert.Null(captured); ++ Assert.Equal(1, helper.GetPendingChatPreviewSessionCount()); ++ } ++ ++ [Fact] ++ public void SerializeConnectRequest_UsesCliClientModeAndOperatorScopes() ++ { ++ var helper = new GatewayClientTestHelper(); ++ ++ var json = helper.SerializeConnectRequest(); ++ using var doc = JsonDocument.Parse(json); ++ var parameters = doc.RootElement.GetProperty("params"); ++ var client = parameters.GetProperty("client"); ++ var scopes = parameters.GetProperty("scopes").EnumerateArray().Select(item => item.GetString()).ToArray(); ++ ++ Assert.Equal("cli", client.GetProperty("mode").GetString()); ++ Assert.Contains("operator.read", scopes); ++ Assert.Contains("operator.write", scopes); ++ } + } +diff --git a/tests/OpenClaw.Shared.Tests/VoiceModeSchemaTests.cs b/tests/OpenClaw.Shared.Tests/VoiceModeSchemaTests.cs +new file mode 100644 +index 0000000..2f3323e +--- /dev/null ++++ b/tests/OpenClaw.Shared.Tests/VoiceModeSchemaTests.cs +@@ -0,0 +1,141 @@ ++using OpenClaw.Shared; ++using System.Text.Json; ++ ++namespace OpenClaw.Shared.Tests; ++ ++public class VoiceCommandsTests ++{ ++ [Fact] ++ public void All_ContainsExpectedCommandsInStableOrder() ++ { ++ Assert.Equal( ++ [ ++ "voice.devices.list", ++ "voice.settings.get", ++ "voice.settings.set", ++ "voice.status.get", ++ "voice.start", ++ "voice.stop", ++ "voice.pause", ++ "voice.resume", ++ "voice.response.skip" ++ ], ++ VoiceCommands.All); ++ } ++} ++ ++public class VoiceSchemaDefaultsTests ++{ ++ [Fact] ++ public void VoiceSettings_Defaults_AreConcreteAndProviderAgnostic() ++ { ++ var settings = new VoiceSettings(); ++ ++ Assert.False(settings.Enabled); ++ Assert.Equal(VoiceActivationMode.Off, settings.Mode); ++ 
Assert.False(settings.ShowConversationToasts); ++ Assert.True(settings.StripInjectedMemoriesInChat); ++ Assert.Equal(VoiceProviderIds.Windows, settings.SpeechToTextProviderId); ++ Assert.Equal(VoiceProviderIds.Windows, settings.TextToSpeechProviderId); ++ Assert.Equal(16000, settings.SampleRateHz); ++ Assert.Equal(80, settings.CaptureChunkMs); ++ Assert.True(settings.BargeInEnabled); ++ Assert.Equal("NanoWakeWord", settings.VoiceWake.Engine); ++ Assert.Equal("hey_openclaw", settings.VoiceWake.ModelId); ++ Assert.Equal(0.65f, settings.VoiceWake.TriggerThreshold); ++ Assert.Equal(250, settings.TalkMode.MinSpeechMs); ++ } ++ ++ [Fact] ++ public void VoiceStatusInfo_Defaults_ToStopped() ++ { ++ var status = new VoiceStatusInfo(); ++ ++ Assert.False(status.Available); ++ Assert.False(status.Running); ++ Assert.Equal(VoiceActivationMode.Off, status.Mode); ++ Assert.Equal(VoiceRuntimeState.Stopped, status.State); ++ Assert.False(status.VoiceWakeLoaded); ++ Assert.Equal(0, status.PendingReplyCount); ++ Assert.False(status.CanSkipReply); ++ Assert.Null(status.CurrentReplyPreview); ++ Assert.Null(status.LastError); ++ } ++ ++ [Fact] ++ public void VoiceEnums_Serialize_AsStrings() ++ { ++ var json = JsonSerializer.Serialize(new VoiceStartArgs ++ { ++ Mode = VoiceActivationMode.VoiceWake ++ }); ++ ++ Assert.Contains("\"VoiceWake\"", json); ++ } ++ ++ [Fact] ++ public void VoiceProviderCatalog_Defaults_ToEmptyLists() ++ { ++ var catalog = new VoiceProviderCatalog(); ++ ++ Assert.Empty(catalog.SpeechToTextProviders); ++ Assert.Empty(catalog.TextToSpeechProviders); ++ } ++ ++ [Fact] ++ public void VoiceProviderIds_ExposeRequiredBuiltInProviders() ++ { ++ Assert.Equal("windows", VoiceProviderIds.Windows); ++ Assert.Equal("foundry-local", VoiceProviderIds.FoundryLocal); ++ Assert.Equal("openai-whisper", VoiceProviderIds.OpenAiWhisper); ++ Assert.Equal("elevenlabs-stt", VoiceProviderIds.ElevenLabsSpeechToText); ++ Assert.Equal("azure-ai-speech", VoiceProviderIds.AzureAiSpeech); ++ 
Assert.Equal("sherpa-onnx", VoiceProviderIds.SherpaOnnx); ++ Assert.Equal("minimax", VoiceProviderIds.MiniMax); ++ Assert.Equal("elevenlabs", VoiceProviderIds.ElevenLabs); ++ Assert.Equal("endpoint", VoiceProviderSettingKeys.Endpoint); ++ Assert.Equal("modelPath", VoiceProviderSettingKeys.ModelPath); ++ Assert.Equal("voiceSettingsJson", VoiceProviderSettingKeys.VoiceSettingsJson); ++ } ++ ++ [Fact] ++ public void VoiceProviderOption_Defaults_ToVisibleAndSelectable() ++ { ++ var option = new VoiceProviderOption { Name = "Provider" }; ++ ++ Assert.True(option.VisibleInSettings); ++ Assert.True(option.Selectable); ++ Assert.Equal("Provider", option.DisplayName); ++ Assert.Equal(1.0, option.DisplayOpacity); ++ } ++ ++ [Fact] ++ public void VoiceProviderConfigurationStore_Defaults_ToEmptyProviders() ++ { ++ var configuration = new VoiceProviderConfigurationStore(); ++ ++ Assert.Empty(configuration.Providers); ++ } ++ ++ [Fact] ++ public void VoiceProviderConfigurationStore_MigratesLegacyProviderCredentials() ++ { ++ var configuration = new VoiceProviderConfigurationStore(); ++ configuration.MigrateLegacyCredentials(new VoiceProviderCredentials ++ { ++ MiniMaxApiKey = "minimax-key", ++ MiniMaxModel = "speech-2.8-turbo", ++ MiniMaxVoiceId = "English_MatureBoss", ++ ElevenLabsApiKey = "eleven-key", ++ ElevenLabsModel = "eleven_multilingual_v2", ++ ElevenLabsVoiceId = "voice-42" ++ }); ++ ++ Assert.Equal("minimax-key", configuration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.ApiKey)); ++ Assert.Equal("speech-2.8-turbo", configuration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.Model)); ++ Assert.Equal("English_MatureBoss", configuration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceId)); ++ Assert.Equal("eleven-key", configuration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.ApiKey)); ++ Assert.Equal("eleven_multilingual_v2", configuration.GetValue(VoiceProviderIds.ElevenLabs, 
VoiceProviderSettingKeys.Model)); ++ Assert.Equal("voice-42", configuration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.VoiceId)); ++ } ++} +diff --git a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj +index f795ca7..cb7fa46 100644 +--- a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj ++++ b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj +@@ -1,7 +1,9 @@ + + + +- net10.0 ++ net10.0-windows10.0.19041.0 ++ win-x64 ++ x64 + enable + enable + false +@@ -19,6 +21,7 @@ + + + ++ + + + +diff --git a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +index 8b09519..2b4f5a1 100644 +--- a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs ++++ b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +@@ -1,3 +1,4 @@ ++using System.Collections.Generic; + using System.Text.Json; + using OpenClaw.Shared; + +@@ -28,6 +29,62 @@ public void RoundTrip_AllFields_Preserved() + HasSeenActivityStreamTip = true, + NotifyChatResponses = false, + PreferStructuredCategories = true, ++ Voice = new VoiceSettings ++ { ++ Enabled = true, ++ Mode = VoiceActivationMode.VoiceWake, ++ ShowConversationToasts = true, ++ StripInjectedMemoriesInChat = false, ++ SpeechToTextProviderId = "windows", ++ TextToSpeechProviderId = "elevenlabs", ++ InputDeviceId = "mic-1", ++ OutputDeviceId = "spk-2", ++ SampleRateHz = 16000, ++ CaptureChunkMs = 80, ++ BargeInEnabled = false, ++ VoiceWake = new VoiceWakeSettings ++ { ++ Engine = "NanoWakeWord", ++ ModelId = "hey_openclaw", ++ TriggerThreshold = 0.72f, ++ TriggerCooldownMs = 2500, ++ PreRollMs = 1400, ++ EndSilenceMs = 1000 ++ }, ++ TalkMode = new TalkModeSettings ++ { ++ MinSpeechMs = 300, ++ EndSilenceMs = 1100, ++ MaxUtteranceMs = 18000 ++ } ++ }, ++ VoiceProviderConfiguration = new VoiceProviderConfigurationStore ++ { ++ Providers = ++ [ ++ new VoiceProviderConfiguration ++ { ++ ProviderId = 
VoiceProviderIds.MiniMax, ++ Values = new Dictionary ++ { ++ [VoiceProviderSettingKeys.ApiKey] = "minimax-key", ++ [VoiceProviderSettingKeys.Model] = "speech-2.8-turbo", ++ [VoiceProviderSettingKeys.VoiceId] = "English_MatureBoss", ++ [VoiceProviderSettingKeys.VoiceSettingsJson] = "{\"voice_id\":\"English_MatureBoss\",\"speed\":1.1}" ++ } ++ }, ++ new VoiceProviderConfiguration ++ { ++ ProviderId = VoiceProviderIds.ElevenLabs, ++ Values = new Dictionary ++ { ++ [VoiceProviderSettingKeys.ApiKey] = "eleven-key", ++ [VoiceProviderSettingKeys.Model] = "eleven_multilingual_v2", ++ [VoiceProviderSettingKeys.VoiceId] = "voice-42" ++ } ++ } ++ ] ++ }, + UserRules = new List + { + new() { Pattern = "build.*fail", IsRegex = true, Category = "urgent", Enabled = true } +@@ -56,6 +113,27 @@ public void RoundTrip_AllFields_Preserved() + Assert.Equal(original.HasSeenActivityStreamTip, restored.HasSeenActivityStreamTip); + Assert.Equal(original.NotifyChatResponses, restored.NotifyChatResponses); + Assert.Equal(original.PreferStructuredCategories, restored.PreferStructuredCategories); ++ Assert.NotNull(restored.Voice); ++ Assert.True(restored.Voice.Enabled); ++ Assert.Equal(VoiceActivationMode.VoiceWake, restored.Voice.Mode); ++ Assert.True(restored.Voice.ShowConversationToasts); ++ Assert.False(restored.Voice.StripInjectedMemoriesInChat); ++ Assert.Equal("windows", restored.Voice.SpeechToTextProviderId); ++ Assert.Equal("elevenlabs", restored.Voice.TextToSpeechProviderId); ++ Assert.Equal("mic-1", restored.Voice.InputDeviceId); ++ Assert.Equal("spk-2", restored.Voice.OutputDeviceId); ++ Assert.Equal("NanoWakeWord", restored.Voice.VoiceWake.Engine); ++ Assert.Equal("hey_openclaw", restored.Voice.VoiceWake.ModelId); ++ Assert.Equal(0.72f, restored.Voice.VoiceWake.TriggerThreshold); ++ Assert.Equal(300, restored.Voice.TalkMode.MinSpeechMs); ++ Assert.NotNull(restored.VoiceProviderConfiguration); ++ Assert.Equal("minimax-key", 
restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.ApiKey)); ++ Assert.Equal("speech-2.8-turbo", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.Model)); ++ Assert.Equal("English_MatureBoss", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceId)); ++ Assert.Equal("{\"voice_id\":\"English_MatureBoss\",\"speed\":1.1}", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceSettingsJson)); ++ Assert.Equal("eleven-key", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.ApiKey)); ++ Assert.Equal("eleven_multilingual_v2", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.Model)); ++ Assert.Equal("voice-42", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.VoiceId)); + Assert.NotNull(restored.UserRules); + Assert.Single(restored.UserRules); + Assert.Equal("build.*fail", restored.UserRules[0].Pattern); +@@ -101,9 +179,42 @@ public void MissingFields_UseDefaults() + Assert.False(settings.HasSeenActivityStreamTip); + Assert.True(settings.NotifyChatResponses); + Assert.True(settings.PreferStructuredCategories); ++ Assert.NotNull(settings.Voice); ++ Assert.False(settings.Voice.Enabled); ++ Assert.Equal(VoiceActivationMode.Off, settings.Voice.Mode); ++ Assert.False(settings.Voice.ShowConversationToasts); ++ Assert.True(settings.Voice.StripInjectedMemoriesInChat); ++ Assert.Equal(VoiceProviderIds.Windows, settings.Voice.SpeechToTextProviderId); ++ Assert.Equal(VoiceProviderIds.Windows, settings.Voice.TextToSpeechProviderId); ++ Assert.NotNull(settings.VoiceProviderConfiguration); ++ Assert.Empty(settings.VoiceProviderConfiguration.Providers); ++ Assert.Equal(16000, settings.Voice.SampleRateHz); ++ Assert.Equal("NanoWakeWord", 
settings.Voice.VoiceWake.Engine); + Assert.Null(settings.UserRules); + } + ++ [Fact] ++ public void LegacyVoiceProviderCredentials_Deserialize_ForMigration() ++ { ++ var json = """ ++ { ++ "VoiceProviderCredentials": { ++ "MiniMaxApiKey": "minimax-key", ++ "MiniMaxModel": "speech-2.8-turbo", ++ "MiniMaxVoiceId": "English_MatureBoss" ++ } ++ } ++ """; ++ ++ var settings = SettingsData.FromJson(json); ++ ++ Assert.NotNull(settings); ++ Assert.NotNull(settings.VoiceProviderCredentials); ++ Assert.Equal("minimax-key", settings.VoiceProviderCredentials.MiniMaxApiKey); ++ Assert.Equal("speech-2.8-turbo", settings.VoiceProviderCredentials.MiniMaxModel); ++ Assert.Equal("English_MatureBoss", settings.VoiceProviderCredentials.MiniMaxVoiceId); ++ } ++ + [Fact] + public void BackwardCompatibility_OldSettingsWithoutNewFields() + { +@@ -137,6 +248,13 @@ public void BackwardCompatibility_OldSettingsWithoutNewFields() + Assert.False(settings.EnableNodeMode); + Assert.False(settings.HasSeenActivityStreamTip); + Assert.True(settings.GlobalHotkeyEnabled); ++ Assert.NotNull(settings.Voice); ++ Assert.False(settings.Voice.Enabled); ++ Assert.Equal(VoiceActivationMode.Off, settings.Voice.Mode); ++ Assert.False(settings.Voice.ShowConversationToasts); ++ Assert.True(settings.Voice.StripInjectedMemoriesInChat); ++ Assert.Equal(VoiceProviderIds.Windows, settings.Voice.SpeechToTextProviderId); ++ Assert.Equal(VoiceProviderIds.Windows, settings.Voice.TextToSpeechProviderId); + Assert.Null(settings.UserRules); + } + +diff --git a/tests/OpenClaw.Tray.Tests/VoiceChatCoordinatorTests.cs b/tests/OpenClaw.Tray.Tests/VoiceChatCoordinatorTests.cs +new file mode 100644 +index 0000000..379991d +--- /dev/null ++++ b/tests/OpenClaw.Tray.Tests/VoiceChatCoordinatorTests.cs +@@ -0,0 +1,221 @@ ++using OpenClaw.Shared; ++using OpenClawTray.Services.Voice; ++ ++namespace OpenClaw.Tray.Tests; ++ ++public class VoiceChatCoordinatorTests ++{ ++ [Fact] ++ public async Task AttachWindow_ReplaysBufferedDraft() ++ { 
++ var runtime = new FakeVoiceRuntime(); ++ using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); ++ ++ runtime.RaiseDraft("hello world", "main", clear: false); ++ ++ var window = new FakeVoiceChatWindow(); ++ coordinator.AttachWindow(window); ++ await Task.Yield(); ++ ++ Assert.Equal("hello world", window.LastDraftText); ++ Assert.False(window.LastDraftClear); ++ } ++ ++ [Fact] ++ public async Task DraftClear_IsReplayedWhenWindowAttachesLater() ++ { ++ var runtime = new FakeVoiceRuntime(); ++ using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); ++ ++ runtime.RaiseDraft("temporary draft", "main", clear: false); ++ runtime.RaiseDraft(string.Empty, "main", clear: true); ++ await Task.Yield(); ++ ++ var window = new FakeVoiceChatWindow(); ++ coordinator.AttachWindow(window); ++ await Task.Yield(); ++ ++ Assert.Equal(string.Empty, window.LastDraftText); ++ Assert.True(window.LastDraftClear); ++ } ++ ++ [Fact] ++ public async Task DraftUpdates_AreIgnoredForClosedWindow() ++ { ++ var runtime = new FakeVoiceRuntime(); ++ using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); ++ var window = new FakeVoiceChatWindow { IsClosed = true }; ++ coordinator.AttachWindow(window); ++ var updateCountAfterAttach = window.UpdateCallCount; ++ ++ runtime.RaiseDraft("headless text", "main", clear: false); ++ await Task.Yield(); ++ ++ Assert.Equal(updateCountAfterAttach, window.UpdateCallCount); ++ } ++ ++ [Fact] ++ public async Task DetachWindow_StopsFurtherDraftMirroring() ++ { ++ var runtime = new FakeVoiceRuntime(); ++ using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); ++ var window = new FakeVoiceChatWindow(); ++ coordinator.AttachWindow(window); ++ ++ coordinator.DetachWindow(window); ++ runtime.RaiseDraft("after detach", "main", clear: false); ++ await Task.Yield(); ++ ++ Assert.Equal(1, window.UpdateCallCount); ++ Assert.Equal(string.Empty, 
window.LastDraftText); ++ Assert.True(window.LastDraftClear); ++ } ++ ++ [Fact] ++ public void ConversationTurn_IsForwarded() ++ { ++ var runtime = new FakeVoiceRuntime(); ++ using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); ++ VoiceConversationTurnEventArgs? received = null; ++ coordinator.ConversationTurnAvailable += (_, args) => received = args; ++ ++ runtime.RaiseConversationTurn(new VoiceConversationTurnEventArgs ++ { ++ Direction = VoiceConversationDirection.Incoming, ++ Message = "reply", ++ SessionKey = "main" ++ }); ++ ++ Assert.NotNull(received); ++ Assert.Equal("reply", received!.Message); ++ Assert.Equal(VoiceConversationDirection.Incoming, received.Direction); ++ } ++ ++ [Fact] ++ public async Task ConversationTurn_IsMirroredToAttachedWindow() ++ { ++ var runtime = new FakeVoiceRuntime(); ++ using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); ++ var window = new FakeVoiceChatWindow(); ++ coordinator.AttachWindow(window); ++ ++ runtime.RaiseConversationTurn(new VoiceConversationTurnEventArgs ++ { ++ Direction = VoiceConversationDirection.Outgoing, ++ Message = "hello from voice", ++ SessionKey = "main" ++ }); ++ await Task.Yield(); ++ ++ Assert.Equal("hello from voice", window.LastTurnMessage); ++ Assert.Equal(VoiceConversationDirection.Outgoing, window.LastTurnDirection); ++ Assert.Equal(1, window.TurnCallCount); ++ } ++ ++ [Fact] ++ public async Task AttachWindow_ReplaysBufferedConversationTurns() ++ { ++ var runtime = new FakeVoiceRuntime(); ++ using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); ++ ++ runtime.RaiseConversationTurn(new VoiceConversationTurnEventArgs ++ { ++ Direction = VoiceConversationDirection.Outgoing, ++ Message = "replay this", ++ SessionKey = "main" ++ }); ++ await Task.Yield(); ++ ++ var window = new FakeVoiceChatWindow(); ++ coordinator.AttachWindow(window); ++ await Task.Yield(); ++ ++ Assert.Equal("replay this", 
window.LastTurnMessage); ++ Assert.Equal(VoiceConversationDirection.Outgoing, window.LastTurnDirection); ++ Assert.Equal(1, window.TurnCallCount); ++ } ++ ++ [Fact] ++ public async Task DraftAndTurns_AreBroadcastToAllAttachedWindows() ++ { ++ var runtime = new FakeVoiceRuntime(); ++ using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); ++ var firstWindow = new FakeVoiceChatWindow(); ++ var secondWindow = new FakeVoiceChatWindow(); ++ ++ coordinator.AttachWindow(firstWindow); ++ coordinator.AttachWindow(secondWindow); ++ ++ runtime.RaiseDraft("shared draft", "main", clear: false); ++ runtime.RaiseConversationTurn(new VoiceConversationTurnEventArgs ++ { ++ Direction = VoiceConversationDirection.Incoming, ++ Message = "shared reply", ++ SessionKey = "main" ++ }); ++ await Task.Yield(); ++ ++ Assert.Equal("shared draft", firstWindow.LastDraftText); ++ Assert.Equal("shared draft", secondWindow.LastDraftText); ++ Assert.Equal("shared reply", firstWindow.LastTurnMessage); ++ Assert.Equal("shared reply", secondWindow.LastTurnMessage); ++ } ++ ++ private sealed class ImmediateDispatcher : IUiDispatcher ++ { ++ public bool TryEnqueue(Action callback) ++ { ++ callback(); ++ return true; ++ } ++ } ++ ++ private sealed class FakeVoiceRuntime : IVoiceRuntime ++ { ++ public event EventHandler? ConversationTurnAvailable; ++ public event EventHandler? TranscriptDraftUpdated; ++ ++ public void RaiseDraft(string text, string? sessionKey, bool clear) ++ { ++ TranscriptDraftUpdated?.Invoke(this, new VoiceTranscriptDraftEventArgs ++ { ++ Text = text, ++ SessionKey = sessionKey ?? 
"main", ++ Clear = clear ++ }); ++ } ++ ++ public void RaiseConversationTurn(VoiceConversationTurnEventArgs args) ++ { ++ ConversationTurnAvailable?.Invoke(this, args); ++ } ++ } ++ ++ private sealed class FakeVoiceChatWindow : IVoiceChatWindow ++ { ++ public bool IsClosed { get; set; } ++ ++ public string LastDraftText { get; private set; } = string.Empty; ++ public bool LastDraftClear { get; private set; } ++ public int UpdateCallCount { get; private set; } ++ public string LastTurnMessage { get; private set; } = string.Empty; ++ public VoiceConversationDirection? LastTurnDirection { get; private set; } ++ public int TurnCallCount { get; private set; } ++ ++ public Task UpdateVoiceTranscriptDraftAsync(string text, bool clear) ++ { ++ UpdateCallCount++; ++ LastDraftText = text; ++ LastDraftClear = clear; ++ return Task.CompletedTask; ++ } ++ ++ public Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args) ++ { ++ TurnCallCount++; ++ LastTurnMessage = args.Message ?? 
string.Empty; ++ LastTurnDirection = args.Direction; ++ return Task.CompletedTask; ++ } ++ } ++} +diff --git a/tests/OpenClaw.Tray.Tests/VoiceCloudTextToSpeechClientTests.cs b/tests/OpenClaw.Tray.Tests/VoiceCloudTextToSpeechClientTests.cs +new file mode 100644 +index 0000000..75cefc0 +--- /dev/null ++++ b/tests/OpenClaw.Tray.Tests/VoiceCloudTextToSpeechClientTests.cs +@@ -0,0 +1,75 @@ ++using System; ++using System.Reflection; ++using System.Threading; ++using System.Threading.Tasks; ++using OpenClaw.Shared; ++using OpenClawTray.Services.Voice; ++ ++namespace OpenClaw.Tray.Tests; ++ ++public class VoiceCloudTextToSpeechClientTests ++{ ++ [Fact] ++ public async Task SynthesizeAsync_ThrowsOperationCanceled_WhenCallerTokenIsPreCancelled() ++ { ++ var client = new VoiceCloudTextToSpeechClient(); ++ var provider = new VoiceProviderOption ++ { ++ Id = "test-ws", ++ Name = "Test WS", ++ Settings = ++ [ ++ new VoiceProviderSettingDefinition { Key = "apiKey", Secret = true } ++ ], ++ TextToSpeechWebSocket = new VoiceTextToSpeechWebSocketContract ++ { ++ EndpointTemplate = "wss://127.0.0.1:0/tts" ++ } ++ }; ++ var store = new VoiceProviderConfigurationStore(); ++ store.SetValue("test-ws", "apiKey", "test-key"); ++ ++ using var cts = new CancellationTokenSource(); ++ cts.Cancel(); ++ ++ await Assert.ThrowsAnyAsync( ++ () => client.SynthesizeAsync("hello", provider, store, cancellationToken: cts.Token)); ++ } ++ ++ [Fact] ++ public void DecodeAudioBytes_DecodesHexString() ++ { ++ var result = InvokeDecodeAudioBytes("hexJsonString", "48656c6c6f", "TestProvider"); ++ ++ Assert.Equal([72, 101, 108, 108, 111], result); // "Hello" ++ } ++ ++ [Fact] ++ public void DecodeAudioBytes_DecodesBase64String() ++ { ++ var result = InvokeDecodeAudioBytes("base64JsonString", "SGVsbG8=", "TestProvider"); ++ ++ Assert.Equal([72, 101, 108, 108, 111], result); // "Hello" ++ } ++ ++ [Fact] ++ public void DecodeAudioBytes_ThrowsForUnsupportedMode() ++ { ++ var method = GetDecodeAudioBytesMethod(); 
++ ++ var ex = Assert.Throws( ++ () => method.Invoke(null, ["unsupported", "data", "TestProvider"])); ++ ++ Assert.IsType(ex.InnerException); ++ } ++ ++ private static byte[] InvokeDecodeAudioBytes(string mode, string value, string providerName) ++ { ++ return (byte[])GetDecodeAudioBytesMethod().Invoke(null, [mode, value, providerName])!; ++ } ++ ++ private static MethodInfo GetDecodeAudioBytesMethod() => ++ typeof(VoiceCloudTextToSpeechClient).GetMethod( ++ "DecodeAudioBytes", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++} +diff --git a/tests/OpenClaw.Tray.Tests/VoiceProviderCatalogServiceTests.cs b/tests/OpenClaw.Tray.Tests/VoiceProviderCatalogServiceTests.cs +new file mode 100644 +index 0000000..f6ff8ca +--- /dev/null ++++ b/tests/OpenClaw.Tray.Tests/VoiceProviderCatalogServiceTests.cs +@@ -0,0 +1,131 @@ ++using System; ++using System.IO; ++using OpenClaw.Shared; ++using OpenClawTray.Helpers; ++using OpenClawTray.Services.Voice; ++using System.Linq; ++ ++namespace OpenClaw.Tray.Tests; ++ ++public class VoiceProviderCatalogServiceTests ++{ ++ [Fact] ++ public void GetVoiceTrayIconPath_ReturnsBundledAppIconForOff() ++ { ++ var path = IconHelper.GetVoiceTrayIconPath(VoiceTrayIconState.Off); ++ ++ Assert.Equal(IconHelper.GetAppIconPath(), path, ignoreCase: true); ++ } ++ ++ [Fact] ++ public void GetVoiceTrayIconPath_GeneratesListeningVariant() ++ { ++ var path = IconHelper.GetVoiceTrayIconPath(VoiceTrayIconState.Listening); ++ ++ Assert.True(File.Exists(path)); ++ Assert.EndsWith(".ico", path, StringComparison.OrdinalIgnoreCase); ++ Assert.NotEqual(IconHelper.GetAppIconPath(), path, StringComparer.OrdinalIgnoreCase); ++ } ++ ++ [Fact] ++ public void CatalogFilePath_ResolvesToExistingBundledAsset() ++ { ++ Assert.EndsWith("voice-providers.json", VoiceProviderCatalogService.CatalogFilePath, StringComparison.OrdinalIgnoreCase); ++ Assert.True(File.Exists(VoiceProviderCatalogService.CatalogFilePath)); ++ } ++ ++ [Fact] ++ public void 
LoadCatalog_IncludesOnlySelectableAndVisibleSpeechProviders() ++ { ++ var catalog = VoiceProviderCatalogService.LoadCatalog(); ++ ++ Assert.Contains(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.Windows); ++ Assert.Contains(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.SherpaOnnx); ++ Assert.DoesNotContain(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.FoundryLocal); ++ Assert.DoesNotContain(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.OpenAiWhisper); ++ Assert.DoesNotContain(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.ElevenLabsSpeechToText); ++ Assert.DoesNotContain(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.AzureAiSpeech); ++ Assert.Contains(catalog.TextToSpeechProviders, p => p.Id == VoiceProviderIds.Windows); ++ Assert.Contains(catalog.TextToSpeechProviders, p => p.Id == VoiceProviderIds.MiniMax); ++ Assert.Contains(catalog.TextToSpeechProviders, p => p.Id == VoiceProviderIds.ElevenLabs); ++ } ++ ++ [Fact] ++ public void SupportsSpeechToTextRuntime_ReturnsTrueOnlyForWindowsMediaRoute() ++ { ++ Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.Windows)); ++ Assert.False(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.FoundryLocal)); ++ Assert.False(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.OpenAiWhisper)); ++ Assert.False(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.ElevenLabsSpeechToText)); ++ Assert.False(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.AzureAiSpeech)); ++ Assert.False(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.SherpaOnnx)); ++ } ++ ++ [Fact] ++ public void SupportsTextToSpeechRuntime_ReturnsTrueForImplementedProviders() ++ { ++ Assert.True(VoiceProviderCatalogService.SupportsTextToSpeechRuntime(VoiceProviderIds.Windows)); ++ 
Assert.True(VoiceProviderCatalogService.SupportsTextToSpeechRuntime(VoiceProviderIds.MiniMax)); ++ Assert.True(VoiceProviderCatalogService.SupportsTextToSpeechRuntime(VoiceProviderIds.ElevenLabs)); ++ } ++ ++ [Fact] ++ public void LoadCatalog_ExposesBuiltInCloudTtsContracts() ++ { ++ var catalog = VoiceProviderCatalogService.LoadCatalog(); ++ ++ var sherpaOnnx = Assert.Single(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.SherpaOnnx); ++ Assert.Equal(VoiceProviderRuntimeIds.Embedded, sherpaOnnx.Runtime); ++ Assert.False(sherpaOnnx.Enabled); ++ Assert.True(sherpaOnnx.VisibleInSettings); ++ Assert.False(sherpaOnnx.Selectable); ++ Assert.Equal(string.Empty, sherpaOnnx.Settings.Single(s => s.Key == VoiceProviderSettingKeys.ModelPath).DefaultValue); ++ ++ var minimax = Assert.Single(catalog.TextToSpeechProviders, p => p.Id == VoiceProviderIds.MiniMax); ++ Assert.Equal("MiniMax", minimax.Name); ++ Assert.NotNull(minimax.TextToSpeechWebSocket); ++ Assert.Equal("wss://api.minimax.io/ws/v1/t2a_v2", minimax.TextToSpeechWebSocket!.EndpointTemplate); ++ Assert.Equal("Authorization", minimax.TextToSpeechWebSocket.AuthenticationHeaderName); ++ Assert.Equal(VoiceTextToSpeechResponseModes.HexJsonString, minimax.TextToSpeechWebSocket.ResponseAudioMode); ++ Assert.Contains("\"event\": \"task_start\"", minimax.TextToSpeechWebSocket.StartMessageTemplate); ++ Assert.Contains("\"event\": \"task_continue\"", minimax.TextToSpeechWebSocket.ContinueMessageTemplate); ++ var minimaxModelSetting = minimax.Settings.Single(s => s.Key == VoiceProviderSettingKeys.Model); ++ Assert.Equal("speech-2.8-turbo", minimaxModelSetting.DefaultValue); ++ Assert.Contains("speech-2.8-turbo", minimaxModelSetting.Options); ++ Assert.Contains("speech-2.5-turbo-preview", minimaxModelSetting.Options); ++ Assert.Equal("English_MatureBoss", minimax.Settings.Single(s => s.Key == VoiceProviderSettingKeys.VoiceId).DefaultValue); ++ var minimaxVoiceSettingsJson = minimax.Settings.Single(s => s.Key == 
VoiceProviderSettingKeys.VoiceSettingsJson); ++ Assert.False(minimaxVoiceSettingsJson.Required); ++ Assert.True(minimaxVoiceSettingsJson.JsonValue); ++ Assert.Contains("\"voice_setting\":", minimaxVoiceSettingsJson.Placeholder); ++ Assert.Contains("{{voiceId}}", minimaxVoiceSettingsJson.DefaultValue); ++ ++ var elevenLabs = Assert.Single(catalog.TextToSpeechProviders, p => p.Id == VoiceProviderIds.ElevenLabs); ++ Assert.Equal("ElevenLabs", elevenLabs.Name); ++ Assert.NotNull(elevenLabs.TextToSpeechWebSocket); ++ Assert.Equal( ++ "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true", ++ elevenLabs.TextToSpeechWebSocket!.EndpointTemplate); ++ Assert.Equal("xi-api-key", elevenLabs.TextToSpeechWebSocket.AuthenticationHeaderName); ++ Assert.Equal(string.Empty, elevenLabs.TextToSpeechWebSocket.AuthenticationScheme); ++ Assert.Equal(string.Empty, elevenLabs.TextToSpeechWebSocket.ConnectSuccessEventName); ++ Assert.Equal(string.Empty, elevenLabs.TextToSpeechWebSocket.StartSuccessEventName); ++ Assert.Contains("\"xi_api_key\": {{apiKey}}", elevenLabs.TextToSpeechWebSocket.StartMessageTemplate); ++ Assert.Contains("\"try_trigger_generation\": true", elevenLabs.TextToSpeechWebSocket.ContinueMessageTemplate); ++ Assert.Contains("{{textWithTrailingSpace}}", elevenLabs.TextToSpeechWebSocket.ContinueMessageTemplate); ++ Assert.Equal("{ \"text\": \"\" }", elevenLabs.TextToSpeechWebSocket.FinishMessageTemplate); ++ Assert.Equal(VoiceTextToSpeechResponseModes.Base64JsonString, elevenLabs.TextToSpeechWebSocket.ResponseAudioMode); ++ Assert.Equal("audio", elevenLabs.TextToSpeechWebSocket.ResponseAudioJsonPath); ++ Assert.Equal("isFinal", elevenLabs.TextToSpeechWebSocket.FinalFlagJsonPath); ++ var elevenLabsModelSetting = elevenLabs.Settings.Single(s => s.Key == VoiceProviderSettingKeys.Model); ++ Assert.Equal("eleven_multilingual_v2", elevenLabsModelSetting.DefaultValue); ++ 
Assert.Contains("eleven_flash_v2_5", elevenLabsModelSetting.Options); ++ Assert.Contains("eleven_turbo_v2_5", elevenLabsModelSetting.Options); ++ Assert.Equal("6aDn1KB0hjpdcocrUkmq", elevenLabs.Settings.Single(s => s.Key == VoiceProviderSettingKeys.VoiceId).DefaultValue); ++ var elevenLabsVoiceSettingsJson = elevenLabs.Settings.Single(s => s.Key == VoiceProviderSettingKeys.VoiceSettingsJson); ++ Assert.False(elevenLabsVoiceSettingsJson.Required); ++ Assert.True(elevenLabsVoiceSettingsJson.JsonValue); ++ Assert.Contains("\"voice_settings\":", elevenLabsVoiceSettingsJson.DefaultValue); ++ Assert.Contains("\"speed\": 0.9", elevenLabsVoiceSettingsJson.DefaultValue); ++ } ++} +diff --git a/tests/OpenClaw.Tray.Tests/VoiceServiceTransportTests.cs b/tests/OpenClaw.Tray.Tests/VoiceServiceTransportTests.cs +new file mode 100644 +index 0000000..3a01919 +--- /dev/null ++++ b/tests/OpenClaw.Tray.Tests/VoiceServiceTransportTests.cs +@@ -0,0 +1,419 @@ ++using System.Reflection; ++using OpenClaw.Shared; ++using OpenClawTray.Services.Voice; ++using Windows.Media.Devices; ++using Windows.Media.SpeechRecognition; ++ ++namespace OpenClaw.Tray.Tests; ++ ++public class VoiceServiceTransportTests ++{ ++ [Fact] ++ public void GetOrCreateTransportReadySource_ReusesExistingTaskWhileConnecting() ++ { ++ var method = GetMethod(); ++ var existing = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); ++ var arguments = new object?[] { ConnectionStatus.Connecting, existing, null }; ++ ++ var result = (TaskCompletionSource)method.Invoke(null, arguments)!; ++ ++ Assert.Same(existing, result); ++ Assert.False((bool)arguments[2]!); ++ } ++ ++ [Fact] ++ public void GetOrCreateTransportReadySource_CreatesFreshTaskWhenDisconnected() ++ { ++ var method = GetMethod(); ++ var existing = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); ++ var arguments = new object?[] { ConnectionStatus.Disconnected, existing, null }; ++ ++ var result = 
(TaskCompletionSource)method.Invoke(null, arguments)!; ++ ++ Assert.NotSame(existing, result); ++ Assert.True((bool)arguments[2]!); ++ } ++ ++ [Fact] ++ public void GetOrCreateTransportReadySource_CreatesFreshTaskAfterError() ++ { ++ var method = GetMethod(); ++ var existing = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); ++ var arguments = new object?[] { ConnectionStatus.Error, existing, null }; ++ ++ var result = (TaskCompletionSource)method.Invoke(null, arguments)!; ++ ++ Assert.NotSame(existing, result); ++ Assert.True((bool)arguments[2]!); ++ } ++ ++ [Fact] ++ public void UsesCloudTextToSpeechRuntime_ReturnsTrueForWebSocketProviders() ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "UsesCloudTextToSpeechRuntime", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ ++ var provider = new VoiceProviderOption ++ { ++ Id = VoiceProviderIds.MiniMax, ++ TextToSpeechWebSocket = new VoiceTextToSpeechWebSocketContract ++ { ++ EndpointTemplate = "wss://example.test/tts" ++ } ++ }; ++ ++ var result = (bool)method.Invoke(null, [provider])!; ++ ++ Assert.True(result); ++ } ++ ++ [Theory] ++ [InlineData(true, false, 0, false, true)] ++ [InlineData(false, true, 0, false, true)] ++ [InlineData(false, false, 1, false, true)] ++ [InlineData(false, false, 0, true, true)] ++ [InlineData(false, false, 0, false, false)] ++ public void ShouldAcceptAssistantReply_MatchesPlaybackAndAwaitingState( ++ bool awaitingReply, ++ bool isSpeaking, ++ int queuedReplyCount, ++ bool acceptedViaLateReplyGrace, ++ bool expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "ShouldAcceptAssistantReply", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ var result = (bool)method.Invoke(null, [awaitingReply, isSpeaking, queuedReplyCount, acceptedViaLateReplyGrace])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ [Theory] ++ [InlineData(false, false, 0, "main", "main", 30, true)] ++ [InlineData(false, false, 0, "main", "main", 121, false)] ++ 
[InlineData(true, false, 0, "main", "main", 30, false)] ++ [InlineData(false, true, 0, "main", "main", 30, false)] ++ [InlineData(false, false, 1, "main", "main", 30, false)] ++ [InlineData(false, false, 0, "main", "other", 30, false)] ++ public void ShouldAcceptLateAssistantReply_OnlyMatchesBoundedGraceWindow( ++ bool awaitingReply, ++ bool isSpeaking, ++ int queuedReplyCount, ++ string lateReplySessionKey, ++ string incomingSessionKey, ++ int secondsAfterTimeout, ++ bool expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "ShouldAcceptLateAssistantReply", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ var timeoutUtc = new DateTime(2026, 3, 25, 0, 0, 0, DateTimeKind.Utc); ++ var graceUntilUtc = timeoutUtc.AddMinutes(2); ++ var result = (bool)method.Invoke( ++ null, ++ [ ++ awaitingReply, ++ isSpeaking, ++ queuedReplyCount, ++ lateReplySessionKey, ++ graceUntilUtc, ++ incomingSessionKey, ++ timeoutUtc.AddSeconds(secondsAfterTimeout) ++ ])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ [Theory] ++ [InlineData(true, false, false)] ++ [InlineData(false, true, false)] ++ [InlineData(false, false, true)] ++ public void ShouldRestartRecognitionAfterCompletion_SuppressesControlledRecycle( ++ bool restartInProgress, ++ bool awaitingReply, ++ bool expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "ShouldRestartRecognitionAfterCompletion", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ ++ var result = (bool)method.Invoke( ++ null, ++ [ ++ true, ++ VoiceActivationMode.TalkMode, ++ restartInProgress, ++ awaitingReply, ++ false ++ ])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ [Theory] ++ [InlineData(true, VoiceActivationMode.TalkMode, false, false, false, "eligible")] ++ [InlineData(true, VoiceActivationMode.VoiceWake, false, false, false, "mode=VoiceWake")] ++ [InlineData(false, VoiceActivationMode.TalkMode, false, false, false, "runtime-not-running")] ++ [InlineData(true, VoiceActivationMode.TalkMode, true, false, false, 
"controlled-restart-in-progress")] ++ [InlineData(true, VoiceActivationMode.TalkMode, false, true, false, "awaiting-reply")] ++ [InlineData(true, VoiceActivationMode.TalkMode, false, false, true, "speaking")] ++ public void DescribeRecognitionCompletionRestartDecision_ExplainsWhyRestartIsBlocked( ++ bool running, ++ VoiceActivationMode mode, ++ bool restartInProgress, ++ bool awaitingReply, ++ bool isSpeaking, ++ string expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "DescribeRecognitionCompletionRestartDecision", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ ++ var result = (string)method.Invoke( ++ null, ++ [running, mode, restartInProgress, awaitingReply, isSpeaking])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ [Theory] ++ [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, false, false, true)] ++ [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, false, false, false, false, false)] ++ [InlineData(SpeechRecognitionResultStatus.Success, false, false, false, false, false, false)] ++ [InlineData(SpeechRecognitionResultStatus.Success, false, true, false, false, false, false)] ++ [InlineData(SpeechRecognitionResultStatus.UserCanceled, true, false, false, false, false, false)] ++ [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, true, false, false, false)] ++ [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, true, false, false)] ++ [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, false, true, false)] ++ public void ShouldRebuildRecognitionAfterCompletion_RebuildsOnlyForUserCanceledWithoutActivity( ++ SpeechRecognitionResultStatus status, ++ bool sessionHadActivity, ++ bool sessionHadCaptureSignal, ++ bool restartInProgress, ++ bool awaitingReply, ++ bool isSpeaking, ++ bool expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "ShouldRebuildRecognitionAfterCompletion", ++ BindingFlags.NonPublic | 
BindingFlags.Static)!; ++ ++ var result = (bool)method.Invoke( ++ null, ++ [status, sessionHadActivity, sessionHadCaptureSignal, restartInProgress, awaitingReply, isSpeaking])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ [Theory] ++ [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, false, false, false, "capture-signal-without-recognition")] ++ [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, false, false, "user-canceled-without-activity")] ++ [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, false, false, false, false, "disabled-official-session-restart-only (status=TimeoutExceeded)")] ++ [InlineData(SpeechRecognitionResultStatus.Success, false, false, false, false, false, "disabled-official-session-restart-only (status=Success)")] ++ [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, true, true, false, false, false, "session-had-activity")] ++ [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, true, false, false, "controlled-restart-in-progress")] ++ [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, false, true, false, "awaiting-reply")] ++ [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, false, false, true, "speaking")] ++ public void DescribeRecognitionCompletionRebuildDecision_ExplainsWhyRebuildIsBlocked( ++ SpeechRecognitionResultStatus status, ++ bool sessionHadActivity, ++ bool sessionHadCaptureSignal, ++ bool restartInProgress, ++ bool awaitingReply, ++ bool isSpeaking, ++ string expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "DescribeRecognitionCompletionRebuildDecision", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ ++ var result = (string)method.Invoke( ++ null, ++ [status, sessionHadActivity, sessionHadCaptureSignal, restartInProgress, awaitingReply, isSpeaking])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ [Theory] ++ [InlineData(SpeechRecognitionResultStatus.Success, false, 
false)] ++ [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, false)] ++ [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false)] ++ [InlineData(SpeechRecognitionResultStatus.UserCanceled, true, false)] ++ [InlineData(SpeechRecognitionResultStatus.GrammarCompilationFailure, false, true)] ++ public void ShouldWarnForRecognitionCompletion_OnlyWarnsForUnexpectedStatuses( ++ SpeechRecognitionResultStatus status, ++ bool rebuildRecognizer, ++ bool expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "ShouldWarnForRecognitionCompletion", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ ++ var result = (bool)method.Invoke(null, [status, rebuildRecognizer])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ [Theory] ++ [InlineData(16000, 80, 1280)] ++ [InlineData(16000, 0, 1280)] ++ [InlineData(0, 80, 1280)] ++ [InlineData(48000, 20, 960)] ++ public void ResolveDesiredSamplesPerQuantum_UsesSpeechFriendlyDefaults( ++ int sampleRateHz, ++ int chunkMs, ++ uint expected) ++ { ++ var method = typeof(VoiceCaptureService).GetMethod( ++ "ResolveDesiredSamplesPerQuantum", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ ++ var result = (uint)method.Invoke(null, [sampleRateHz, chunkMs])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ public static IEnumerable PeakLevelCases() ++ { ++ yield return [new byte[] { 0, 0, 0, 0 }, 0f]; ++ yield return [new byte[] { 0, 0, 0, 63 }, 0.5f]; ++ yield return [new byte[] { 0, 0, 128, 63, 0, 0, 0, 191 }, 1f]; ++ } ++ ++ [Theory] ++ [MemberData(nameof(PeakLevelCases))] ++ public void ComputePeakLevel_FindsLargestAbsoluteFloatSample(byte[] data, float expected) ++ { ++ var method = typeof(VoiceCaptureService).GetMethod( ++ "ComputePeakLevel", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ ++ var result = (float)method.Invoke(null, [data])!; ++ ++ Assert.Equal(expected, result, 3); ++ } ++ ++ [Theory] ++ [InlineData("Now again testing", "again testing", 1, true, "Now again testing")] ++ 
[InlineData("again testing", "again testing", 1, false, "again testing")] ++ [InlineData("Now again testing", "again testing", 3, false, "again testing")] ++ [InlineData("This is different", "again testing", 1, false, "again testing")] ++ public void SelectRecognizedText_PromotesRecentLongerHypothesisWhenFinalLooksTruncated( ++ string hypothesis, ++ string recognized, ++ int hypothesisAgeSeconds, ++ bool expectedPromoted, ++ string expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "SelectRecognizedText", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ var now = new DateTime(2026, 3, 25, 16, 45, 30, DateTimeKind.Utc); ++ var args = new object?[] { recognized, hypothesis, now.AddSeconds(-hypothesisAgeSeconds), now, null }; ++ ++ var result = (string)method.Invoke(null, args)!; ++ ++ Assert.Equal(expected, result); ++ Assert.Equal(expectedPromoted, (bool)args[4]!); ++ } ++ ++ [Theory] ++ [InlineData(true, "Now again testing", 1, "Now again testing")] ++ [InlineData(true, "Now again testing", 3, null)] ++ [InlineData(false, "Now again testing", 1, null)] ++ [InlineData(true, "", 1, null)] ++ public void SelectCompletionFallbackText_PromotesRecentHypothesisWhenSessionHadActivity( ++ bool sessionHadActivity, ++ string hypothesis, ++ int hypothesisAgeSeconds, ++ string? 
expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "SelectCompletionFallbackText", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ var now = new DateTime(2026, 3, 25, 21, 36, 35, DateTimeKind.Utc); ++ ++ var result = (string?)method.Invoke( ++ null, ++ [sessionHadActivity, hypothesis, now.AddSeconds(-hypothesisAgeSeconds), now]); ++ ++ Assert.Equal(expected, result); ++ } ++ ++ [Theory] ++ [InlineData(false, false, false, true)] ++ [InlineData(true, false, false, false)] ++ [InlineData(false, true, false, false)] ++ [InlineData(false, false, true, false)] ++ public void ShouldClearTranscriptDraftAfterCompletion_ClearsOnlyWhenNoReplyOrFallbackInFlight( ++ bool awaitingReply, ++ bool isSpeaking, ++ bool usedFallbackTranscript, ++ bool expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "ShouldClearTranscriptDraftAfterCompletion", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ ++ var result = (bool)method.Invoke( ++ null, ++ [awaitingReply, isSpeaking, usedFallbackTranscript])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ [Theory] ++ [InlineData(true, false, false, false, true)] ++ [InlineData(false, false, false, false, false)] ++ [InlineData(true, true, false, false, false)] ++ [InlineData(true, false, true, false, false)] ++ [InlineData(true, false, false, true, false)] ++ public void ShouldRepromptAfterIncompleteRecognition_OnlyPromptsWhenSpeechWasHeardButNothingUsableSurvived( ++ bool sessionHadActivity, ++ bool awaitingReply, ++ bool isSpeaking, ++ bool usedFallbackTranscript, ++ bool expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "ShouldRepromptAfterIncompleteRecognition", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ ++ var result = (bool)method.Invoke( ++ null, ++ [sessionHadActivity, awaitingReply, isSpeaking, usedFallbackTranscript])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ [Theory] ++ [InlineData(true, VoiceActivationMode.TalkMode, null, AudioDeviceRole.Default, true)] ++ 
[InlineData(true, VoiceActivationMode.TalkMode, "", AudioDeviceRole.Default, true)] ++ [InlineData(true, VoiceActivationMode.TalkMode, "device-1", AudioDeviceRole.Default, false)] ++ [InlineData(true, VoiceActivationMode.VoiceWake, null, AudioDeviceRole.Default, false)] ++ [InlineData(false, VoiceActivationMode.TalkMode, null, AudioDeviceRole.Default, false)] ++ [InlineData(true, VoiceActivationMode.TalkMode, null, AudioDeviceRole.Communications, false)] ++ public void ShouldRefreshRecognitionForDefaultCaptureDeviceChange_OnlyRefreshesTalkModeUsingSystemDefaultMic( ++ bool running, ++ VoiceActivationMode mode, ++ string? configuredInputDeviceId, ++ AudioDeviceRole role, ++ bool expected) ++ { ++ var method = typeof(VoiceService).GetMethod( ++ "ShouldRefreshRecognitionForDefaultCaptureDeviceChange", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ ++ var result = (bool)method.Invoke(null, [running, mode, configuredInputDeviceId, role])!; ++ ++ Assert.Equal(expected, result); ++ } ++ ++ private static MethodInfo GetMethod() ++ { ++ return typeof(VoiceService).GetMethod( ++ "GetOrCreateTransportReadySource", ++ BindingFlags.NonPublic | BindingFlags.Static)!; ++ } ++} +diff --git a/tests/OpenClaw.Tray.Tests/WebChatWindowDomBridgeTests.cs b/tests/OpenClaw.Tray.Tests/WebChatWindowDomBridgeTests.cs +new file mode 100644 +index 0000000..c8797af +--- /dev/null ++++ b/tests/OpenClaw.Tray.Tests/WebChatWindowDomBridgeTests.cs +@@ -0,0 +1,36 @@ ++using OpenClawTray.Windows; ++ ++namespace OpenClaw.Tray.Tests; ++ ++public class WebChatWindowDomBridgeTests ++{ ++ [Fact] ++ public void BuildDraftScript_ClearsWhenDraftIsBlank() ++ { ++ var script = WebChatWindow.BuildDraftScript(string.Empty); ++ ++ Assert.Equal("window.__openClawTrayVoice?.clearDraft?.();", script); ++ } ++ ++ [Fact] ++ public void BuildTurnsScript_SerializesOutgoingTurns() ++ { ++ var turns = new[] ++ { ++ new WebChatWindow.VoiceConversationTurnMirror("outgoing", "hello from voice") ++ }; ++ ++ var script = 
WebChatWindow.BuildTurnsScript(turns); ++ ++ Assert.Contains("setTurns", script); ++ Assert.Contains("\"direction\":\"outgoing\"", script); ++ Assert.Contains("\"text\":\"hello from voice\"", script); ++ } ++ ++ [Fact] ++ public void VoiceIntegrationScript_AnchorsTurnsBesideComposer() ++ { ++ Assert.Contains("getTurnsAnchor", WebChatWindow.TrayVoiceIntegrationScript); ++ Assert.Contains("insertBefore(host, anchor)", WebChatWindow.TrayVoiceIntegrationScript); ++ } ++} diff --git a/merge-analysis.txt b/merge-analysis.txt new file mode 100644 index 0000000..ba0e7b8 --- /dev/null +++ b/merge-analysis.txt @@ -0,0 +1,18 @@ +a256a33ad6fa5ebe49a7f6dca52b1992703db37c +100644 98df1e8026dae834daa8d88010057cc8ce2e831b 1 src/OpenClaw.Shared/WindowsNodeClient.cs +100644 0068d68df237a6f35f31590874551a46a62e1bbb 2 src/OpenClaw.Shared/WindowsNodeClient.cs +100644 144e88d3a56273e113a28595804950286af3b053 3 src/OpenClaw.Shared/WindowsNodeClient.cs +100644 de0780f9aa39a6eb8e70382cac0b4d4602e96f97 1 src/OpenClaw.Tray.WinUI/App.xaml.cs +100644 8d0dc13e7597948da2c20f94edb7fd15de0bd5d2 2 src/OpenClaw.Tray.WinUI/App.xaml.cs +100644 76ea7d737152707e43f65f0949d221d9f24aa9a5 3 src/OpenClaw.Tray.WinUI/App.xaml.cs +100644 8e9f269d393e734bbce5aa0b749b7391018cfc2e 1 tests/OpenClaw.Shared.Tests/WindowsNodeClientTests.cs +100644 97765fd9ff56d02861a932454ce31589187c56dd 2 tests/OpenClaw.Shared.Tests/WindowsNodeClientTests.cs +100644 4fdde3a5ab3bfed959edd3bfdb8a9b4969b3ce98 3 tests/OpenClaw.Shared.Tests/WindowsNodeClientTests.cs + +Auto-merging src/OpenClaw.Shared/WebSocketClientBase.cs +Auto-merging src/OpenClaw.Shared/WindowsNodeClient.cs +CONFLICT (content): Merge conflict in src/OpenClaw.Shared/WindowsNodeClient.cs +Auto-merging src/OpenClaw.Tray.WinUI/App.xaml.cs +CONFLICT (content): Merge conflict in src/OpenClaw.Tray.WinUI/App.xaml.cs +Auto-merging tests/OpenClaw.Shared.Tests/WindowsNodeClientTests.cs +CONFLICT (content): Merge conflict in tests/OpenClaw.Shared.Tests/WindowsNodeClientTests.cs 
diff --git a/moltbot-windows-hub.slnx b/moltbot-windows-hub.slnx index 627f0f5..79eaf12 100644 --- a/moltbot-windows-hub.slnx +++ b/moltbot-windows-hub.slnx @@ -1,5 +1,6 @@ + diff --git a/pr117_diff.txt b/pr117_diff.txt new file mode 100644 index 0000000..1584f40 --- /dev/null +++ b/pr117_diff.txt @@ -0,0 +1,3148 @@ +From c263c5ce18a349379be9aa21c6bdd7de46ad087d Mon Sep 17 00:00:00 2001 +From: sytone +Date: Sat, 28 Mar 2026 15:41:57 -0700 +Subject: [PATCH 1/3] feat: add SSH tunnel configuration and management to + settings + +--- + src/OpenClaw.Shared/SettingsData.cs | 5 + + src/OpenClaw.Tray.WinUI/App.xaml.cs | 62 ++++++- + .../Services/SettingsManager.cs | 25 +++ + .../Services/SshTunnelService.cs | 165 ++++++++++++++++++ + .../Windows/SettingsWindow.xaml | 29 +++ + .../Windows/SettingsWindow.xaml.cs | 150 +++++++++++++++- + .../SettingsRoundTripTests.cs | 20 +++ + 7 files changed, 446 insertions(+), 10 deletions(-) + create mode 100644 src/OpenClaw.Tray.WinUI/Services/SshTunnelService.cs + +diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs +index 4c7b075..7a2d4b5 100644 +--- a/src/OpenClaw.Shared/SettingsData.cs ++++ b/src/OpenClaw.Shared/SettingsData.cs +@@ -9,6 +9,11 @@ public class SettingsData + { + public string? GatewayUrl { get; set; } + public string? Token { get; set; } ++ public bool UseSshTunnel { get; set; } = false; ++ public string? SshTunnelUser { get; set; } ++ public string? 
SshTunnelHost { get; set; } ++ public int SshTunnelRemotePort { get; set; } = 18789; ++ public int SshTunnelLocalPort { get; set; } = 18789; + public bool AutoStart { get; set; } + public bool GlobalHotkeyEnabled { get; set; } = true; + public bool ShowNotifications { get; set; } = true; +diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs +index de0780f..caff372 100644 +--- a/src/OpenClaw.Tray.WinUI/App.xaml.cs ++++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs +@@ -34,6 +34,7 @@ public partial class App : Application + private TrayIcon? _trayIcon; + private OpenClawGatewayClient? _gatewayClient; + private SettingsManager? _settings; ++ private SshTunnelService? _sshTunnelService; + private GlobalHotkeyService? _globalHotkey; + private System.Timers.Timer? _healthCheckTimer; + private System.Timers.Timer? _sessionPollTimer; +@@ -250,6 +251,7 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) + + // Initialize settings + _settings = new SettingsManager(); ++ _sshTunnelService = new SshTunnelService(new AppLogger()); + + // First-run check + if (string.IsNullOrWhiteSpace(_settings.Token)) +@@ -1080,11 +1082,12 @@ private void BuildTrayMenu(MenuFlyout flyout) + private void InitializeGatewayClient() + { + if (_settings == null) return; ++ if (!EnsureSshTunnelConfigured()) return; + + // Unsubscribe from old client if exists + UnsubscribeGatewayEvents(); + +- _gatewayClient = new OpenClawGatewayClient(_settings.GatewayUrl, _settings.Token, new AppLogger()); ++ _gatewayClient = new OpenClawGatewayClient(_settings.GetEffectiveGatewayUrl(), _settings.Token, new AppLogger()); + _gatewayClient.StatusChanged += OnConnectionStatusChanged; + _gatewayClient.ActivityChanged += OnActivityChanged; + _gatewayClient.NotificationReceived += OnNotificationReceived; +@@ -1121,6 +1124,7 @@ private void InitializeNodeService() + { + if (_settings == null || !_settings.EnableNodeMode) return; + if (_dispatcherQueue == null) return; ++ if 
(!EnsureSshTunnelConfigured()) return; + + try + { +@@ -1132,7 +1136,7 @@ private void InitializeNodeService() + _nodeService.PairingStatusChanged += OnPairingStatusChanged; + + // Connect to gateway as a node (separate connection from operator) +- _ = _nodeService.ConnectAsync(_settings.GatewayUrl, _settings.Token); ++ _ = _nodeService.ConnectAsync(_settings.GetEffectiveGatewayUrl(), _settings.Token); + } + catch (Exception ex) + { +@@ -1609,6 +1613,10 @@ private void OnSettingsSaved(object? sender, EventArgs e) + var oldNodeService = _nodeService; + _nodeService = null; + try { oldNodeService?.Dispose(); } catch (Exception ex) { Logger.Warn($"Node dispose error: {ex.Message}"); } ++ if (_settings?.UseSshTunnel != true) ++ { ++ _sshTunnelService?.Stop(); ++ } + + if (_settings?.EnableNodeMode == true) + { +@@ -1638,9 +1646,12 @@ private void OnSettingsSaved(object? sender, EventArgs e) + + private void ShowWebChat() + { ++ if (_settings == null) return; ++ if (!EnsureSshTunnelConfigured()) return; ++ + if (_webChatWindow == null || _webChatWindow.IsClosed) + { +- _webChatWindow = new WebChatWindow(_settings!.GatewayUrl, _settings.Token); ++ _webChatWindow = new WebChatWindow(_settings.GetEffectiveGatewayUrl(), _settings.Token); + _webChatWindow.Closed += (s, e) => _webChatWindow = null; + } + _webChatWindow.Activate(); +@@ -1770,8 +1781,9 @@ private void ShowSurfaceImprovementsTipIfNeeded() + private void OpenDashboard(string? 
path = null) + { + if (_settings == null) return; ++ if (!EnsureSshTunnelConfigured()) return; + +- var baseUrl = _settings.GatewayUrl ++ var baseUrl = _settings.GetEffectiveGatewayUrl() + .Replace("ws://", "http://") + .Replace("wss://", "https://") + .TrimEnd('/'); +@@ -2063,6 +2075,7 @@ private void ExitApplication() + // Unsubscribe and dispose gateway client + UnsubscribeGatewayEvents(); + _gatewayClient?.Dispose(); ++ _sshTunnelService?.Dispose(); + + // Dispose tray and mutex + _trayIcon?.Dispose(); +@@ -2074,6 +2087,47 @@ private void ExitApplication() + Exit(); + } + ++ private bool EnsureSshTunnelConfigured() ++ { ++ if (_settings == null) ++ { ++ return false; ++ } ++ ++ if (_settings.UseSshTunnel) ++ { ++ if (string.IsNullOrWhiteSpace(_settings.SshTunnelUser) || ++ string.IsNullOrWhiteSpace(_settings.SshTunnelHost) || ++ _settings.SshTunnelRemotePort is < 1 or > 65535 || ++ _settings.SshTunnelLocalPort is < 1 or > 65535) ++ { ++ Logger.Warn("SSH tunnel is enabled but settings are incomplete"); ++ _currentStatus = ConnectionStatus.Error; ++ UpdateTrayIcon(); ++ return false; ++ } ++ ++ try ++ { ++ _sshTunnelService ??= new SshTunnelService(new AppLogger()); ++ _sshTunnelService.EnsureStarted(_settings); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error($"Failed to start SSH tunnel: {ex.Message}"); ++ _currentStatus = ConnectionStatus.Error; ++ UpdateTrayIcon(); ++ return false; ++ } ++ } ++ else ++ { ++ _sshTunnelService?.Stop(); ++ } ++ ++ return true; ++ } ++ + #endregion + + private Microsoft.UI.Dispatching.DispatcherQueue? 
AppDispatcherQueue => +diff --git a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs +index 0c343f1..5347a6b 100644 +--- a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs ++++ b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs +@@ -19,6 +19,11 @@ public class SettingsManager + // Connection + public string GatewayUrl { get; set; } = "ws://localhost:18789"; + public string Token { get; set; } = ""; ++ public bool UseSshTunnel { get; set; } = false; ++ public string SshTunnelUser { get; set; } = ""; ++ public string SshTunnelHost { get; set; } = ""; ++ public int SshTunnelRemotePort { get; set; } = 18789; ++ public int SshTunnelLocalPort { get; set; } = 18789; + + // Startup + public bool AutoStart { get; set; } = false; +@@ -64,6 +69,11 @@ public void Load() + { + GatewayUrl = loaded.GatewayUrl ?? GatewayUrl; + Token = loaded.Token ?? Token; ++ UseSshTunnel = loaded.UseSshTunnel; ++ SshTunnelUser = loaded.SshTunnelUser ?? SshTunnelUser; ++ SshTunnelHost = loaded.SshTunnelHost ?? SshTunnelHost; ++ SshTunnelRemotePort = loaded.SshTunnelRemotePort <= 0 ? SshTunnelRemotePort : loaded.SshTunnelRemotePort; ++ SshTunnelLocalPort = loaded.SshTunnelLocalPort <= 0 ? 
SshTunnelLocalPort : loaded.SshTunnelLocalPort; + AutoStart = loaded.AutoStart; + GlobalHotkeyEnabled = loaded.GlobalHotkeyEnabled; + ShowNotifications = loaded.ShowNotifications; +@@ -101,6 +111,11 @@ public void Save() + { + GatewayUrl = GatewayUrl, + Token = Token, ++ UseSshTunnel = UseSshTunnel, ++ SshTunnelUser = SshTunnelUser, ++ SshTunnelHost = SshTunnelHost, ++ SshTunnelRemotePort = SshTunnelRemotePort, ++ SshTunnelLocalPort = SshTunnelLocalPort, + AutoStart = AutoStart, + GlobalHotkeyEnabled = GlobalHotkeyEnabled, + ShowNotifications = ShowNotifications, +@@ -130,4 +145,14 @@ public void Save() + Logger.Error($"Failed to save settings: {ex.Message}"); + } + } ++ ++ public string GetEffectiveGatewayUrl() ++ { ++ if (!UseSshTunnel) ++ { ++ return GatewayUrl; ++ } ++ ++ return $"ws://127.0.0.1:{SshTunnelLocalPort}"; ++ } + } +diff --git a/src/OpenClaw.Tray.WinUI/Services/SshTunnelService.cs b/src/OpenClaw.Tray.WinUI/Services/SshTunnelService.cs +new file mode 100644 +index 0000000..18b7764 +--- /dev/null ++++ b/src/OpenClaw.Tray.WinUI/Services/SshTunnelService.cs +@@ -0,0 +1,165 @@ ++using OpenClaw.Shared; ++using System; ++using System.Diagnostics; ++using System.Text; ++ ++namespace OpenClawTray.Services; ++ ++/// ++/// Manages an SSH local port-forward process for gateway access. ++/// ++public sealed class SshTunnelService : IDisposable ++{ ++ private readonly IOpenClawLogger _logger; ++ private Process? _process; ++ private string? 
_lastSpec; ++ ++ public SshTunnelService(IOpenClawLogger logger) ++ { ++ _logger = logger; ++ } ++ ++ public bool IsRunning => _process is { HasExited: false }; ++ ++ public void EnsureStarted(SettingsManager settings) ++ { ++ if (!settings.UseSshTunnel) ++ { ++ Stop(); ++ return; ++ } ++ ++ EnsureStarted( ++ settings.SshTunnelUser, ++ settings.SshTunnelHost, ++ settings.SshTunnelRemotePort, ++ settings.SshTunnelLocalPort); ++ } ++ ++ public void EnsureStarted(string user, string host, int remotePort, int localPort) ++ { ++ user = user.Trim(); ++ host = host.Trim(); ++ ++ var spec = BuildSpec(user, host, remotePort, localPort); ++ ++ if (IsRunning && string.Equals(_lastSpec, spec, StringComparison.Ordinal)) ++ { ++ return; ++ } ++ ++ Stop(); ++ StartProcess(user, host, remotePort, localPort); ++ _lastSpec = spec; ++ } ++ ++ public void Stop() ++ { ++ if (_process == null) ++ { ++ return; ++ } ++ ++ try ++ { ++ if (!_process.HasExited) ++ { ++ _process.Kill(entireProcessTree: true); ++ _process.WaitForExit(3000); ++ } ++ } ++ catch (Exception ex) ++ { ++ _logger.Warn($"SSH tunnel stop failed: {ex.Message}"); ++ } ++ finally ++ { ++ try { _process.Dispose(); } catch { } ++ _process = null; ++ _lastSpec = null; ++ } ++ } ++ ++ private void StartProcess(string user, string host, int remotePort, int localPort) ++ { ++ var psi = new ProcessStartInfo ++ { ++ FileName = "ssh", ++ Arguments = BuildArguments(user, host, remotePort, localPort), ++ UseShellExecute = false, ++ RedirectStandardOutput = true, ++ RedirectStandardError = true, ++ CreateNoWindow = true, ++ }; ++ ++ var process = new Process ++ { ++ StartInfo = psi, ++ EnableRaisingEvents = true, ++ }; ++ ++ process.OutputDataReceived += (_, e) => ++ { ++ if (!string.IsNullOrWhiteSpace(e.Data)) ++ { ++ _logger.Info($"[SSH] {e.Data}"); ++ } ++ }; ++ ++ process.ErrorDataReceived += (_, e) => ++ { ++ if (!string.IsNullOrWhiteSpace(e.Data)) ++ { ++ _logger.Warn($"[SSH] {e.Data}"); ++ } ++ }; ++ ++ process.Exited += (_, 
_) => ++ { ++ var exitCode = process.ExitCode; ++ _logger.Warn($"SSH tunnel exited (code {exitCode})"); ++ }; ++ ++ try ++ { ++ if (!process.Start()) ++ { ++ throw new InvalidOperationException("Failed to start ssh process"); ++ } ++ } ++ catch (Exception ex) ++ { ++ process.Dispose(); ++ throw new InvalidOperationException("Unable to start SSH tunnel process. Ensure OpenSSH client is installed and available in PATH.", ex); ++ } ++ ++ process.BeginOutputReadLine(); ++ process.BeginErrorReadLine(); ++ _process = process; ++ ++ _logger.Info($"SSH tunnel started: 127.0.0.1:{localPort} -> 127.0.0.1:{remotePort} via {user}@{host}"); ++ } ++ ++ private static string BuildSpec(string user, string host, int remotePort, int localPort) ++ => $"{user}@{host}:{localPort}:{remotePort}"; ++ ++ private static string BuildArguments(string user, string host, int remotePort, int localPort) ++ { ++ var sb = new StringBuilder(); ++ sb.Append("-N "); ++ sb.Append("-L "); ++ sb.Append(localPort); ++ sb.Append(":127.0.0.1:"); ++ sb.Append(remotePort); ++ sb.Append(' '); ++ sb.Append(user); ++ sb.Append('@'); ++ sb.Append(host); ++ return sb.ToString(); ++ } ++ ++ public void Dispose() ++ { ++ Stop(); ++ } ++} +diff --git a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml +index f8631f5..e0f15ac 100644 +--- a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml ++++ b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml +@@ -25,6 +25,35 @@ + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + +diff --git a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs +index e4224a8..2308c6f 100644 +--- a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs ++++ b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs +@@ -12,6 +12,7 @@ namespace OpenClawTray.Windows; + public sealed partial class SettingsWindow : WindowEx + { + private readonly 
SettingsManager _settings; ++ private string _manualGatewayUrl = ""; + public bool IsClosed { get; private set; } + + public event EventHandler? SettingsSaved; +@@ -37,7 +38,14 @@ public SettingsWindow(SettingsManager settings) + + private void LoadSettings() + { ++ UseSshTunnelToggle.IsOn = _settings.UseSshTunnel; ++ SshTunnelUserTextBox.Text = _settings.SshTunnelUser; ++ SshTunnelHostTextBox.Text = _settings.SshTunnelHost; ++ SshTunnelRemotePortTextBox.Text = _settings.SshTunnelRemotePort.ToString(); ++ SshTunnelLocalPortTextBox.Text = _settings.SshTunnelLocalPort.ToString(); ++ _manualGatewayUrl = _settings.GatewayUrl; + GatewayUrlTextBox.Text = _settings.GatewayUrl; ++ UpdateSshTunnelUiState(); + TokenTextBox.Text = _settings.Token; + AutoStartToggle.IsOn = _settings.AutoStart; + GlobalHotkeyToggle.IsOn = _settings.GlobalHotkeyEnabled; +@@ -72,7 +80,16 @@ private void LoadSettings() + + private void SaveSettings() + { +- _settings.GatewayUrl = GatewayUrlTextBox.Text.Trim(); ++ _settings.UseSshTunnel = UseSshTunnelToggle.IsOn; ++ _settings.SshTunnelUser = SshTunnelUserTextBox.Text.Trim(); ++ _settings.SshTunnelHost = SshTunnelHostTextBox.Text.Trim(); ++ _settings.SshTunnelRemotePort = ParsePortOrDefault(SshTunnelRemotePortTextBox.Text, _settings.SshTunnelRemotePort); ++ _settings.SshTunnelLocalPort = ParsePortOrDefault(SshTunnelLocalPortTextBox.Text, _settings.SshTunnelLocalPort); ++ if (!_settings.UseSshTunnel) ++ { ++ _settings.GatewayUrl = GatewayUrlTextBox.Text.Trim(); ++ _manualGatewayUrl = _settings.GatewayUrl; ++ } + _settings.Token = TokenTextBox.Text.Trim(); + _settings.AutoStart = AutoStartToggle.IsOn; + _settings.GlobalHotkeyEnabled = GlobalHotkeyToggle.IsOn; +@@ -101,13 +118,26 @@ private void SaveSettings() + + private async void OnTestConnection(object sender, RoutedEventArgs e) + { ++ var useSshTunnel = UseSshTunnelToggle.IsOn; ++ var sshUser = ""; ++ var sshHost = ""; ++ var remotePort = 0; ++ var localPort = 0; ++ SshTunnelService? 
testTunnel = null; ++ + var gatewayUrl = GatewayUrlTextBox.Text.Trim(); +- if (!GatewayUrlHelper.IsValidGatewayUrl(gatewayUrl)) ++ if (!useSshTunnel && !GatewayUrlHelper.IsValidGatewayUrl(gatewayUrl)) + { + StatusLabel.Text = $"❌ {GatewayUrlHelper.ValidationMessage}"; + return; + } + ++ if (useSshTunnel && !TryReadTunnelSettings(out sshUser, out sshHost, out remotePort, out localPort, out var tunnelError)) ++ { ++ StatusLabel.Text = $"❌ {tunnelError}"; ++ return; ++ } ++ + Logger.Info("[Settings] Test connection initiated"); + StatusLabel.Text = LocalizationHelper.GetString("Status_Testing"); + TestConnectionButton.IsEnabled = false; +@@ -115,8 +145,15 @@ private async void OnTestConnection(object sender, RoutedEventArgs e) + try + { + var testLogger = new TestLogger(); ++ if (useSshTunnel) ++ { ++ testTunnel = new SshTunnelService(testLogger); ++ Logger.Info($"[Settings] Starting temporary SSH tunnel for test: {sshUser}@{sshHost} local:{localPort} remote:{remotePort}"); ++ testTunnel.EnsureStarted(sshUser, sshHost, remotePort, localPort); ++ } ++ + var client = new OpenClawGatewayClient( +- gatewayUrl, ++ useSshTunnel ? 
$"ws://127.0.0.1:{localPort}" : gatewayUrl, + TokenTextBox.Text.Trim(), + testLogger); + +@@ -167,6 +204,7 @@ private async void OnTestConnection(object sender, RoutedEventArgs e) + } + finally + { ++ testTunnel?.Dispose(); + TestConnectionButton.IsEnabled = true; + } + } +@@ -188,14 +226,22 @@ private void OnTestNotification(object sender, RoutedEventArgs e) + + private void OnSave(object sender, RoutedEventArgs e) + { ++ var useSshTunnel = UseSshTunnelToggle.IsOn; + var gatewayUrl = GatewayUrlTextBox.Text.Trim(); +- if (!GatewayUrlHelper.IsValidGatewayUrl(gatewayUrl)) ++ if (!useSshTunnel && !GatewayUrlHelper.IsValidGatewayUrl(gatewayUrl)) + { + Logger.Warn($"[Settings] Save blocked — invalid gateway URL"); + StatusLabel.Text = $"❌ {GatewayUrlHelper.ValidationMessage}"; + return; + } + ++ if (useSshTunnel && !TryReadTunnelSettings(out _, out _, out _, out _, out var tunnelError)) ++ { ++ Logger.Warn("[Settings] Save blocked — invalid SSH tunnel settings"); ++ StatusLabel.Text = $"❌ {tunnelError}"; ++ return; ++ } ++ + // Log key setting changes before saving + var oldGateway = _settings.GatewayUrl; + var oldAutoStart = _settings.AutoStart; +@@ -220,6 +266,96 @@ private void OnCancel(object sender, RoutedEventArgs e) + Close(); + } + ++ private static int ParsePortOrDefault(string? value, int fallback) ++ { ++ if (int.TryParse(value?.Trim(), out var parsed) && parsed is >= 1 and <= 65535) ++ { ++ return parsed; ++ } ++ ++ return fallback; ++ } ++ ++ private bool TryReadTunnelSettings( ++ out string user, ++ out string host, ++ out int remotePort, ++ out int localPort, ++ out string? 
error) ++ { ++ user = SshTunnelUserTextBox.Text.Trim(); ++ host = SshTunnelHostTextBox.Text.Trim(); ++ remotePort = 0; ++ localPort = 0; ++ error = null; ++ ++ if (string.IsNullOrWhiteSpace(user)) ++ { ++ error = "SSH User is required when tunnel mode is enabled."; ++ return false; ++ } ++ ++ if (string.IsNullOrWhiteSpace(host)) ++ { ++ error = "SSH Host is required when tunnel mode is enabled."; ++ return false; ++ } ++ ++ if (!int.TryParse(SshTunnelRemotePortTextBox.Text.Trim(), out remotePort) || remotePort is < 1 or > 65535) ++ { ++ error = "Remote Gateway Port must be a number from 1 to 65535."; ++ return false; ++ } ++ ++ if (!int.TryParse(SshTunnelLocalPortTextBox.Text.Trim(), out localPort) || localPort is < 1 or > 65535) ++ { ++ error = "Local Forward Port must be a number from 1 to 65535."; ++ return false; ++ } ++ ++ return true; ++ } ++ ++ private void OnUseSshTunnelToggled(object sender, RoutedEventArgs e) ++ { ++ UpdateSshTunnelUiState(); ++ } ++ ++ private void OnSshTunnelLocalPortTextChanged(object sender, Microsoft.UI.Xaml.Controls.TextChangedEventArgs e) ++ { ++ if (UseSshTunnelToggle.IsOn) ++ { ++ UpdateSshTunnelUiState(); ++ } ++ } ++ ++ private void UpdateSshTunnelUiState() ++ { ++ var useSshTunnel = UseSshTunnelToggle.IsOn; ++ var wasReadOnly = GatewayUrlTextBox.IsReadOnly; ++ ++ SshTunnelDetailsPanel.Visibility = useSshTunnel ? Visibility.Visible : Visibility.Collapsed; ++ GatewayUrlTextBox.IsReadOnly = useSshTunnel; ++ ++ if (useSshTunnel) ++ { ++ if (!wasReadOnly) ++ { ++ _manualGatewayUrl = GatewayUrlTextBox.Text.Trim(); ++ } ++ ++ var localPort = ParsePortOrDefault(SshTunnelLocalPortTextBox.Text, 18789); ++ GatewayUrlTextBox.Text = $"ws://127.0.0.1:{localPort}"; ++ } ++ else ++ { ++ if (GatewayUrlTextBox.Text.StartsWith("ws://127.0.0.1:", StringComparison.OrdinalIgnoreCase)) ++ { ++ GatewayUrlTextBox.Text = _manualGatewayUrl; ++ } ++ } ++ } ++ + private class TestLogger : IOpenClawLogger + { + public string? 
LastError { get; private set; } +@@ -233,8 +369,10 @@ public void Warn(string message) + } + public void Error(string message, Exception? ex = null) + { +- LastError = message; +- Logger.Error($"[Settings:TestClient] {message}"); ++ LastError = ex != null ++ ? $"{message}: {ex.Message}" ++ : message; ++ Logger.Error($"[Settings:TestClient] {LastError}"); + } + } + } +diff --git a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +index 8b09519..887df5b 100644 +--- a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs ++++ b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +@@ -12,6 +12,11 @@ public void RoundTrip_AllFields_Preserved() + { + GatewayUrl = "ws://localhost:18789", + Token = "secret-token", ++ UseSshTunnel = true, ++ SshTunnelUser = "user1", ++ SshTunnelHost = "remote-host", ++ SshTunnelRemotePort = 18789, ++ SshTunnelLocalPort = 28789, + AutoStart = true, + GlobalHotkeyEnabled = false, + ShowNotifications = true, +@@ -40,6 +45,11 @@ public void RoundTrip_AllFields_Preserved() + Assert.NotNull(restored); + Assert.Equal(original.GatewayUrl, restored.GatewayUrl); + Assert.Equal(original.Token, restored.Token); ++ Assert.Equal(original.UseSshTunnel, restored.UseSshTunnel); ++ Assert.Equal(original.SshTunnelUser, restored.SshTunnelUser); ++ Assert.Equal(original.SshTunnelHost, restored.SshTunnelHost); ++ Assert.Equal(original.SshTunnelRemotePort, restored.SshTunnelRemotePort); ++ Assert.Equal(original.SshTunnelLocalPort, restored.SshTunnelLocalPort); + Assert.Equal(original.AutoStart, restored.AutoStart); + Assert.Equal(original.GlobalHotkeyEnabled, restored.GlobalHotkeyEnabled); + Assert.Equal(original.ShowNotifications, restored.ShowNotifications); +@@ -85,6 +95,11 @@ public void MissingFields_UseDefaults() + Assert.NotNull(settings); + Assert.Null(settings.GatewayUrl); + Assert.Null(settings.Token); ++ Assert.False(settings.UseSshTunnel); ++ Assert.Null(settings.SshTunnelUser); ++ 
Assert.Null(settings.SshTunnelHost); ++ Assert.Equal(18789, settings.SshTunnelRemotePort); ++ Assert.Equal(18789, settings.SshTunnelLocalPort); + Assert.False(settings.AutoStart); + Assert.True(settings.GlobalHotkeyEnabled); + Assert.True(settings.ShowNotifications); +@@ -131,6 +146,11 @@ public void BackwardCompatibility_OldSettingsWithoutNewFields() + Assert.NotNull(settings); + Assert.Equal("ws://localhost:18789", settings.GatewayUrl); + Assert.Equal("abc", settings.Token); ++ Assert.False(settings.UseSshTunnel); ++ Assert.Null(settings.SshTunnelUser); ++ Assert.Null(settings.SshTunnelHost); ++ Assert.Equal(18789, settings.SshTunnelRemotePort); ++ Assert.Equal(18789, settings.SshTunnelLocalPort); + // New fields should have sensible defaults + Assert.True(settings.NotifyChatResponses); + Assert.True(settings.PreferStructuredCategories); + +From 98f48c2ef96656ce622f28f92a7eb15e0359f134 Mon Sep 17 00:00:00 2001 +From: sytone +Date: Sat, 28 Mar 2026 18:22:15 -0700 +Subject: [PATCH 2/3] feat: Add SkippedUpdateTag to settings and enhance update + handling + +- Introduced SkippedUpdateTag property in SettingsData and SettingsManager to remember skipped updates. +- Updated App.xaml.cs to initialize settings before update checks and handle skipped updates. +- Enhanced QuickSendDialog to provide detailed error messages and focus handling. +- Improved WebSocketClientBase with better auto-reconnect logic and error handling. +- Added integration tests for DeviceIdentity payload formats and OpenClawGatewayClient response handling. +- Updated SettingsRoundTripTests to validate SkippedUpdateTag persistence. 
+--- + AGENTS.md | 25 + + README.md | 35 +- + build.ps1 | 7 +- + moltbot-windows-hub.slnx | 1 + + src/OpenClaw.Cli/OpenClaw.Cli.csproj | 12 + + src/OpenClaw.Cli/Program.cs | 300 +++++++++++ + src/OpenClaw.Shared/DeviceIdentity.cs | 112 +++++ + src/OpenClaw.Shared/OpenClawGatewayClient.cs | 465 +++++++++++++++++- + src/OpenClaw.Shared/SettingsData.cs | 1 + + src/OpenClaw.Shared/WebSocketClientBase.cs | 72 ++- + src/OpenClaw.Tray.WinUI/App.xaml.cs | 167 ++++++- + .../Dialogs/QuickSendDialog.cs | 205 +++++++- + .../Services/SettingsManager.cs | 3 + + .../Services/SshTunnelService.cs | 14 +- + .../DeviceIdentityTests.cs | 60 +++ + .../OpenClawGatewayClientTests.cs | 141 ++++++ + .../WebSocketClientBaseTests.cs | 4 +- + .../SettingsRoundTripTests.cs | 4 + + 18 files changed, 1553 insertions(+), 75 deletions(-) + create mode 100644 AGENTS.md + create mode 100644 src/OpenClaw.Cli/OpenClaw.Cli.csproj + create mode 100644 src/OpenClaw.Cli/Program.cs + +diff --git a/AGENTS.md b/AGENTS.md +new file mode 100644 +index 0000000..07e3ac6 +--- /dev/null ++++ b/AGENTS.md +@@ -0,0 +1,25 @@ ++# AGENTS.md ++ ++## Required Validation After Every Change ++ ++All agents working in this repository must run validation after each code change before marking work complete. ++ ++Required steps: ++ ++1. Run full repo build: ++ - `./build.ps1` ++2. Run shared tests: ++ - `dotnet test ./tests/OpenClaw.Shared.Tests/OpenClaw.Shared.Tests.csproj --no-restore` ++3. Run tray tests: ++ - `dotnet test ./tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj --no-restore` ++ ++If a command fails: ++ ++1. Fix the issue. ++2. Re-run the failed command. ++3. Re-run all required validation commands before completion. ++ ++Notes: ++ ++- If a build/test is blocked by an environmental lock (for example running executable locking output assemblies), stop/close the locking process and rerun. ++- Do not claim completion without reporting validation results. 
+diff --git a/README.md b/README.md +index b0c3e40..d74620a 100644 +--- a/README.md ++++ b/README.md +@@ -10,12 +10,13 @@ A Windows companion suite for [OpenClaw](https://openclaw.ai) - the AI-powered p + + ## Projects + +-This monorepo contains three projects: ++This monorepo contains four projects: + + | Project | Description | + |---------|-------------| + | **OpenClaw.Tray.WinUI** | System tray application (WinUI 3) for quick access to OpenClaw | + | **OpenClaw.Shared** | Shared gateway client library | ++| **OpenClaw.Cli** | CLI validator for WebSocket connect/send/probe using tray settings | + | **OpenClaw.CommandPalette** | PowerToys Command Palette extension | + + ## 🚀 Quick Start +@@ -65,6 +66,24 @@ dotnet build src/OpenClaw.Tray.WinUI -r win-x64 -p:PackageMsix=true # x64 MSI + .\src\OpenClaw.Tray.WinUI\bin\Debug\net10.0-windows10.0.19041.0\win-x64\OpenClaw.Tray.WinUI.exe # x64 + ``` + ++### Run CLI WebSocket Validator ++ ++Use the CLI to validate gateway connectivity and `chat.send` outside the tray UI. ++ ++```powershell ++# Show help ++dotnet run --project src/OpenClaw.Cli -- --help ++ ++# Use tray settings from %APPDATA%\OpenClawTray\settings.json and send one message ++dotnet run --project src/OpenClaw.Cli -- --message "quick send validation" ++ ++# Loop sends and also probe sessions/usage/nodes APIs ++dotnet run --project src/OpenClaw.Cli -- --repeat 5 --delay-ms 1000 --probe-read --verbose ++ ++# Override gateway URL/token for isolated testing ++dotnet run --project src/OpenClaw.Cli -- --url ws://127.0.0.1:18789 --token "" --message "override test" ++``` ++ + ## 📦 OpenClaw.Tray (Molty) + + Modern Windows 11-style system tray companion that connects to your local OpenClaw gateway. 
+@@ -85,6 +104,20 @@ Modern Windows 11-style system tray companion that connects to your local OpenCl + - ⚙️ **Settings** - Full configuration dialog + - 🎯 **First-run experience** - Welcome dialog guides new users + ++#### Quick Send scope requirement ++ ++Quick Send uses the gateway `chat.send` method and requires the operator device to have `operator.write` scope. ++ ++If Quick Send fails with `missing scope: operator.write`, Molty now copies identity + remediation guidance to your clipboard, including: ++ ++- operator role and `client.id` used by the tray app ++- gateway-reported operator device id (if provided) ++- currently granted scopes (if provided) ++ ++For this specific error (`missing scope: operator.write`), the cause is an **operator token scope issue**. Update the token used by the tray app so it includes `operator.write`, then retry Quick Send. ++ ++If Quick Send fails with `pairing required` / `NOT_PAIRED`, that is a **device approval** issue. Approve the tray device in gateway pairing approvals, reconnect, and retry. ++ + ### Menu Sections + - **Status** - Gateway connection status with click-to-view details + - **Sessions** - Active agent sessions with preview and per-session controls +diff --git a/build.ps1 b/build.ps1 +index bd24b54..13cb7e8 100644 +--- a/build.ps1 ++++ b/build.ps1 +@@ -6,7 +6,7 @@ + Builds all projects, checks prerequisites, and provides clear guidance. 
+ + .PARAMETER Project +- Which project to build: All, Tray, WinUI, Shared, CommandPalette ++ Which project to build: All, Tray, WinUI, Shared, CommandPalette, Cli + Default: All + + .PARAMETER Configuration +@@ -23,7 +23,7 @@ + #> + + param( +- [ValidateSet("All", "Tray", "WinUI", "Shared", "CommandPalette")] ++ [ValidateSet("All", "Tray", "WinUI", "Shared", "CommandPalette", "Cli")] + [string]$Project = "All", + + [ValidateSet("Debug", "Release")] +@@ -187,12 +187,13 @@ function Build-Project($name, $path, $useRid = $false) { + + $projects = @{ + "Shared" = @{ Path = "src/OpenClaw.Shared/OpenClaw.Shared.csproj"; UseRid = $false } ++ "Cli" = @{ Path = "src/OpenClaw.Cli/OpenClaw.Cli.csproj"; UseRid = $false } + "Tray" = @{ Path = "src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj"; UseRid = $true } + "WinUI" = @{ Path = "src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj"; UseRid = $true } + "CommandPalette" = @{ Path = "src/OpenClaw.CommandPalette/OpenClaw.CommandPalette.csproj"; UseRid = $false } + } + +-$toBuild = if ($Project -eq "All") { @("Shared", "WinUI") } else { @($Project) } ++$toBuild = if ($Project -eq "All") { @("Shared", "Cli", "WinUI") } else { @($Project) } + + # Always build Shared first if building other projects + if ($Project -ne "Shared" -and $Project -ne "All" -and $toBuild -notcontains "Shared") { +diff --git a/moltbot-windows-hub.slnx b/moltbot-windows-hub.slnx +index 627f0f5..79eaf12 100644 +--- a/moltbot-windows-hub.slnx ++++ b/moltbot-windows-hub.slnx +@@ -1,5 +1,6 @@ + + ++ + + + +diff --git a/src/OpenClaw.Cli/OpenClaw.Cli.csproj b/src/OpenClaw.Cli/OpenClaw.Cli.csproj +new file mode 100644 +index 0000000..2eecce4 +--- /dev/null ++++ b/src/OpenClaw.Cli/OpenClaw.Cli.csproj +@@ -0,0 +1,12 @@ ++ ++ ++ Exe ++ net10.0 ++ enable ++ enable ++ ++ ++ ++ ++ ++ +diff --git a/src/OpenClaw.Cli/Program.cs b/src/OpenClaw.Cli/Program.cs +new file mode 100644 +index 0000000..7fb544c +--- /dev/null ++++ b/src/OpenClaw.Cli/Program.cs +@@ -0,0 +1,300 @@ 
++using System.Globalization; ++using System.Text; ++using OpenClaw.Shared; ++ ++internal sealed class CliOptions ++{ ++ public string SettingsPath { get; set; } = Path.Combine( ++ Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData), ++ "OpenClawTray", ++ "settings.json"); ++ ++ public string? GatewayUrlOverride { get; set; } ++ public string? TokenOverride { get; set; } ++ public string Message { get; set; } = "openclaw-cli validation ping"; ++ public int Repeat { get; set; } = 1; ++ public int DelayMs { get; set; } = 500; ++ public int ConnectTimeoutMs { get; set; } = 10000; ++ public bool ProbeReadApis { get; set; } ++ public bool Verbose { get; set; } ++} ++ ++internal static class Program ++{ ++ private static async Task Main(string[] args) ++ { ++ if (args.Any(a => a is "--help" or "-h")) ++ { ++ PrintUsage(); ++ return 0; ++ } ++ ++ CliOptions options; ++ try ++ { ++ options = ParseArgs(args); ++ } ++ catch (Exception ex) ++ { ++ Console.Error.WriteLine($"Argument error: {ex.Message}"); ++ PrintUsage(); ++ return 2; ++ } ++ ++ var (gatewayUrl, token, loaded) = LoadConnectionFromSettings(options); ++ if (string.IsNullOrWhiteSpace(gatewayUrl)) ++ { ++ Console.Error.WriteLine("Gateway URL is missing. Set it in tray settings or pass --url."); ++ return 2; ++ } ++ ++ if (string.IsNullOrWhiteSpace(token)) ++ { ++ Console.Error.WriteLine("Token is missing. Set it in tray settings or pass --token."); ++ return 2; ++ } ++ ++ Console.WriteLine($"Settings file: {options.SettingsPath}"); ++ Console.WriteLine($"Gateway URL: {GatewayUrlHelper.SanitizeForDisplay(gatewayUrl)}"); ++ Console.WriteLine($"Token source: {(options.TokenOverride is null ? "settings" : "--token override")}"); ++ if (loaded is not null) ++ { ++ Console.WriteLine($"Node mode in settings: {loaded.EnableNodeMode}"); ++ Console.WriteLine($"SSH tunnel in settings: {loaded.UseSshTunnel} (local port {loaded.SshTunnelLocalPort})"); ++ } ++ ++ IOpenClawLogger logger = options.Verbose ? 
new ConsoleLogger() : NullLogger.Instance; ++ using var client = new OpenClawGatewayClient(gatewayUrl, token, logger); ++ ++ var lastStatus = ConnectionStatus.Disconnected; ++ var connectedTcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); ++ var errorTcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); ++ ++ client.StatusChanged += (_, status) => ++ { ++ lastStatus = status; ++ Console.WriteLine($"Status: {status}"); ++ if (status == ConnectionStatus.Connected) ++ { ++ connectedTcs.TrySetResult(true); ++ } ++ else if (status == ConnectionStatus.Error) ++ { ++ errorTcs.TrySetResult(true); ++ } ++ }; ++ ++ client.SessionsUpdated += (_, sessions) => Console.WriteLine($"sessions.list -> {sessions.Length} session(s)"); ++ client.UsageUpdated += (_, usage) => Console.WriteLine($"usage -> tokens {usage.TotalTokens}, requests {usage.RequestCount}, cost ${usage.CostUsd:F4}"); ++ client.NodesUpdated += (_, nodes) => Console.WriteLine($"node.list -> {nodes.Length} node(s)"); ++ ++ Console.WriteLine("Connecting..."); ++ await client.ConnectAsync(); ++ ++ var connected = await WaitForConnectedAsync(connectedTcs.Task, errorTcs.Task, options.ConnectTimeoutMs); ++ if (!connected) ++ { ++ Console.Error.WriteLine($"Connection did not reach Connected within {options.ConnectTimeoutMs}ms (last status: {lastStatus})."); ++ return 1; ++ } ++ ++ Console.WriteLine($"Connected. Device ID: {client.OperatorDeviceId ?? "(unknown)"}"); ++ Console.WriteLine($"Granted scopes: {string.Join(", ", client.GrantedOperatorScopes)}"); ++ ++ if (options.ProbeReadApis) ++ { ++ Console.WriteLine("Probing read APIs (sessions/usage/nodes)..."); ++ await client.RequestSessionsAsync(); ++ await client.RequestUsageAsync(); ++ await client.RequestNodesAsync(); ++ await Task.Delay(1200); ++ } ++ ++ var failures = 0; ++ for (var i = 1; i <= options.Repeat; i++) ++ { ++ var message = options.Repeat == 1 ++ ? 
options.Message ++ : $"{options.Message} [attempt {i}/{options.Repeat}]"; ++ ++ try ++ { ++ Console.WriteLine($"chat.send #{i} -> \"{message}\""); ++ await client.SendChatMessageAsync(message); ++ Console.WriteLine($"chat.send #{i} OK"); ++ } ++ catch (Exception ex) ++ { ++ failures++; ++ Console.Error.WriteLine($"chat.send #{i} FAILED: {ex.Message}"); ++ } ++ ++ if (i < options.Repeat) ++ { ++ await Task.Delay(options.DelayMs); ++ } ++ } ++ ++ if (failures > 0) ++ { ++ Console.Error.WriteLine($"Completed with {failures} failed send(s)."); ++ return 1; ++ } ++ ++ Console.WriteLine("All sends succeeded."); ++ return 0; ++ } ++ ++ private static async Task WaitForConnectedAsync(Task connected, Task error, int timeoutMs) ++ { ++ using var timeoutCts = new CancellationTokenSource(timeoutMs); ++ var timeoutTask = Task.Delay(Timeout.InfiniteTimeSpan, timeoutCts.Token); ++ ++ var completed = await Task.WhenAny(connected, error, timeoutTask); ++ if (completed == connected) ++ { ++ return true; ++ } ++ ++ return false; ++ } ++ ++ private static (string GatewayUrl, string Token, SettingsData? Loaded) LoadConnectionFromSettings(CliOptions options) ++ { ++ var loaded = LoadSettings(options.SettingsPath); ++ ++ var gatewayUrl = options.GatewayUrlOverride; ++ if (string.IsNullOrWhiteSpace(gatewayUrl)) ++ { ++ gatewayUrl = BuildEffectiveGatewayUrl(loaded); ++ } ++ ++ var token = options.TokenOverride; ++ if (string.IsNullOrWhiteSpace(token)) ++ { ++ token = loaded?.Token; ++ } ++ ++ return (gatewayUrl ?? string.Empty, token ?? string.Empty, loaded); ++ } ++ ++ private static SettingsData? 
LoadSettings(string path) ++ { ++ if (!File.Exists(path)) ++ { ++ throw new FileNotFoundException("Settings file not found", path); ++ } ++ ++ var json = File.ReadAllText(path, Encoding.UTF8); ++ var settings = SettingsData.FromJson(json); ++ if (settings is null) ++ { ++ throw new InvalidOperationException("Settings JSON could not be parsed"); ++ } ++ ++ return settings; ++ } ++ ++ private static string? BuildEffectiveGatewayUrl(SettingsData? settings) ++ { ++ if (settings is null) ++ { ++ return null; ++ } ++ ++ if (!settings.UseSshTunnel) ++ { ++ return settings.GatewayUrl; ++ } ++ ++ var port = settings.SshTunnelLocalPort <= 0 ? 18789 : settings.SshTunnelLocalPort; ++ return $"ws://127.0.0.1:{port}"; ++ } ++ ++ private static CliOptions ParseArgs(string[] args) ++ { ++ var options = new CliOptions(); ++ ++ for (var i = 0; i < args.Length; i++) ++ { ++ var arg = args[i]; ++ switch (arg) ++ { ++ case "--settings": ++ options.SettingsPath = RequireValue(args, ref i, arg); ++ break; ++ case "--url": ++ options.GatewayUrlOverride = RequireValue(args, ref i, arg); ++ break; ++ case "--token": ++ options.TokenOverride = RequireValue(args, ref i, arg); ++ break; ++ case "--message": ++ options.Message = RequireValue(args, ref i, arg); ++ break; ++ case "--repeat": ++ options.Repeat = ParseInt(RequireValue(args, ref i, arg), min: 1, name: arg); ++ break; ++ case "--delay-ms": ++ options.DelayMs = ParseInt(RequireValue(args, ref i, arg), min: 0, name: arg); ++ break; ++ case "--connect-timeout-ms": ++ options.ConnectTimeoutMs = ParseInt(RequireValue(args, ref i, arg), min: 1000, name: arg); ++ break; ++ case "--probe-read": ++ options.ProbeReadApis = true; ++ break; ++ case "--verbose": ++ options.Verbose = true; ++ break; ++ default: ++ throw new ArgumentException($"Unknown argument: {arg}"); ++ } ++ } ++ ++ return options; ++ } ++ ++ private static string RequireValue(string[] args, ref int index, string name) ++ { ++ if (index + 1 >= args.Length) ++ { ++ throw new 
ArgumentException($"Missing value for {name}"); ++ } ++ ++ index++; ++ return args[index]; ++ } ++ ++ private static int ParseInt(string value, int min, string name) ++ { ++ if (!int.TryParse(value, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsed) || parsed < min) ++ { ++ throw new ArgumentException($"{name} must be an integer >= {min}"); ++ } ++ ++ return parsed; ++ } ++ ++ private static void PrintUsage() ++ { ++ Console.WriteLine("OpenClaw CLI WebSocket validator"); ++ Console.WriteLine(); ++ Console.WriteLine("Reads the same tray settings file and runs chat.send checks over gateway WebSocket."); ++ Console.WriteLine(); ++ Console.WriteLine("Usage:"); ++ Console.WriteLine(" dotnet run --project src/OpenClaw.Cli -- [options]"); ++ Console.WriteLine(); ++ Console.WriteLine("Options:"); ++ Console.WriteLine(" --settings Settings file (default: %APPDATA%\\OpenClawTray\\settings.json)"); ++ Console.WriteLine(" --url Override gateway URL"); ++ Console.WriteLine(" --token Override token"); ++ Console.WriteLine(" --message Message to send"); ++ Console.WriteLine(" --repeat Number of sends (default: 1)"); ++ Console.WriteLine(" --delay-ms Delay between sends (default: 500)"); ++ Console.WriteLine(" --connect-timeout-ms Wait for Connected state (default: 10000)"); ++ Console.WriteLine(" --probe-read Request sessions/usage/nodes once"); ++ Console.WriteLine(" --verbose Enable shared client console logs"); ++ Console.WriteLine(" --help, -h Show this help"); ++ } ++} +diff --git a/src/OpenClaw.Shared/DeviceIdentity.cs b/src/OpenClaw.Shared/DeviceIdentity.cs +index 1e96a46..ff2f07b 100644 +--- a/src/OpenClaw.Shared/DeviceIdentity.cs ++++ b/src/OpenClaw.Shared/DeviceIdentity.cs +@@ -134,6 +134,118 @@ public string SignPayload(string nonce, long signedAtMs, string clientId, string + // Return base64url encoded signature + return Base64UrlEncode(signature); + } ++ ++ /// ++ /// Sign a v3 connect payload for operator/client connections. 
++ /// Format: v3|{deviceId}|{clientId}|{clientMode}|{role}|{scopesCsv}|{signedAtMs}|{tokenOrEmpty}|{nonce}|{platform}|{deviceFamily} ++ /// ++ public string SignConnectPayloadV3( ++ string nonce, ++ long signedAtMs, ++ string clientId, ++ string clientMode, ++ string role, ++ IEnumerable scopes, ++ string authToken, ++ string platform, ++ string deviceFamily) ++ { ++ if (_privateKey == null) ++ throw new InvalidOperationException("Device not initialized"); ++ ++ var payload = BuildConnectPayloadV3( ++ nonce, ++ signedAtMs, ++ clientId, ++ clientMode, ++ role, ++ scopes, ++ authToken, ++ platform, ++ deviceFamily); ++ ++ var dataBytes = Encoding.UTF8.GetBytes(payload); ++ var signature = Ed25519Algorithm.Sign(_privateKey, dataBytes); ++ return Base64UrlEncode(signature); ++ } ++ ++ /// ++ /// Build the v3 connect payload string for signing/debugging. ++ /// Format: v3|{deviceId}|{clientId}|{clientMode}|{role}|{scopesCsv}|{signedAtMs}|{tokenOrEmpty}|{nonce}|{platform}|{deviceFamily} ++ /// ++ public string BuildConnectPayloadV3( ++ string nonce, ++ long signedAtMs, ++ string clientId, ++ string clientMode, ++ string role, ++ IEnumerable scopes, ++ string authToken, ++ string platform, ++ string deviceFamily) ++ { ++ if (_deviceId == null) ++ throw new InvalidOperationException("Device not initialized"); ++ ++ var scopesCsv = string.Join(",", scopes ?? Array.Empty()); ++ var safeToken = authToken ?? string.Empty; ++ var safeNonce = nonce ?? string.Empty; ++ ++ return $"v3|{_deviceId}|{clientId}|{clientMode}|{role}|{scopesCsv}|{signedAtMs}|{safeToken}|{safeNonce}|{platform}|{deviceFamily}"; ++ } ++ ++ /// ++ /// Sign a v2 connect payload for compatibility mode. 
++ /// Format: v2|{deviceId}|{clientId}|{clientMode}|{role}|{scopesCsv}|{signedAtMs}|{tokenOrEmpty}|{nonce} ++ /// ++ public string SignConnectPayloadV2( ++ string nonce, ++ long signedAtMs, ++ string clientId, ++ string clientMode, ++ string role, ++ IEnumerable scopes, ++ string authToken) ++ { ++ if (_privateKey == null) ++ throw new InvalidOperationException("Device not initialized"); ++ ++ var payload = BuildConnectPayloadV2( ++ nonce, ++ signedAtMs, ++ clientId, ++ clientMode, ++ role, ++ scopes, ++ authToken); ++ ++ var dataBytes = Encoding.UTF8.GetBytes(payload); ++ var signature = Ed25519Algorithm.Sign(_privateKey, dataBytes); ++ return Base64UrlEncode(signature); ++ } ++ ++ /// ++ /// Build the v2 connect payload string for signing/debugging. ++ /// Format: v2|{deviceId}|{clientId}|{clientMode}|{role}|{scopesCsv}|{signedAtMs}|{tokenOrEmpty}|{nonce} ++ /// ++ public string BuildConnectPayloadV2( ++ string nonce, ++ long signedAtMs, ++ string clientId, ++ string clientMode, ++ string role, ++ IEnumerable scopes, ++ string authToken) ++ { ++ if (_deviceId == null) ++ throw new InvalidOperationException("Device not initialized"); ++ ++ var scopesCsv = string.Join(",", scopes ?? Array.Empty()); ++ var safeToken = authToken ?? string.Empty; ++ var safeNonce = nonce ?? 
string.Empty; ++ ++ return $"v2|{_deviceId}|{clientId}|{clientMode}|{role}|{scopesCsv}|{signedAtMs}|{safeToken}|{safeNonce}"; ++ } + + /// + /// Build the payload string (for debugging) +diff --git a/src/OpenClaw.Shared/OpenClawGatewayClient.cs b/src/OpenClaw.Shared/OpenClawGatewayClient.cs +index 0e21836..4872758 100644 +--- a/src/OpenClaw.Shared/OpenClawGatewayClient.cs ++++ b/src/OpenClaw.Shared/OpenClawGatewayClient.cs +@@ -1,5 +1,7 @@ + using System; + using System.Collections.Generic; ++using System.IO; ++using System.Text; + using System.Text.Json; + using System.Threading; + using System.Threading.Tasks; +@@ -8,6 +10,29 @@ namespace OpenClaw.Shared; + + public class OpenClawGatewayClient : WebSocketClientBase + { ++ private const string OperatorClientId = "cli"; ++ private const string OperatorClientDisplayName = "OpenClaw Windows Tray"; ++ private const string OperatorClientMode = "cli"; ++ private const string OperatorRole = "operator"; ++ private const string OperatorPlatform = "windows"; ++ private const string OperatorDeviceFamily = "desktop"; ++ private static readonly string[] s_operatorScopes = ++ [ ++ "operator.admin", ++ "operator.read", ++ "operator.write", ++ "operator.approvals", ++ "operator.pairing" ++ ]; ++ ++ private enum SignatureTokenMode ++ { ++ V3AuthToken, ++ V3EmptyToken, ++ V2AuthToken, ++ V2EmptyToken ++ } ++ + // Tracked state + private readonly Dictionary _sessions = new(); + private readonly Dictionary _nodes = new(); +@@ -15,13 +40,24 @@ public class OpenClawGatewayClient : WebSocketClientBase + private GatewayUsageStatusInfo? _usageStatus; + private GatewayCostUsageInfo? 
_usageCost; + private readonly Dictionary _pendingRequestMethods = new(); ++ private readonly Dictionary> _pendingChatSendRequests = new(); + private readonly object _pendingRequestLock = new(); ++ private readonly object _pendingChatSendLock = new(); + private readonly object _sessionsLock = new(); + private readonly object _nodesLock = new(); ++ private readonly DeviceIdentity _deviceIdentity; ++ private string _mainSessionKey = "main"; ++ private string? _operatorDeviceId; ++ private string[] _grantedOperatorScopes = Array.Empty(); ++ private string _connectAuthToken; ++ private SignatureTokenMode _signatureTokenMode = SignatureTokenMode.V3AuthToken; ++ private long? _challengeTimestampMs; + private bool _usageStatusUnsupported; + private bool _usageCostUnsupported; + private bool _sessionPreviewUnsupported; + private bool _nodeListUnsupported; ++ private bool _operatorReadScopeUnavailable; ++ private bool _pairingRequiredAwaitingApproval; + + private void ResetUnsupportedMethodFlags() + { +@@ -29,6 +65,7 @@ private void ResetUnsupportedMethodFlags() + _usageCostUnsupported = false; + _sessionPreviewUnsupported = false; + _nodeListUnsupported = false; ++ _operatorReadScopeUnavailable = false; + } + + protected override int ReceiveBufferSize => 16384; +@@ -46,6 +83,11 @@ protected override Task OnConnectedAsync() + return Task.CompletedTask; + } + ++ protected override bool ShouldAutoReconnect() ++ { ++ return !_pairingRequiredAwaitingApproval; ++ } ++ + protected override void OnDisconnected() + { + ClearPendingRequests(); +@@ -68,9 +110,20 @@ protected override void OnDisposing() + public event EventHandler? SessionPreviewUpdated; + public event EventHandler? SessionCommandCompleted; + ++ public string? OperatorDeviceId => _operatorDeviceId; ++ public IReadOnlyList GrantedOperatorScopes => _grantedOperatorScopes; ++ public bool IsConnectedToGateway => IsConnected; ++ + public OpenClawGatewayClient(string gatewayUrl, string token, IOpenClawLogger? 
logger = null) + : base(gatewayUrl, token, logger) + { ++ var dataPath = Path.Combine( ++ Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData), ++ "OpenClawTray"); ++ ++ _deviceIdentity = new DeviceIdentity(dataPath, _logger); ++ _deviceIdentity.Initialize(); ++ _connectAuthToken = _deviceIdentity.DeviceToken ?? _token; + } + + public async Task DisconnectAsync() +@@ -118,31 +171,58 @@ public async Task CheckHealthAsync() + } + } + +- public async Task SendChatMessageAsync(string message) ++ public async Task SendChatMessageAsync(string message, string? sessionKey = null) + { + if (!IsConnected) + throw new InvalidOperationException("Gateway connection is not open"); ++ if (string.IsNullOrWhiteSpace(message)) ++ throw new ArgumentException("Message is required", nameof(message)); ++ ++ var effectiveSessionKey = string.IsNullOrWhiteSpace(sessionKey) ++ ? _mainSessionKey ++ : sessionKey.Trim(); ++ ++ var requestId = Guid.NewGuid().ToString(); ++ var completion = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); ++ TrackPendingChatSend(requestId, completion); + + var req = new + { + type = "req", +- id = Guid.NewGuid().ToString(), ++ id = requestId, + method = "chat.send", +- @params = new { message } ++ @params = new ++ { ++ sessionKey = effectiveSessionKey, ++ message, ++ idempotencyKey = Guid.NewGuid().ToString() ++ } + }; ++ + await SendRawAsync(JsonSerializer.Serialize(req)); ++ ++ var completedTask = await Task.WhenAny(completion.Task, Task.Delay(5000, CancellationToken)); ++ if (completedTask != completion.Task) ++ { ++ RemovePendingChatSend(requestId); ++ throw new TimeoutException("Timed out waiting for chat.send response from gateway"); ++ } ++ ++ await completion.Task; + _logger.Info($"Sent chat message ({message.Length} chars)"); + } + + /// Request session list from gateway. 
+ public async Task RequestSessionsAsync() + { ++ if (_operatorReadScopeUnavailable) return; + await SendTrackedRequestAsync("sessions.list"); + } + + /// Request usage/context info from gateway (may not be supported on all gateways). + public async Task RequestUsageAsync() + { ++ if (_operatorReadScopeUnavailable) return; + if (!IsConnected) return; + try + { +@@ -167,6 +247,7 @@ public async Task RequestUsageAsync() + /// Request connected node inventory from gateway. + public async Task RequestNodesAsync() + { ++ if (_operatorReadScopeUnavailable) return; + if (_nodeListUnsupported) return; + await SendTrackedRequestAsync("node.list"); + } +@@ -281,11 +362,40 @@ public async Task StopChannelAsync(string channelName) + + private async Task SendConnectMessageAsync(string? nonce = null) + { ++ var requestId = Guid.NewGuid().ToString(); ++ TrackPendingRequest(requestId, "connect"); ++ ++ var signedAt = _challengeTimestampMs ?? DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(); ++ var connectNonce = nonce ?? string.Empty; ++ var signatureToken = _signatureTokenMode is SignatureTokenMode.V3EmptyToken or SignatureTokenMode.V2EmptyToken ++ ? string.Empty ++ : _connectAuthToken; ++ ++ var signature = _signatureTokenMode is SignatureTokenMode.V2AuthToken or SignatureTokenMode.V2EmptyToken ++ ? _deviceIdentity.SignConnectPayloadV2( ++ connectNonce, ++ signedAt, ++ OperatorClientId, ++ OperatorClientMode, ++ OperatorRole, ++ s_operatorScopes, ++ signatureToken) ++ : _deviceIdentity.SignConnectPayloadV3( ++ connectNonce, ++ signedAt, ++ OperatorClientId, ++ OperatorClientMode, ++ OperatorRole, ++ s_operatorScopes, ++ signatureToken, ++ OperatorPlatform, ++ OperatorDeviceFamily); ++ + // Use "cli" client ID for native apps - no browser security checks + var msg = new + { + type = "req", +- id = Guid.NewGuid().ToString(), ++ id = requestId, + method = "connect", + @params = new + { +@@ -293,23 +403,40 @@ private async Task SendConnectMessageAsync(string? 
nonce = null) + maxProtocol = 3, + client = new + { +- id = "cli", // Native client ID ++ id = OperatorClientId, // Native client ID + version = "1.0.0", +- platform = "windows", +- mode = "cli", +- displayName = "OpenClaw Windows Tray" ++ platform = OperatorPlatform, ++ mode = OperatorClientMode, ++ displayName = OperatorClientDisplayName + }, +- role = "operator", +- scopes = new[] { "operator.admin", "operator.approvals", "operator.pairing" }, ++ role = OperatorRole, ++ scopes = s_operatorScopes, + caps = Array.Empty(), + commands = Array.Empty(), + permissions = new { }, +- auth = new { token = _token }, ++ auth = new { token = _connectAuthToken }, + locale = "en-US", +- userAgent = "openclaw-windows-tray/1.0.0" ++ userAgent = "openclaw-windows-tray/1.0.0", ++ device = new ++ { ++ id = _deviceIdentity.DeviceId, ++ publicKey = _deviceIdentity.PublicKeyBase64Url, ++ signature, ++ signedAt, ++ nonce = connectNonce ++ } + } + }; +- await SendRawAsync(JsonSerializer.Serialize(msg)); ++ ++ try ++ { ++ await SendRawAsync(JsonSerializer.Serialize(msg)); ++ } ++ catch ++ { ++ RemovePendingRequest(requestId); ++ throw; ++ } + } + + private async Task SendTrackedRequestAsync(string method, object? parameters = null) +@@ -397,6 +524,51 @@ private void ClearPendingRequests() + { + _pendingRequestMethods.Clear(); + } ++ ++ lock (_pendingChatSendLock) ++ { ++ foreach (var completion in _pendingChatSendRequests.Values) ++ { ++ completion.TrySetException(new OperationCanceledException("Request canceled")); ++ } ++ ++ _pendingChatSendRequests.Clear(); ++ } ++ } ++ ++ private void TrackPendingChatSend(string requestId, TaskCompletionSource completion) ++ { ++ lock (_pendingChatSendLock) ++ { ++ _pendingChatSendRequests[requestId] = completion; ++ } ++ } ++ ++ private void RemovePendingChatSend(string requestId) ++ { ++ lock (_pendingChatSendLock) ++ { ++ _pendingChatSendRequests.Remove(requestId); ++ } ++ } ++ ++ private TaskCompletionSource? TakePendingChatSend(string? 
requestId) ++ { ++ if (string.IsNullOrWhiteSpace(requestId)) ++ { ++ return null; ++ } ++ ++ lock (_pendingChatSendLock) ++ { ++ if (!_pendingChatSendRequests.TryGetValue(requestId, out var completion)) ++ { ++ return null; ++ } ++ ++ _pendingChatSendRequests.Remove(requestId); ++ return completion; ++ } + } + + // --- Message processing --- +@@ -434,9 +606,27 @@ private void ProcessMessage(string json) + private void HandleResponse(JsonElement root) + { + string? requestMethod = null; ++ string? requestId = null; + if (root.TryGetProperty("id", out var idProp)) + { +- requestMethod = TakePendingRequestMethod(idProp.GetString()); ++ requestId = idProp.GetString(); ++ requestMethod = TakePendingRequestMethod(requestId); ++ } ++ ++ var pendingChatSend = TakePendingChatSend(requestId); ++ if (pendingChatSend != null) ++ { ++ if (root.TryGetProperty("ok", out var okChatProp) && ++ okChatProp.ValueKind == JsonValueKind.False) ++ { ++ var message = TryGetErrorMessage(root) ?? "request failed"; ++ _logger.Warn($"chat.send failed: {message}"); ++ pendingChatSend.TrySetException(new InvalidOperationException(message)); ++ return; ++ } ++ ++ pendingChatSend.TrySetResult(true); ++ return; + } + + if (root.TryGetProperty("ok", out var okProp) && +@@ -453,10 +643,31 @@ private void HandleResponse(JsonElement root) + return; + } + +- // Handle hello-ok ++ // Handle handshake acknowledgement payload. + if (payload.TryGetProperty("type", out var t) && t.GetString() == "hello-ok") + { ++ _pairingRequiredAwaitingApproval = false; ++ _operatorDeviceId = TryGetHandshakeDeviceId(payload); ++ _grantedOperatorScopes = TryGetHandshakeScopes(payload); ++ _mainSessionKey = TryGetHandshakeMainSessionKey(payload) ?? 
"main"; ++ var newDeviceToken = TryGetHandshakeDeviceToken(payload); ++ if (!string.IsNullOrWhiteSpace(newDeviceToken)) ++ { ++ _deviceIdentity.StoreDeviceToken(newDeviceToken); ++ _connectAuthToken = newDeviceToken; ++ _logger.Info("Operator device token stored for reconnect"); ++ } ++ + _logger.Info("Handshake complete (hello-ok)"); ++ if (!string.IsNullOrWhiteSpace(_operatorDeviceId)) ++ { ++ _logger.Info($"Operator device ID: {_operatorDeviceId}"); ++ } ++ if (_grantedOperatorScopes.Length > 0) ++ { ++ _logger.Info($"Granted operator scopes: {string.Join(", ", _grantedOperatorScopes)}"); ++ } ++ _logger.Info($"Main session key: {_mainSessionKey}"); + RaiseStatusChanged(ConnectionStatus.Connected); + + // Request initial state after handshake +@@ -543,6 +754,49 @@ private void HandleRequestError(string? method, JsonElement root) + return; + } + ++ if (method == "connect" && ++ message.Contains("device signature invalid", StringComparison.OrdinalIgnoreCase)) ++ { ++ var previousMode = _signatureTokenMode; ++ _signatureTokenMode = _signatureTokenMode switch ++ { ++ SignatureTokenMode.V3AuthToken => SignatureTokenMode.V3EmptyToken, ++ SignatureTokenMode.V3EmptyToken => SignatureTokenMode.V2AuthToken, ++ SignatureTokenMode.V2AuthToken => SignatureTokenMode.V2EmptyToken, ++ _ => SignatureTokenMode.V2EmptyToken ++ }; ++ ++ if (_signatureTokenMode != previousMode) ++ { ++ _logger.Warn($"Gateway rejected device signature with mode {previousMode}; retrying with mode {_signatureTokenMode}"); ++ return; ++ } ++ ++ _logger.Warn("Gateway rejected device signature in all supported payload modes"); ++ return; ++ } ++ ++ if (method == "connect" && ++ message.Contains("pairing required", StringComparison.OrdinalIgnoreCase)) ++ { ++ _pairingRequiredAwaitingApproval = true; ++ _logger.Warn("Pairing approval required for this device; auto-reconnect paused until manual reconnect or app restart"); ++ RaiseStatusChanged(ConnectionStatus.Error); ++ return; ++ } ++ ++ if 
(IsMissingScopeError(message, "operator.read") && ++ method is "sessions.list" or "usage.status" or "usage.cost" or "node.list") ++ { ++ if (!_operatorReadScopeUnavailable) ++ { ++ _logger.Warn("Gateway token lacks operator.read; disabling sessions/usage/nodes polling"); ++ } ++ ++ _operatorReadScopeUnavailable = true; ++ return; ++ } ++ + if (IsUnknownMethodError(message)) + { + switch (method) +@@ -631,11 +885,184 @@ private static bool IsUnknownMethodError(string errorMessage) + return errorMessage.Contains("unknown method", StringComparison.OrdinalIgnoreCase); + } + ++ private static bool IsMissingScopeError(string errorMessage, string scope) ++ { ++ if (string.IsNullOrWhiteSpace(errorMessage) || string.IsNullOrWhiteSpace(scope)) ++ return false; ++ ++ var expected = $"missing scope: {scope}"; ++ return errorMessage.Contains(expected, StringComparison.OrdinalIgnoreCase); ++ } ++ + private static bool IsSessionCommandMethod(string method) + { + return method is "sessions.patch" or "sessions.reset" or "sessions.delete" or "sessions.compact"; + } + ++ private static string? 
TryGetHandshakeDeviceId(JsonElement payload) ++ { ++ if (payload.TryGetProperty("deviceId", out var deviceIdProp) && ++ deviceIdProp.ValueKind == JsonValueKind.String) ++ { ++ return deviceIdProp.GetString(); ++ } ++ ++ if (payload.TryGetProperty("device", out var deviceProp) && ++ deviceProp.ValueKind == JsonValueKind.Object) ++ { ++ if (deviceProp.TryGetProperty("id", out var idProp) && idProp.ValueKind == JsonValueKind.String) ++ { ++ return idProp.GetString(); ++ } ++ ++ if (deviceProp.TryGetProperty("deviceId", out var didProp) && didProp.ValueKind == JsonValueKind.String) ++ { ++ return didProp.GetString(); ++ } ++ } ++ ++ return null; ++ } ++ ++ private static string[] TryGetHandshakeScopes(JsonElement payload) ++ { ++ if (payload.TryGetProperty("scopes", out var scopesProp) && ++ scopesProp.ValueKind == JsonValueKind.Array) ++ { ++ var scopes = new List(); ++ foreach (var scope in scopesProp.EnumerateArray()) ++ { ++ if (scope.ValueKind == JsonValueKind.String) ++ { ++ var value = scope.GetString(); ++ if (!string.IsNullOrWhiteSpace(value)) ++ { ++ scopes.Add(value); ++ } ++ } ++ } ++ ++ return scopes.ToArray(); ++ } ++ ++ return Array.Empty(); ++ } ++ ++ private static string? TryGetHandshakeMainSessionKey(JsonElement payload) ++ { ++ if (!payload.TryGetProperty("snapshot", out var snapshot) || snapshot.ValueKind != JsonValueKind.Object) ++ { ++ return null; ++ } ++ ++ if (!snapshot.TryGetProperty("sessionDefaults", out var sessionDefaults) || sessionDefaults.ValueKind != JsonValueKind.Object) ++ { ++ return null; ++ } ++ ++ if (!sessionDefaults.TryGetProperty("mainKey", out var mainKey) || mainKey.ValueKind != JsonValueKind.String) ++ { ++ return null; ++ } ++ ++ var value = mainKey.GetString(); ++ return string.IsNullOrWhiteSpace(value) ? null : value; ++ } ++ ++ private static string? 
TryGetHandshakeDeviceToken(JsonElement payload) ++ { ++ if (!payload.TryGetProperty("auth", out var authPayload) || authPayload.ValueKind != JsonValueKind.Object) ++ { ++ return null; ++ } ++ ++ if (!authPayload.TryGetProperty("deviceToken", out var deviceToken) || deviceToken.ValueKind != JsonValueKind.String) ++ { ++ return null; ++ } ++ ++ var value = deviceToken.GetString(); ++ return string.IsNullOrWhiteSpace(value) ? null : value; ++ } ++ ++ public string BuildMissingScopeFixCommands(string missingScope) ++ { ++ var scope = string.IsNullOrWhiteSpace(missingScope) ? "operator.write" : missingScope.Trim(); ++ var grantedScopes = _grantedOperatorScopes.Length == 0 ++ ? "(none reported by gateway)" ++ : string.Join(", ", _grantedOperatorScopes); ++ var deviceId = string.IsNullOrWhiteSpace(_operatorDeviceId) ++ ? "(not reported for this operator connection)" ++ : _operatorDeviceId; ++ var likelyNodeToken = _grantedOperatorScopes.Any(s => s.StartsWith("node.", StringComparison.OrdinalIgnoreCase)); ++ ++ var sb = new StringBuilder(); ++ sb.AppendLine("Quick Send is connected, but your token is missing required permission."); ++ sb.AppendLine($"Missing scope: {scope}"); ++ sb.AppendLine("Note: requested connect scopes are declarative; the gateway may grant fewer scopes based on token/policy/device state."); ++ sb.AppendLine(); ++ sb.AppendLine("Do this in Windows Tray right now:"); ++ sb.AppendLine("1. Right-click the tray icon and open Settings."); ++ sb.AppendLine("2. Replace Gateway Token with an OPERATOR token that includes operator.write."); ++ sb.AppendLine("3. Click Save."); ++ sb.AppendLine("4. Reconnect from the tray menu (or restart the tray app)."); ++ sb.AppendLine("5. 
Retry Quick Send."); ++ sb.AppendLine(); ++ sb.AppendLine("Token requirements for Quick Send:"); ++ sb.AppendLine("- Role: operator"); ++ sb.AppendLine("- Required scope: operator.write"); ++ sb.AppendLine("- Recommended scopes: operator.admin, operator.read, operator.approvals, operator.pairing, operator.write"); ++ ++ if (likelyNodeToken) ++ { ++ sb.AppendLine(); ++ sb.AppendLine("Detected node.* scopes. This usually means a node token was pasted into Gateway Token."); ++ sb.AppendLine("Quick Send requires an operator token, not a node token."); ++ } ++ ++ sb.AppendLine(); ++ sb.AppendLine("Connection details from this app (for debugging/support):"); ++ sb.AppendLine($"- role: operator"); ++ sb.AppendLine($"- client.id: {OperatorClientId}"); ++ sb.AppendLine($"- client.displayName: {OperatorClientDisplayName}"); ++ sb.AppendLine($"- operator device id: {deviceId}"); ++ sb.AppendLine($"- granted scopes: {grantedScopes}"); ++ sb.AppendLine(); ++ sb.AppendLine("If this still fails after updating the token, copy this block and share it with your gateway admin."); ++ return sb.ToString().TrimEnd(); ++ } ++ ++ public string BuildPairingApprovalFixCommands() ++ { ++ var deviceId = !string.IsNullOrWhiteSpace(_operatorDeviceId) ++ ? _operatorDeviceId ++ : _deviceIdentity.DeviceId; ++ var grantedScopes = _grantedOperatorScopes.Length == 0 ++ ? "(none reported by gateway yet)" ++ : string.Join(", ", _grantedOperatorScopes); ++ ++ var sb = new StringBuilder(); ++ sb.AppendLine("Quick Send requires this device to be approved (paired) in the gateway."); ++ sb.AppendLine("Gateway reported: pairing required"); ++ sb.AppendLine(); ++ sb.AppendLine("Do this now:"); ++ sb.AppendLine("1. Open the gateway admin UI."); ++ sb.AppendLine("2. Go to pending pairing/device approvals."); ++ sb.AppendLine("3. Approve this Windows tray device ID."); ++ sb.AppendLine("4. Return to tray and reconnect (or restart tray app)."); ++ sb.AppendLine("5. 
Retry Quick Send."); ++ sb.AppendLine(); ++ sb.AppendLine("Connection details from this app (for debugging/support):"); ++ sb.AppendLine("- role: operator"); ++ sb.AppendLine($"- client.id: {OperatorClientId}"); ++ sb.AppendLine($"- client.displayName: {OperatorClientDisplayName}"); ++ sb.AppendLine($"- operator device id: {deviceId}"); ++ sb.AppendLine($"- granted scopes: {grantedScopes}"); ++ sb.AppendLine(); ++ sb.AppendLine("If approval keeps failing, share this block with your gateway admin."); ++ return sb.ToString().TrimEnd(); ++ } ++ + private void HandleEvent(JsonElement root) + { + if (!root.TryGetProperty("event", out var eventProp)) return; +@@ -666,11 +1093,19 @@ private void HandleEvent(JsonElement root) + private void HandleConnectChallenge(JsonElement root) + { + string? nonce = null; ++ long? ts = null; + if (root.TryGetProperty("payload", out var payload) && + payload.TryGetProperty("nonce", out var nonceProp)) + { + nonce = nonceProp.GetString(); ++ ++ if (payload.TryGetProperty("ts", out var tsProp) && tsProp.ValueKind == JsonValueKind.Number) ++ { ++ ts = tsProp.GetInt64(); ++ } + } ++ ++ _challengeTimestampMs = ts; + + _logger.Info($"Received challenge, nonce: {nonce}"); + _ = SendConnectMessageAsync(nonce); +diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs +index 7a2d4b5..27ec9a0 100644 +--- a/src/OpenClaw.Shared/SettingsData.cs ++++ b/src/OpenClaw.Shared/SettingsData.cs +@@ -28,6 +28,7 @@ public class SettingsData + public bool NotifyInfo { get; set; } = true; + public bool EnableNodeMode { get; set; } = false; + public bool HasSeenActivityStreamTip { get; set; } = false; ++ public string? SkippedUpdateTag { get; set; } + public bool NotifyChatResponses { get; set; } = true; + public bool PreferStructuredCategories { get; set; } = true; + public List? 
UserRules { get; set; } +diff --git a/src/OpenClaw.Shared/WebSocketClientBase.cs b/src/OpenClaw.Shared/WebSocketClientBase.cs +index 72c4d10..0a633e2 100644 +--- a/src/OpenClaw.Shared/WebSocketClientBase.cs ++++ b/src/OpenClaw.Shared/WebSocketClientBase.cs +@@ -19,6 +19,7 @@ public abstract class WebSocketClientBase : IDisposable + private CancellationTokenSource _cts; + private bool _disposed; + private int _reconnectAttempts; ++ private int _reconnectLoopActive; + private static readonly int[] BackoffMs = { 1000, 2000, 4000, 8000, 15000, 30000, 60000 }; + + protected readonly string _token; +@@ -68,6 +69,12 @@ protected virtual void OnError(Exception ex) { } + /// Called at the start of Dispose, before CTS cancellation. + protected virtual void OnDisposing() { } + ++ /// ++ /// Whether auto-reconnect should run after an unexpected disconnect. ++ /// Subclasses can return false for known terminal states (for example awaiting pairing approval). ++ /// ++ protected virtual bool ShouldAutoReconnect() => true; ++ + protected WebSocketClientBase(string gatewayUrl, string token, IOpenClawLogger? logger = null) + { + if (string.IsNullOrEmpty(gatewayUrl)) +@@ -85,6 +92,12 @@ protected WebSocketClientBase(string gatewayUrl, string token, IOpenClawLogger? 
+ + public async Task ConnectAsync() + { ++ if (_disposed) ++ { ++ _logger.Debug($"Skipping {ClientRole} connect: client already disposed"); ++ return; ++ } ++ + try + { + RaiseStatusChanged(ConnectionStatus.Connecting); +@@ -116,10 +129,23 @@ public async Task ConnectAsync() + + _ = Task.Run(() => ListenForMessagesAsync(), _cts.Token); + } ++ catch (OperationCanceledException) ++ { ++ _logger.Debug($"{ClientRole} connect canceled (likely shutdown)"); ++ } ++ catch (ObjectDisposedException) ++ { ++ _logger.Debug($"{ClientRole} connect aborted after dispose"); ++ } + catch (Exception ex) + { + _logger.Error($"{ClientRole} connection failed", ex); + RaiseStatusChanged(ConnectionStatus.Error); ++ ++ if (!_disposed && !_cts.Token.IsCancellationRequested && ShouldAutoReconnect()) ++ { ++ _ = ReconnectWithBackoffAsync(); ++ } + } + } + +@@ -175,7 +201,7 @@ private async Task ListenForMessagesAsync() + { + try + { +- if (!_cts.Token.IsCancellationRequested) ++ if (!_cts.Token.IsCancellationRequested && ShouldAutoReconnect()) + { + await ReconnectWithBackoffAsync(); + } +@@ -186,31 +212,51 @@ private async Task ListenForMessagesAsync() + + protected async Task ReconnectWithBackoffAsync() + { +- var delay = BackoffMs[Math.Min(_reconnectAttempts, BackoffMs.Length - 1)]; +- _reconnectAttempts++; +- _logger.Warn($"{ClientRole} reconnecting in {delay}ms (attempt {_reconnectAttempts})"); +- RaiseStatusChanged(ConnectionStatus.Connecting); ++ if (Interlocked.CompareExchange(ref _reconnectLoopActive, 1, 0) != 0) ++ { ++ return; ++ } + + try + { +- await Task.Delay(delay, _cts.Token); ++ while (!_disposed && !_cts.Token.IsCancellationRequested && ShouldAutoReconnect()) ++ { ++ var delay = BackoffMs[Math.Min(_reconnectAttempts, BackoffMs.Length - 1)]; ++ _reconnectAttempts++; ++ _logger.Warn($"{ClientRole} reconnecting in {delay}ms (attempt {_reconnectAttempts})"); ++ RaiseStatusChanged(ConnectionStatus.Connecting); ++ ++ await Task.Delay(delay, _cts.Token); ++ ++ if 
(_cts.Token.IsCancellationRequested || _disposed || !ShouldAutoReconnect()) ++ { ++ break; ++ } + +- // Check cancellation after delay +- if (_cts.Token.IsCancellationRequested) return; ++ // Safely dispose old socket ++ var oldSocket = _webSocket; ++ _webSocket = null; ++ try { oldSocket?.Dispose(); } catch { /* ignore dispose errors */ } + +- // Safely dispose old socket +- var oldSocket = _webSocket; +- _webSocket = null; +- try { oldSocket?.Dispose(); } catch { /* ignore dispose errors */ } ++ await ConnectAsync(); + +- await ConnectAsync(); ++ if (IsConnected) ++ { ++ break; ++ } ++ } + } + catch (OperationCanceledException) { } ++ catch (ObjectDisposedException) { } + catch (Exception ex) + { + _logger.Error($"{ClientRole} reconnect failed", ex); + RaiseStatusChanged(ConnectionStatus.Error); + } ++ finally ++ { ++ Interlocked.Exchange(ref _reconnectLoopActive, 0); ++ } + } + + /// Send a text message over the WebSocket. Thread-safe. +diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs +index caff372..a68ad7d 100644 +--- a/src/OpenClaw.Tray.WinUI/App.xaml.cs ++++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs +@@ -41,6 +41,7 @@ public partial class App : Application + private Mutex? _mutex; + private Microsoft.UI.Dispatching.DispatcherQueue? _dispatcherQueue; + private CancellationTokenSource? _deepLinkCts; ++ private bool _isExiting; + + private ConnectionStatus _currentStatus = ConnectionStatus.Disconnected; + private AgentActivity? _currentActivity; +@@ -235,6 +236,9 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) + // Store protocol URI for processing after setup + _pendingProtocolUri = protocolUri; + ++ // Initialize settings before update check so skip selections can be remembered. 
++ _settings = new SettingsManager(); ++ + // Register URI scheme on first run + DeepLinkHandler.RegisterUriScheme(); + +@@ -249,8 +253,6 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) + // Register toast activation handler + ToastNotificationManagerCompat.OnActivated += OnToastActivated; + +- // Initialize settings +- _settings = new SettingsManager(); + _sshTunnelService = new SshTunnelService(new AppLogger()); + + // First-run check +@@ -1680,7 +1682,7 @@ private void ShowQuickSend(string? prefillMessage = null) + else + { + Logger.Info("QuickSend dialog already open; activating"); +- _quickSendDialog.Activate(); ++ _quickSendDialog.ShowAsync(); + return; + } + } +@@ -1695,7 +1697,7 @@ private void ShowQuickSend(string? prefillMessage = null) + } + }; + _quickSendDialog = dialog; +- dialog.Activate(); ++ dialog.ShowAsync(); + } + catch (Exception ex) + { +@@ -1894,15 +1896,33 @@ private async Task CheckForUpdatesAsync() + var changelog = AppUpdater.GetChangelog(true) ?? "No release notes available."; + Logger.Info($"Update available: {release.TagName}"); + ++ if (!string.IsNullOrWhiteSpace(_settings?.SkippedUpdateTag) && ++ string.Equals(_settings.SkippedUpdateTag, release.TagName, StringComparison.OrdinalIgnoreCase)) ++ { ++ Logger.Info($"Skipping update prompt for remembered version {release.TagName}"); ++ return true; ++ } ++ + var dialog = new UpdateDialog(release.TagName, changelog); + var result = await dialog.ShowAsync(); + + if (result == UpdateDialogResult.Download) + { ++ if (_settings != null) ++ { ++ _settings.SkippedUpdateTag = string.Empty; ++ _settings.Save(); ++ } + var installed = await DownloadAndInstallUpdateAsync(); + return !installed; // Don't launch if update succeeded + } + ++ if (result == UpdateDialogResult.Skip && _settings != null) ++ { ++ _settings.SkippedUpdateTag = release.TagName ?? 
string.Empty; ++ _settings.Save(); ++ } ++ + return true; // RemindLater or Skip - continue + } + catch (Exception ex) +@@ -1969,6 +1989,7 @@ private void StartDeepLinkServer() + } + catch (OperationCanceledException) + { ++ Logger.Info("Deep link server stopping (canceled)"); + break; // Normal shutdown + } + catch (Exception ex) +@@ -2058,35 +2079,133 @@ private void OnToastActivated(ToastNotificationActivatedEventArgsCompat args) + + private void ExitApplication() + { ++ if (_isExiting) ++ { ++ Logger.Info("Exit requested while shutdown already in progress"); ++ return; ++ } ++ ++ _isExiting = true; + Logger.Info("Application exiting"); +- ++ + // Cancel background tasks +- _deepLinkCts?.Cancel(); +- ++ if (_deepLinkCts != null) ++ { ++ Logger.Info("Shutdown: canceling deep link server"); ++ try { _deepLinkCts.Cancel(); } catch (Exception ex) { Logger.Warn($"Shutdown: deep link cancel failed: {ex.Message}"); } ++ } ++ + // Stop timers +- _healthCheckTimer?.Stop(); +- _healthCheckTimer?.Dispose(); +- _sessionPollTimer?.Stop(); +- _sessionPollTimer?.Dispose(); +- ++ SafeShutdownStep("health timer", () => ++ { ++ _healthCheckTimer?.Stop(); ++ _healthCheckTimer?.Dispose(); ++ _healthCheckTimer = null; ++ }); ++ ++ SafeShutdownStep("session poll timer", () => ++ { ++ _sessionPollTimer?.Stop(); ++ _sessionPollTimer?.Dispose(); ++ _sessionPollTimer = null; ++ }); ++ + // Cleanup hotkey +- _globalHotkey?.Dispose(); +- +- // Unsubscribe and dispose gateway client +- UnsubscribeGatewayEvents(); +- _gatewayClient?.Dispose(); +- _sshTunnelService?.Dispose(); +- ++ SafeShutdownStep("global hotkey", () => ++ { ++ _globalHotkey?.Dispose(); ++ _globalHotkey = null; ++ }); ++ ++ // Dispose runtime services ++ SafeShutdownStep("gateway client", () => ++ { ++ UnsubscribeGatewayEvents(); ++ _gatewayClient?.Dispose(); ++ _gatewayClient = null; ++ }); ++ ++ SafeShutdownStep("node service", () => ++ { ++ _nodeService?.Dispose(); ++ _nodeService = null; ++ }); ++ ++ 
SafeShutdownStep("ssh tunnel service", () => ++ { ++ _sshTunnelService?.Dispose(); ++ _sshTunnelService = null; ++ }); ++ ++ // Close windows explicitly for deterministic shutdown tracing. ++ SafeShutdownStep("settings window", () => CloseWindow(_settingsWindow)); ++ _settingsWindow = null; ++ SafeShutdownStep("web chat window", () => CloseWindow(_webChatWindow)); ++ _webChatWindow = null; ++ SafeShutdownStep("status detail window", () => CloseWindow(_statusDetailWindow)); ++ _statusDetailWindow = null; ++ SafeShutdownStep("notification history window", () => CloseWindow(_notificationHistoryWindow)); ++ _notificationHistoryWindow = null; ++ SafeShutdownStep("activity stream window", () => CloseWindow(_activityStreamWindow)); ++ _activityStreamWindow = null; ++ SafeShutdownStep("tray menu window", () => CloseWindow(_trayMenuWindow)); ++ _trayMenuWindow = null; ++ SafeShutdownStep("quick send dialog", () => CloseWindow(_quickSendDialog)); ++ _quickSendDialog = null; ++ SafeShutdownStep("keep alive window", () => CloseWindow(_keepAliveWindow)); ++ _keepAliveWindow = null; ++ + // Dispose tray and mutex +- _trayIcon?.Dispose(); +- _mutex?.Dispose(); +- ++ SafeShutdownStep("tray icon", () => ++ { ++ _trayIcon?.Dispose(); ++ _trayIcon = null; ++ }); ++ ++ SafeShutdownStep("single-instance mutex", () => ++ { ++ _mutex?.Dispose(); ++ _mutex = null; ++ }); ++ + // Dispose cancellation token source +- _deepLinkCts?.Dispose(); +- ++ SafeShutdownStep("deep link token source", () => ++ { ++ _deepLinkCts?.Dispose(); ++ _deepLinkCts = null; ++ }); ++ ++ Logger.Info("Shutdown complete; calling Exit() now"); + Exit(); + } + ++ private static void CloseWindow(Window? window) ++ { ++ try ++ { ++ window?.Close(); ++ } ++ catch ++ { ++ // Let caller log specific failure context. 
++ throw; ++ } ++ } ++ ++ private static void SafeShutdownStep(string name, Action action) ++ { ++ try ++ { ++ Logger.Info($"Shutdown: disposing {name}"); ++ action(); ++ Logger.Info($"Shutdown: disposed {name}"); ++ } ++ catch (Exception ex) ++ { ++ Logger.Warn($"Shutdown: failed disposing {name}: {ex.Message}"); ++ } ++ } ++ + private bool EnsureSshTunnelConfigured() + { + if (_settings == null) +diff --git a/src/OpenClaw.Tray.WinUI/Dialogs/QuickSendDialog.cs b/src/OpenClaw.Tray.WinUI/Dialogs/QuickSendDialog.cs +index c8ea27b..1c54f20 100644 +--- a/src/OpenClaw.Tray.WinUI/Dialogs/QuickSendDialog.cs ++++ b/src/OpenClaw.Tray.WinUI/Dialogs/QuickSendDialog.cs +@@ -9,6 +9,7 @@ + using System; + using System.Runtime.InteropServices; + using System.Threading.Tasks; ++using System.Text.RegularExpressions; + using WinUIEx; + + namespace OpenClawTray.Dialogs; +@@ -20,8 +21,8 @@ public sealed class QuickSendDialog : WindowEx + { + private readonly OpenClawGatewayClient _client; + private readonly TextBox _messageTextBox; ++ private readonly TextBox _errorDetailsTextBox; + private readonly Button _sendButton; +- private readonly TextBlock _statusText; + private bool _isSending; + + [DllImport("user32.dll")] +@@ -52,7 +53,7 @@ public QuickSendDialog(OpenClawGatewayClient client, string? prefillMessage = nu + + // Window setup + Title = LocalizationHelper.GetString("WindowTitle_QuickSend"); +- this.SetWindowSize(400, 200); ++ this.SetWindowSize(420, 260); + this.CenterOnScreen(); + this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected)); + +@@ -65,17 +66,21 @@ public QuickSendDialog(OpenClawGatewayClient client, string? 
prefillMessage = nu + this.IsAlwaysOnTop = true; + + // Build UI programmatically (simple dialog) +- var root = new StackPanel ++ var root = new Grid + { +- Spacing = 12, +- Padding = new Thickness(24) ++ RowSpacing = 12 + }; ++ root.RowDefinitions.Add(new RowDefinition { Height = GridLength.Auto }); ++ root.RowDefinitions.Add(new RowDefinition { Height = GridLength.Auto }); ++ root.RowDefinitions.Add(new RowDefinition { Height = new GridLength(1, GridUnitType.Star) }); ++ root.RowDefinitions.Add(new RowDefinition { Height = GridLength.Auto }); + + var header = new TextBlock + { + Text = LocalizationHelper.GetString("QuickSend_Header"), + Style = (Style)Application.Current.Resources["SubtitleTextBlockStyle"] + }; ++ Grid.SetRow(header, 0); + root.Children.Add(header); + + _messageTextBox = new TextBox +@@ -85,8 +90,24 @@ public QuickSendDialog(OpenClawGatewayClient client, string? prefillMessage = nu + Text = prefillMessage ?? "" + }; + _messageTextBox.KeyDown += OnKeyDown; ++ Grid.SetRow(_messageTextBox, 1); + root.Children.Add(_messageTextBox); + ++ _errorDetailsTextBox = new TextBox ++ { ++ Visibility = Visibility.Collapsed, ++ IsReadOnly = true, ++ IsTabStop = true, ++ AcceptsReturn = true, ++ TextWrapping = TextWrapping.Wrap, ++ MinHeight = 80, ++ MaxHeight = 240, ++ VerticalAlignment = VerticalAlignment.Stretch ++ }; ++ ScrollViewer.SetVerticalScrollBarVisibility(_errorDetailsTextBox, ScrollBarVisibility.Auto); ++ Grid.SetRow(_errorDetailsTextBox, 2); ++ root.Children.Add(_errorDetailsTextBox); ++ + var buttonPanel = new StackPanel + { + Orientation = Orientation.Horizontal, +@@ -94,13 +115,6 @@ public QuickSendDialog(OpenClawGatewayClient client, string? 
prefillMessage = nu + HorizontalAlignment = HorizontalAlignment.Right + }; + +- _statusText = new TextBlock +- { +- VerticalAlignment = VerticalAlignment.Center, +- Margin = new Thickness(0, 0, 12, 0) +- }; +- buttonPanel.Children.Add(_statusText); +- + var cancelButton = new Button { Content = LocalizationHelper.GetString("QuickSend_CancelButton") }; + cancelButton.Click += (s, e) => Close(); + buttonPanel.Children.Add(cancelButton); +@@ -113,15 +127,20 @@ public QuickSendDialog(OpenClawGatewayClient client, string? prefillMessage = nu + _sendButton.Click += OnSendClick; + buttonPanel.Children.Add(_sendButton); + ++ Grid.SetRow(buttonPanel, 3); + root.Children.Add(buttonPanel); + +- Content = root; ++ Content = new Border ++ { ++ Padding = new Thickness(24), ++ Child = root ++ }; + + // Focus the text box when shown + Activated += (s, e) => + { +- _messageTextBox.Focus(FocusState.Programmatic); + TryBringToFront(); ++ RequestInputFocus(); + }; + + Closed += (s, e) => Logger.Info("[QuickSend] Dialog closed"); +@@ -170,13 +189,22 @@ private async Task SendMessageAsync() + var message = _messageTextBox.Text?.Trim(); + if (string.IsNullOrEmpty(message)) return; + ++ _errorDetailsTextBox.Visibility = Visibility.Collapsed; ++ _errorDetailsTextBox.Text = string.Empty; ++ this.SetWindowSize(420, 260); ++ + _isSending = true; + _sendButton.IsEnabled = false; + _messageTextBox.IsEnabled = false; +- _statusText.Text = LocalizationHelper.GetString("QuickSend_Sending"); ++ ShowDetails(LocalizationHelper.GetString("QuickSend_Sending")); + + try + { ++ if (!await EnsureGatewayConnectedAsync()) ++ { ++ throw new InvalidOperationException("Gateway connection is not open"); ++ } ++ + await _client.SendChatMessageAsync(message); + Logger.Info($"[QuickSend] Message sent ({message.Length} chars)"); + new ToastContentBuilder() +@@ -188,15 +216,160 @@ private async Task SendMessageAsync() + catch (Exception ex) + { + Logger.Error($"Quick send failed: {ex.Message}"); +- _statusText.Text 
= LocalizationHelper.GetString("QuickSend_Failed"); ++ if (IsPairingRequired(ex.Message)) ++ { ++ var commands = _client.BuildPairingApprovalFixCommands(); ++ CopyTextToClipboard(commands); ++ ++ ShowErrorDetails($"Pairing approval required\n\n{commands}"); ++ new ToastContentBuilder() ++ .AddText("Quick Send device approval required") ++ .AddText("Gateway reported pairing required. Approval guidance copied to clipboard.") ++ .Show(); ++ Logger.Warn($"[QuickSend] Pairing required. Commands copied to clipboard.\n{commands}"); ++ } ++ else if (TryExtractMissingScope(ex.Message, out var missingScope)) ++ { ++ var commands = _client.BuildMissingScopeFixCommands(missingScope); ++ CopyTextToClipboard(commands); ++ ++ ShowErrorDetails($"Missing scope: {missingScope}\n\n{commands}"); ++ new ToastContentBuilder() ++ .AddText("Quick Send permission required") ++ .AddText($"Missing scope '{missingScope}'. Identity + remediation guidance copied to clipboard.") ++ .Show(); ++ Logger.Warn($"[QuickSend] Missing scope '{missingScope}'. Commands copied to clipboard.\n{commands}"); ++ } ++ else ++ { ++ ShowErrorDetails(ex.Message); ++ } ++ + _sendButton.IsEnabled = true; + _messageTextBox.IsEnabled = true; + _isSending = false; + } + } + ++ private void ShowErrorDetails(string details) ++ { ++ _errorDetailsTextBox.Header = LocalizationHelper.GetString("QuickSend_Failed"); ++ _errorDetailsTextBox.MinHeight = 140; ++ _errorDetailsTextBox.Text = details; ++ _errorDetailsTextBox.Visibility = Visibility.Visible; ++ this.SetWindowSize(520, 400); ++ ++ // Move focus to the details box so users can immediately select/copy text. 
++ _errorDetailsTextBox.Focus(FocusState.Programmatic); ++ } ++ ++ private void ShowDetails(string details) ++ { ++ _errorDetailsTextBox.Header = null; ++ _errorDetailsTextBox.MinHeight = 80; ++ _errorDetailsTextBox.Text = details; ++ _errorDetailsTextBox.Visibility = Visibility.Visible; ++ this.SetWindowSize(500, 320); ++ } ++ ++ private static bool TryExtractMissingScope(string? message, out string scope) ++ { ++ scope = string.Empty; ++ if (string.IsNullOrWhiteSpace(message)) ++ { ++ return false; ++ } ++ ++ var match = Regex.Match(message, @"missing\s+scope\s*:\s*([A-Za-z0-9._-]+)", RegexOptions.IgnoreCase); ++ if (!match.Success) ++ { ++ return false; ++ } ++ ++ scope = match.Groups[1].Value; ++ return !string.IsNullOrWhiteSpace(scope); ++ } ++ ++ private static bool IsPairingRequired(string? message) ++ { ++ if (string.IsNullOrWhiteSpace(message)) ++ { ++ return false; ++ } ++ ++ return message.Contains("pairing required", StringComparison.OrdinalIgnoreCase) ++ || message.Contains("not paired", StringComparison.OrdinalIgnoreCase) ++ || message.Contains("NOT_PAIRED", StringComparison.OrdinalIgnoreCase); ++ } ++ ++ private static void CopyTextToClipboard(string text) ++ { ++ var data = new global::Windows.ApplicationModel.DataTransfer.DataPackage(); ++ data.SetText(text); ++ global::Windows.ApplicationModel.DataTransfer.Clipboard.SetContent(data); ++ } ++ ++ private void QueueFocusMessageInput() ++ { ++ DispatcherQueue?.TryEnqueue(FocusMessageInput); ++ } ++ ++ private void RequestInputFocus() ++ { ++ QueueFocusMessageInput(); ++ _ = RetryFocusMessageInputAsync(); ++ } ++ ++ private async Task RetryFocusMessageInputAsync() ++ { ++ var delaysMs = new[] { 60, 160, 320 }; ++ foreach (var delay in delaysMs) ++ { ++ await Task.Delay(delay); ++ TryBringToFront(); ++ QueueFocusMessageInput(); ++ } ++ } ++ ++ private async Task EnsureGatewayConnectedAsync(int timeoutMs = 3000) ++ { ++ if (_client.IsConnectedToGateway) ++ { ++ return true; ++ } ++ ++ try ++ { ++ await 
_client.ConnectAsync(); ++ } ++ catch ++ { ++ // Connect errors are handled by the send flow. ++ } ++ ++ var started = Environment.TickCount64; ++ while (Environment.TickCount64 - started < timeoutMs) ++ { ++ if (_client.IsConnectedToGateway) ++ { ++ return true; ++ } ++ ++ await Task.Delay(120); ++ } ++ ++ return _client.IsConnectedToGateway; ++ } ++ ++ public void FocusMessageInput() ++ { ++ _messageTextBox.Focus(FocusState.Programmatic); ++ _messageTextBox.SelectionStart = _messageTextBox.Text?.Length ?? 0; ++ } ++ + public new void ShowAsync() + { + Activate(); ++ RequestInputFocus(); + } + } +diff --git a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs +index 5347a6b..f89e513 100644 +--- a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs ++++ b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs +@@ -51,6 +51,7 @@ public class SettingsManager + // Node mode (enables Windows as a node, not just operator) + public bool EnableNodeMode { get; set; } = false; + public bool HasSeenActivityStreamTip { get; set; } = false; ++ public string SkippedUpdateTag { get; set; } = ""; + + public SettingsManager() + { +@@ -88,6 +89,7 @@ public void Load() + NotifyInfo = loaded.NotifyInfo; + EnableNodeMode = loaded.EnableNodeMode; + HasSeenActivityStreamTip = loaded.HasSeenActivityStreamTip; ++ SkippedUpdateTag = loaded.SkippedUpdateTag ?? SkippedUpdateTag; + NotifyChatResponses = loaded.NotifyChatResponses; + PreferStructuredCategories = loaded.PreferStructuredCategories; + if (loaded.UserRules != null) +@@ -130,6 +132,7 @@ public void Save() + NotifyInfo = NotifyInfo, + EnableNodeMode = EnableNodeMode, + HasSeenActivityStreamTip = HasSeenActivityStreamTip, ++ SkippedUpdateTag = string.IsNullOrWhiteSpace(SkippedUpdateTag) ? 
null : SkippedUpdateTag, + NotifyChatResponses = NotifyChatResponses, + PreferStructuredCategories = PreferStructuredCategories, + UserRules = UserRules +diff --git a/src/OpenClaw.Tray.WinUI/Services/SshTunnelService.cs b/src/OpenClaw.Tray.WinUI/Services/SshTunnelService.cs +index 18b7764..556ec71 100644 +--- a/src/OpenClaw.Tray.WinUI/Services/SshTunnelService.cs ++++ b/src/OpenClaw.Tray.WinUI/Services/SshTunnelService.cs +@@ -13,6 +13,7 @@ public sealed class SshTunnelService : IDisposable + private readonly IOpenClawLogger _logger; + private Process? _process; + private string? _lastSpec; ++ private bool _stopping; + + public SshTunnelService(IOpenClawLogger logger) + { +@@ -60,6 +61,9 @@ public void Stop() + return; + } + ++ _stopping = true; ++ _logger.Info("Stopping SSH tunnel process"); ++ + try + { + if (!_process.HasExited) +@@ -77,6 +81,7 @@ public void Stop() + try { _process.Dispose(); } catch { } + _process = null; + _lastSpec = null; ++ _stopping = false; + } + } + +@@ -117,7 +122,14 @@ private void StartProcess(string user, string host, int remotePort, int localPor + process.Exited += (_, _) => + { + var exitCode = process.ExitCode; +- _logger.Warn($"SSH tunnel exited (code {exitCode})"); ++ if (_stopping) ++ { ++ _logger.Info($"SSH tunnel exited during shutdown (code {exitCode})"); ++ } ++ else ++ { ++ _logger.Warn($"SSH tunnel exited unexpectedly (code {exitCode})"); ++ } + }; + + try +diff --git a/tests/OpenClaw.Shared.Tests/DeviceIdentityTests.cs b/tests/OpenClaw.Shared.Tests/DeviceIdentityTests.cs +index bf5a59f..654830d 100644 +--- a/tests/OpenClaw.Shared.Tests/DeviceIdentityTests.cs ++++ b/tests/OpenClaw.Shared.Tests/DeviceIdentityTests.cs +@@ -120,6 +120,66 @@ public void BuildDebugPayload_HasCorrectFormat() + finally { Directory.Delete(dir, true); } + } + ++ [IntegrationFact] ++ public void BuildConnectPayloadV3_HasCorrectFormat() ++ { ++ var dir = CreateTempDir(); ++ try ++ { ++ var identity = new DeviceIdentity(dir); ++ 
identity.Initialize(); ++ ++ var payload = identity.BuildConnectPayloadV3( ++ nonce: "challenge-nonce", ++ signedAtMs: 1711648000000, ++ clientId: "cli", ++ clientMode: "cli", ++ role: "operator", ++ scopes: new[] { "operator.admin", "operator.read", "operator.write" }, ++ authToken: "mytoken123", ++ platform: "windows", ++ deviceFamily: "desktop"); ++ ++ Assert.StartsWith("v3|", payload); ++ Assert.Contains(identity.DeviceId, payload); ++ Assert.Contains("|cli|cli|operator|operator.admin,operator.read,operator.write|", payload); ++ Assert.Contains("|1711648000000|mytoken123|challenge-nonce|windows|desktop", payload); ++ ++ var parts = payload.Split('|'); ++ Assert.Equal(11, parts.Length); ++ } ++ finally { Directory.Delete(dir, true); } ++ } ++ ++ [IntegrationFact] ++ public void BuildConnectPayloadV2_HasCorrectFormat() ++ { ++ var dir = CreateTempDir(); ++ try ++ { ++ var identity = new DeviceIdentity(dir); ++ identity.Initialize(); ++ ++ var payload = identity.BuildConnectPayloadV2( ++ nonce: "challenge-nonce", ++ signedAtMs: 1711648000000, ++ clientId: "cli", ++ clientMode: "cli", ++ role: "operator", ++ scopes: new[] { "operator.admin", "operator.read", "operator.write" }, ++ authToken: "mytoken123"); ++ ++ Assert.StartsWith("v2|", payload); ++ Assert.Contains(identity.DeviceId, payload); ++ Assert.Contains("|cli|cli|operator|operator.admin,operator.read,operator.write|", payload); ++ Assert.Contains("|1711648000000|mytoken123|challenge-nonce", payload); ++ ++ var parts = payload.Split('|'); ++ Assert.Equal(9, parts.Length); ++ } ++ finally { Directory.Delete(dir, true); } ++ } ++ + [IntegrationFact] + public void StoreDeviceToken_PersistsAcrossReload() + { +diff --git a/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs b/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs +index 424182d..a364231 100644 +--- a/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs ++++ b/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs +@@ -61,6 
+61,24 @@ public string TruncateLabel(string text, int maxLen = 60) + return (string)result!; + } + ++ public Task RegisterPendingChatSend(string requestId) ++ { ++ var completion = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); ++ var method = typeof(OpenClawGatewayClient).GetMethod( ++ "TrackPendingChatSend", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); ++ method!.Invoke(_client, new object[] { requestId, completion }); ++ return completion.Task; ++ } ++ ++ public void ProcessRawMessage(string json) ++ { ++ var method = typeof(OpenClawGatewayClient).GetMethod( ++ "ProcessMessage", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); ++ method!.Invoke(_client, new object[] { json }); ++ } ++ + public SessionInfo[] GetSessionList() + { + return _client.GetSessionList(); +@@ -140,6 +158,26 @@ public GatewayNodeInfo[] ParseNodeListPayload(string payloadJson) + return parsed; + } + ++ public string? ParseHandshakeMainSessionKey(string payloadJson) ++ { ++ using var doc = JsonDocument.Parse(payloadJson); ++ var method = typeof(OpenClawGatewayClient).GetMethod( ++ "TryGetHandshakeMainSessionKey", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); ++ var result = method!.Invoke(null, new object[] { doc.RootElement.Clone() }); ++ return result as string; ++ } ++ ++ public string? ParseHandshakeDeviceToken(string payloadJson) ++ { ++ using var doc = JsonDocument.Parse(payloadJson); ++ var method = typeof(OpenClawGatewayClient).GetMethod( ++ "TryGetHandshakeDeviceToken", ++ System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); ++ var result = method!.Invoke(null, new object[] { doc.RootElement.Clone() }); ++ return result as string; ++ } ++ + public (ChannelHealth[] channels, bool eventFired) ParseChannelHealthPayload(string payloadJson) + { + ChannelHealth[]? 
parsed = null; +@@ -343,6 +381,109 @@ public void ClassifyTool_MapsEdit() + Assert.Equal(ActivityKind.Edit, helper.ClassifyTool("edit")); + } + ++ [Fact] ++ public async Task PendingChatSend_CompletesOnSuccessfulResponse() ++ { ++ var helper = new GatewayClientTestHelper(); ++ var task = helper.RegisterPendingChatSend("chat-1"); ++ ++ helper.ProcessRawMessage(""" ++ { ++ "type": "res", ++ "id": "chat-1", ++ "ok": true, ++ "payload": { "accepted": true } ++ } ++ """); ++ ++ Assert.True(await task); ++ } ++ ++ [Fact] ++ public async Task PendingChatSend_FailsOnErrorResponse() ++ { ++ var helper = new GatewayClientTestHelper(); ++ var task = helper.RegisterPendingChatSend("chat-2"); ++ ++ helper.ProcessRawMessage(""" ++ { ++ "type": "res", ++ "id": "chat-2", ++ "ok": false, ++ "error": "missing scope: operator.write" ++ } ++ """); ++ ++ var ex = await Assert.ThrowsAsync(async () => await task); ++ Assert.Contains("operator.write", ex.Message, StringComparison.OrdinalIgnoreCase); ++ } ++ ++ [Fact] ++ public void ParseHandshakeMainSessionKey_ReturnsMainKey_WhenPresent() ++ { ++ var helper = new GatewayClientTestHelper(); ++ var key = helper.ParseHandshakeMainSessionKey(""" ++ { ++ "type": "hello-ok", ++ "snapshot": { ++ "sessionDefaults": { ++ "mainKey": "agent:main:123" ++ } ++ } ++ } ++ """); ++ ++ Assert.Equal("agent:main:123", key); ++ } ++ ++ [Fact] ++ public void ParseHandshakeMainSessionKey_ReturnsNull_WhenMissing() ++ { ++ var helper = new GatewayClientTestHelper(); ++ var key = helper.ParseHandshakeMainSessionKey(""" ++ { ++ "type": "hello-ok", ++ "snapshot": { ++ "sessionDefaults": { ++ } ++ } ++ } ++ """); ++ ++ Assert.Null(key); ++ } ++ ++ [Fact] ++ public void ParseHandshakeDeviceToken_ReturnsValue_WhenPresent() ++ { ++ var helper = new GatewayClientTestHelper(); ++ var token = helper.ParseHandshakeDeviceToken(""" ++ { ++ "type": "hello-ok", ++ "auth": { ++ "deviceToken": "device-token-123" ++ } ++ } ++ """); ++ ++ Assert.Equal("device-token-123", token); 
++ } ++ ++ [Fact] ++ public void ParseHandshakeDeviceToken_ReturnsNull_WhenMissing() ++ { ++ var helper = new GatewayClientTestHelper(); ++ var token = helper.ParseHandshakeDeviceToken(""" ++ { ++ "type": "hello-ok", ++ "auth": { ++ } ++ } ++ """); ++ ++ Assert.Null(token); ++ } ++ + [Fact] + public void ClassifyTool_MapsWebSearch() + { +diff --git a/tests/OpenClaw.Shared.Tests/WebSocketClientBaseTests.cs b/tests/OpenClaw.Shared.Tests/WebSocketClientBaseTests.cs +index c5106db..fbb8b81 100644 +--- a/tests/OpenClaw.Shared.Tests/WebSocketClientBaseTests.cs ++++ b/tests/OpenClaw.Shared.Tests/WebSocketClientBaseTests.cs +@@ -225,11 +225,11 @@ public async Task ConnectAsync_RaisesStatusChangedConnecting() + var statuses = new List(); + client.StatusChanged += (_, s) => statuses.Add(s); + +- // ConnectAsync will fail (no real server) but should still fire Connecting then Error ++ // ConnectAsync should always emit Connecting. ++ // Depending on timing/shutdown races, it may then emit Error or be canceled. 
+ await client.ConnectAsync(); + + Assert.Contains(ConnectionStatus.Connecting, statuses); +- Assert.Contains(ConnectionStatus.Error, statuses); + client.Dispose(); + } + } +diff --git a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +index 887df5b..523fc37 100644 +--- a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs ++++ b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +@@ -31,6 +31,7 @@ public void RoundTrip_AllFields_Preserved() + NotifyInfo = true, + EnableNodeMode = true, + HasSeenActivityStreamTip = true, ++ SkippedUpdateTag = "v1.2.3", + NotifyChatResponses = false, + PreferStructuredCategories = true, + UserRules = new List +@@ -64,6 +65,7 @@ public void RoundTrip_AllFields_Preserved() + Assert.Equal(original.NotifyInfo, restored.NotifyInfo); + Assert.Equal(original.EnableNodeMode, restored.EnableNodeMode); + Assert.Equal(original.HasSeenActivityStreamTip, restored.HasSeenActivityStreamTip); ++ Assert.Equal(original.SkippedUpdateTag, restored.SkippedUpdateTag); + Assert.Equal(original.NotifyChatResponses, restored.NotifyChatResponses); + Assert.Equal(original.PreferStructuredCategories, restored.PreferStructuredCategories); + Assert.NotNull(restored.UserRules); +@@ -114,6 +116,7 @@ public void MissingFields_UseDefaults() + Assert.True(settings.NotifyInfo); + Assert.False(settings.EnableNodeMode); + Assert.False(settings.HasSeenActivityStreamTip); ++ Assert.Null(settings.SkippedUpdateTag); + Assert.True(settings.NotifyChatResponses); + Assert.True(settings.PreferStructuredCategories); + Assert.Null(settings.UserRules); +@@ -156,6 +159,7 @@ public void BackwardCompatibility_OldSettingsWithoutNewFields() + Assert.True(settings.PreferStructuredCategories); + Assert.False(settings.EnableNodeMode); + Assert.False(settings.HasSeenActivityStreamTip); ++ Assert.Null(settings.SkippedUpdateTag); + Assert.True(settings.GlobalHotkeyEnabled); + Assert.Null(settings.UserRules); + } + +From 
94fb82e3d5a0624686365b26cfaea99bee937cb3 Mon Sep 17 00:00:00 2001 +From: sytone +Date: Sat, 28 Mar 2026 18:32:12 -0700 +Subject: [PATCH 3/3] feat: Implement BringToFront method to manage window + focus and visibility + +--- + .../Services/NodeService.cs | 1 + + .../Windows/CanvasWindow.xaml.cs | 45 +++++++++++++++++++ + 2 files changed, 46 insertions(+) + +diff --git a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs +index 1bd3883..731359f 100644 +--- a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs ++++ b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs +@@ -194,6 +194,7 @@ private void OnCanvasPresent(object? sender, CanvasPresentArgs args) + + // Show window + _canvasWindow.Activate(); ++ _canvasWindow.BringToFront(args.AlwaysOnTop); + + _logger.Info($"Canvas presented: {args.Width}x{args.Height}"); + } +diff --git a/src/OpenClaw.Tray.WinUI/Windows/CanvasWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/CanvasWindow.xaml.cs +index 8ba163d..5f4a4f8 100644 +--- a/src/OpenClaw.Tray.WinUI/Windows/CanvasWindow.xaml.cs ++++ b/src/OpenClaw.Tray.WinUI/Windows/CanvasWindow.xaml.cs +@@ -2,6 +2,7 @@ + using System.IO; + using System.Text.RegularExpressions; + using System.Threading.Tasks; ++using System.Runtime.InteropServices; + using Microsoft.UI.Xaml; + using Microsoft.Web.WebView2.Core; + using OpenClawTray.Helpers; +@@ -16,6 +17,22 @@ namespace OpenClawTray.Windows; + /// + public sealed partial class CanvasWindow : WindowEx + { ++ [DllImport("user32.dll")] ++ private static extern bool SetForegroundWindow(IntPtr hWnd); ++ ++ [DllImport("user32.dll")] ++ private static extern bool ShowWindow(IntPtr hWnd, int nCmdShow); ++ ++ [DllImport("user32.dll")] ++ private static extern bool SetWindowPos(IntPtr hWnd, IntPtr hWndInsertAfter, int X, int Y, int cx, int cy, uint uFlags); ++ ++ private static readonly IntPtr HWND_TOPMOST = new(-1); ++ private static readonly IntPtr HWND_NOTOPMOST = new(-2); ++ private const int 
SW_SHOWNORMAL = 1; ++ private const uint SWP_NOMOVE = 0x0002; ++ private const uint SWP_NOSIZE = 0x0001; ++ private const uint SWP_SHOWWINDOW = 0x0040; ++ + private bool _isWebViewInitialized; + private string? _pendingUrl; + private string? _pendingHtml; +@@ -331,6 +348,34 @@ public void SetAlwaysOnTop(bool alwaysOnTop) + { + this.IsAlwaysOnTop = alwaysOnTop; + } ++ ++ /// ++ /// Force the window to the front so canvas content is visible immediately. ++ /// ++ public void BringToFront(bool keepTopMost) ++ { ++ try ++ { ++ var hwnd = WinRT.Interop.WindowNative.GetWindowHandle(this); ++ if (hwnd == IntPtr.Zero) ++ { ++ return; ++ } ++ ++ ShowWindow(hwnd, SW_SHOWNORMAL); ++ SetWindowPos(hwnd, HWND_TOPMOST, 0, 0, 0, 0, SWP_NOMOVE | SWP_NOSIZE | SWP_SHOWWINDOW); ++ SetForegroundWindow(hwnd); ++ ++ if (!keepTopMost) ++ { ++ SetWindowPos(hwnd, HWND_NOTOPMOST, 0, 0, 0, 0, SWP_NOMOVE | SWP_NOSIZE | SWP_SHOWWINDOW); ++ } ++ } ++ catch ++ { ++ // Best-effort focus behavior only. ++ } ++ } + + public async Task EnsureA2UIHostAsync(string url) + { diff --git a/pr120_full.diff b/pr120_full.diff new file mode 100644 index 0000000..3fba36c --- /dev/null +++ b/pr120_full.diff @@ -0,0 +1,33200 @@ +From be624fe4528580ad8ec89e9b92a16a9e16fc408e Mon Sep 17 00:00:00 2001 +From: Nich Overend +Date: Sat, 21 Mar 2026 16:32:48 +0000 +Subject: [PATCH 01/83] Add Windows voice mode foundation and AlwaysOn runtime + +--- + docs/VOICE-MODE.md | 371 ++++++ + .../Capabilities/VoiceCapability.cs | 174 +++ + src/OpenClaw.Shared/SettingsData.cs | 1 + + src/OpenClaw.Shared/VoiceModeSchema.cs | 144 +++ + src/OpenClaw.Tray.WinUI/App.xaml.cs | 56 +- + .../Services/NodeService.cs | 82 +- + .../Services/SettingsManager.cs | 5 +- + .../Services/VoiceProviderCatalogService.cs | 155 +++ + .../Services/VoiceService.cs | 1040 +++++++++++++++++ + .../Windows/VoiceModeWindow.xaml | 92 ++ + .../Windows/VoiceModeWindow.xaml.cs | 330 ++++++ + .../OpenClaw.Shared.Tests/CapabilityTests.cs | 202 ++++ + 
.../VoiceModeSchemaTests.cs | 77 ++ + .../SettingsRoundTripTests.cs | 52 + + 14 files changed, 2777 insertions(+), 4 deletions(-) + create mode 100644 docs/VOICE-MODE.md + create mode 100644 src/OpenClaw.Shared/Capabilities/VoiceCapability.cs + create mode 100644 src/OpenClaw.Shared/VoiceModeSchema.cs + create mode 100644 src/OpenClaw.Tray.WinUI/Services/VoiceProviderCatalogService.cs + create mode 100644 src/OpenClaw.Tray.WinUI/Services/VoiceService.cs + create mode 100644 src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml + create mode 100644 src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml.cs + create mode 100644 tests/OpenClaw.Shared.Tests/VoiceModeSchemaTests.cs + +diff --git a/docs/VOICE-MODE.md b/docs/VOICE-MODE.md +new file mode 100644 +index 0000000..664be7f +--- /dev/null ++++ b/docs/VOICE-MODE.md +@@ -0,0 +1,371 @@ ++# Voice Mode Architecture ++ ++This document defines the voice subsystem for the Windows node only. It introduces the command surface, persisted settings schema, and minimum runtime boundaries needed to add Windows voice support without reshaping the existing node architecture. 
++ ++## Goals ++ ++- Add a node-local voice mode with two activation modes: `wakeword` and `alwaysOn` ++- Use NanoWakeWord for wakeword detection on-device ++- Provide parity targets with the macOS app: ++ - `WakeWord` maps to Voice Wake ++ - `AlwaysOn` maps to Talk Mode ++- Keep STT/TTS provider selection configurable, with Windows implementations as the default built-ins ++- Keep provider-specific STT/TTS concerns separate from the Windows node by default ++- Reuse the existing node capability pattern instead of introducing a parallel control path ++ ++## Non-Goals ++ ++- True full-duplex or chunk-streaming audio transport between node and gateway ++- Provider-specific STT/TTS routing in the Windows node ++- Changes to unrelated project documentation ++ ++## Design Position ++ ++The Windows node should own device-local audio concerns: ++ ++- microphone capture ++- wakeword detection ++- silence detection / utterance segmentation ++- speaker playback ++- device enumeration and persisted local settings ++ ++OpenClaw remains responsible for conversation/session routing and upstream voice orchestration. ++ ++This keeps the Windows node lean for the first implementation and avoids introducing provider-routing settings before they are needed. ++ ++## macOS Parity Mapping ++ ++Windows voice mode aims for functional parity with the existing macOS voice surfaces: ++ ++| Windows Mode | macOS Equivalent | Behavior | ++|---|---|---| ++| `WakeWord` | Voice Wake | passively listen for a trigger phrase, capture one utterance, then submit after end silence | ++| `AlwaysOn` | Talk Mode | continuous listen -> think -> speak loop with barge-in support, while still remaining turn-based rather than true simultaneous duplex audio | ++ ++For v1 on Windows, `AlwaysOn` is Talk Mode parity, not a new full-duplex transport. ++The current implementation is still turn-based: listen, send transcript, wait, speak, resume listening. 
++ ++## Transport Boundary ++ ++For macOS parity, `AlwaysOn` should follow Talk Mode's documented control flow: ++ ++- the node captures audio locally ++- local speech recognition turns that audio into transcript text ++- the transcript is sent to OpenClaw via `chat.send` on the main session ++- OpenClaw returns the assistant reply as normal chat output ++- the node performs local TTS playback of that reply ++ ++That means the first Windows parity target is transcript transport, not raw audio upload. Streaming audio frames in or out of OpenClaw remains a future protocol extension, not part of this design. ++ ++The current Windows implementation uses a voice-local operator connection inside the tray app while node mode is active. That sidecar connection exists only to carry `chat.send` and assistant chat events for `AlwaysOn`. ++ ++## Provider Selection ++ ++Voice settings now carry explicit provider ids for both STT and TTS: ++ ++- `Voice.SpeechToTextProviderId` ++- `Voice.TextToSpeechProviderId` ++ ++The built-in default for both is `windows`. ++ ++Runtime behavior in the current phase: ++ ++- `windows` is implemented for both STT and TTS ++- non-Windows providers can be selected and persisted now ++- unsupported providers fall back to Windows at runtime with a status warning ++ ++### Local Provider Catalog ++ ++Additional provider entries are supplied through a local catalog file: ++ ++- `%APPDATA%\\OpenClawTray\\voice-providers.json` ++ ++Example: ++ ++```json ++{ ++ "speechToTextProviders": [ ++ { ++ "id": "windows", ++ "name": "Windows Speech Recognition", ++ "runtime": "windows", ++ "enabled": true, ++ "description": "Built-in Windows dictation and speech recognition." ++ }, ++ { ++ "id": "minimax", ++ "name": "MiniMax Speech To Text", ++ "runtime": "gateway", ++ "enabled": true, ++ "description": "Planned future provider." 
++ } ++ ], ++ "textToSpeechProviders": [ ++ { ++ "id": "windows", ++ "name": "Windows Speech Synthesis", ++ "runtime": "windows", ++ "enabled": true, ++ "description": "Built-in Windows text-to-speech playback." ++ }, ++ { ++ "id": "elevenlabs", ++ "name": "ElevenLabs", ++ "runtime": "gateway", ++ "enabled": true, ++ "description": "Planned future provider." ++ } ++ ] ++} ++``` ++ ++This file only defines selectable providers. It does not carry API keys. ++ ++### OpenClaw Configuration Discovery ++ ++It may be technically possible to inspect parts of the OpenClaw configuration surface to infer preferred providers. However, the documented config protocol notes that sensitive fields have no redaction layer, so automatically pulling provider credentials into the Windows tray is not a safe default. ++ ++Because of that, this design keeps provider selection local for now: ++ ++- local tray settings choose the preferred STT/TTS provider ids ++- OpenClaw remains the conversation endpoint for `chat.send` ++- future provider adapters can decide whether they use local credentials, gateway-owned credentials, or both ++ ++For `WakeWord`, trigger words are gateway-owned global state. The Windows node should eventually consume the same shared trigger list and keep only a local enabled/disabled toggle plus device/runtime settings. ++ ++## Command Surface ++ ++The voice subsystem is introduced as a new node capability category: `voice`. 
++ ++### Commands ++ ++| Command | Purpose | Request Payload | Response Payload | ++|---|---|---|---| ++| `voice.devices.list` | Enumerate input/output audio devices | none | `VoiceAudioDeviceInfo[]` | ++| `voice.settings.get` | Return the effective voice configuration | none | `VoiceSettings` | ++| `voice.settings.set` | Update the voice configuration | `VoiceSettingsUpdateArgs` | `VoiceSettings` | ++| `voice.status.get` | Return runtime voice status | none | `VoiceStatusInfo` | ++| `voice.start` | Start the voice runtime with the supplied or persisted mode | `VoiceStartArgs` | `VoiceStatusInfo` | ++| `voice.stop` | Stop the voice runtime | `VoiceStopArgs` | `VoiceStatusInfo` | ++ ++### Payload Types ++ ++- `VoiceSettings` ++- `VoiceWakeWordSettings` ++- `VoiceAlwaysOnSettings` ++- `VoiceAudioDeviceInfo` ++- `VoiceStatusInfo` ++- `VoiceStartArgs` ++- `VoiceStopArgs` ++- `VoiceSettingsUpdateArgs` ++ ++These contracts are defined in [VoiceModeSchema.cs](C:/dev/openclaw-windows-node/src/OpenClaw.Shared/VoiceModeSchema.cs). ++ ++## Settings Schema ++ ++Voice settings are persisted as `SettingsData.Voice` in [SettingsData.cs](C:/dev/openclaw-windows-node/src/OpenClaw.Shared/SettingsData.cs). 
++ ++### Effective Schema ++ ++```json ++{ ++ "Voice": { ++ "Mode": "WakeWord", ++ "Enabled": true, ++ "SpeechToTextProviderId": "windows", ++ "TextToSpeechProviderId": "windows", ++ "InputDeviceId": "default-mic", ++ "OutputDeviceId": "default-speaker", ++ "SampleRateHz": 16000, ++ "CaptureChunkMs": 80, ++ "BargeInEnabled": true, ++ "WakeWord": { ++ "Engine": "NanoWakeWord", ++ "ModelId": "hey_openclaw", ++ "TriggerThreshold": 0.65, ++ "TriggerCooldownMs": 2000, ++ "PreRollMs": 1200, ++ "EndSilenceMs": 900 ++ }, ++ "AlwaysOn": { ++ "MinSpeechMs": 250, ++ "EndSilenceMs": 900, ++ "MaxUtteranceMs": 15000, ++ "AutoSubmit": true ++ } ++ } ++} ++``` ++ ++### Field Rationale ++ ++| Field | Purpose | ++|---|---| ++| `Mode` | Top-level activation mode: `Off`, `WakeWord`, `AlwaysOn` | ++| `Enabled` | Global feature kill-switch independent of mode | ++| `SpeechToTextProviderId` | Selected STT provider id from the local provider catalog | ++| `TextToSpeechProviderId` | Selected TTS provider id from the local provider catalog | ++| `InputDeviceId` / `OutputDeviceId` | Stable audio device binding | ++| `SampleRateHz` | Shared capture sample rate, fixed to a speech-friendly default | ++| `CaptureChunkMs` | Frame size for capture, VAD, and wakeword processing | ++| `BargeInEnabled` | Allows microphone capture while audio playback is active | ++| `WakeWord.*` | NanoWakeWord and post-trigger utterance capture tuning | ++| `AlwaysOn.*` | Continuous-listening segmentation tuning | ++ ++### Complete Settings Definition ++ ++| Setting | Type | Default | Applies To | Meaning | ++|---|---|---|---|---| ++| `Voice.Mode` | enum | `Off` | all | Activation mode: `Off`, `WakeWord`, `AlwaysOn` | ++| `Voice.Enabled` | bool | `false` | all | Master enable/disable flag for voice mode | ++| `Voice.SpeechToTextProviderId` | string | `windows` | all | Preferred speech-to-text provider id | ++| `Voice.TextToSpeechProviderId` | string | `windows` | all | Preferred text-to-speech provider id | ++| 
`Voice.InputDeviceId` | string? | `null` | all | Preferred microphone device id; `null` means system default | ++| `Voice.OutputDeviceId` | string? | `null` | all | Preferred speaker device id; `null` means system default | ++| `Voice.SampleRateHz` | int | `16000` | all | Internal capture rate used for wakeword, VAD, and utterance assembly | ++| `Voice.CaptureChunkMs` | int | `80` | all | Audio frame duration used by the capture loop | ++| `Voice.BargeInEnabled` | bool | `true` | all | If `true`, microphone capture may continue while response audio is playing | ++| `Voice.WakeWord.Engine` | string | `NanoWakeWord` | wakeword | Wakeword engine identifier | ++| `Voice.WakeWord.ModelId` | string | `hey_openclaw` | wakeword | Wakeword model/profile identifier | ++| `Voice.WakeWord.TriggerThreshold` | float | `0.65` | wakeword | Minimum score required to trigger wakeword activation | ++| `Voice.WakeWord.TriggerCooldownMs` | int | `2000` | wakeword | Minimum delay before another wakeword trigger is accepted | ++| `Voice.WakeWord.PreRollMs` | int | `1200` | wakeword | Buffered audio retained before the trigger point | ++| `Voice.WakeWord.EndSilenceMs` | int | `900` | wakeword | Silence timeout used to finalize the post-trigger utterance | ++| `Voice.AlwaysOn.MinSpeechMs` | int | `250` | always-on | Minimum detected speech duration before an utterance is treated as real input | ++| `Voice.AlwaysOn.EndSilenceMs` | int | `900` | always-on | Silence timeout used to finalize an utterance | ++| `Voice.AlwaysOn.MaxUtteranceMs` | int | `15000` | always-on | Hard cap on utterance length before forced submission/finalization | ++| `Voice.AlwaysOn.AutoSubmit` | bool | `true` | always-on | If `true`, completed utterances are submitted immediately without extra confirmation | ++ ++At runtime today, those device ids are persisted and surfaced in the UI, but the v1 `AlwaysOn` path still uses the Windows system speech stack defaults for capture and playback. 
++ ++## Component Architecture ++ ++```mermaid ++flowchart LR ++ A["NodeService
control + lifecycle"] --> B["VoiceCapability
command surface"] ++ B --> C["VoiceCoordinator
runtime state machine"] ++ C --> D["SpeechRecognizer
Windows continuous dictation"] ++ C --> E["WakeWordService
NanoWakeWord scores"] ++ C --> F["VoiceActivityDetector
speech/silence segments"] ++ C --> G["VoiceTransport
operator sidecar + chat.send exchange"] ++ C --> H["SpeechSynthesizer + MediaPlayer
reply playback"] ++ B --> I["SettingsManager / SettingsData.Voice
persisted config JSON"] ++``` ++ ++## Runtime Data Flow ++ ++### Wakeword Mode ++ ++```mermaid ++flowchart TD ++ A["Microphone device
float/PCM hardware frames"] --> B["AudioCaptureService
PCM16 mono 16kHz chunks"] ++ B --> C["Ring Buffer
bounded pre-roll PCM16 frames"] ++ B --> D["WakeWordService (NanoWakeWord)
wake score per chunk"] ++ D --> E{"score >= trigger threshold?"} ++ E -- "no" --> B ++ E -- "yes" --> F["VoiceCoordinator
WakeWordDetected(session state change)"] ++ F --> G["UtteranceAssembler
seed with pre-roll PCM16 from Ring Buffer"] ++ C --> G ++ B --> G ++ G --> H["VoiceActivityDetector
speech/silence state from PCM16 chunks"] ++ H --> I{"speech still active?"} ++ I -- "yes" --> B ++ I -- "no, end silence reached" --> J["Finalize utterance
PCM16 buffer + timing metadata"] ++ J --> K["SpeechRecognizer
utterance PCM16 -> transcript text"] ++ K --> L["VoiceTransport
chat.send(main, transcript)"] ++ L --> M["OpenClaw conversation pipeline
assistant reply text"] ++ M --> N["AudioPlaybackService
TTS output bytes / decoded PCM"] ++ N --> O["Speaker device
rendered audio"] ++ O --> P{"barge-in enabled?"} ++ P -- "yes" --> B ++ P -- "no, playback complete" --> B ++``` ++ ++### Always-On Mode ++ ++```mermaid ++flowchart TD ++ A["Windows speech input
default microphone path"] --> B["SpeechRecognizer
continuous dictation result text"] ++ B --> C{"final recognized text?"} ++ C -- "no" --> A ++ C -- "yes" --> D["VoiceCoordinator
pause listening and mark AwaitingResponse"] ++ D --> E["VoiceTransport
chat.send(main, transcript)"] ++ E --> F["OpenClaw conversation pipeline
assistant reply text"] ++ F --> G["SpeechSynthesizer
assistant text -> audio stream"] ++ G --> H["MediaPlayer
reply playback"] ++ H --> I["Windows audio output
default speaker path"] ++ I --> J["VoiceCoordinator
resume continuous listening"] ++ J --> A ++``` ++ ++## Processing Stages and Data Types ++ ++| Stage | Component | Input | Output | ++|---|---|---|---| ++| 1 | `SpeechRecognizer` | Windows microphone capture | recognized transcript text | ++| 2a | `WakeWordService` | PCM16 chunk | wake score / trigger decision | ++| 2b | `VoiceActivityDetector` | PCM16 chunk | speech/silence state | ++| 3 | `Ring Buffer` | PCM16 chunk stream | bounded pre-roll PCM16 window | ++| 4 | `UtteranceAssembler` | pre-roll + live PCM16 chunks | utterance PCM16 buffer | ++| 5 | `SpeechRecognizer` | utterance PCM16 + timing metadata | transcript text | ++| 6 | `VoiceTransport` | transcript text + session key | `chat.send` request / assistant reply text | ++| 7 | `SpeechSynthesizer + MediaPlayer` | assistant reply text | speaker render stream | ++ ++## Control Flow ++ ++```mermaid ++sequenceDiagram ++ participant Gateway as Gateway / Operator ++ participant VoiceCap as VoiceCapability ++ participant Coord as VoiceCoordinator ++ participant Store as SettingsData.Voice ++ ++ Gateway->>VoiceCap: voice.settings.get ++ VoiceCap-->>Gateway: VoiceSettings ++ ++ Gateway->>VoiceCap: voice.settings.set(settings, persist=true) ++ VoiceCap->>Store: save VoiceSettings ++ VoiceCap-->>Gateway: VoiceSettings ++ ++ Gateway->>VoiceCap: voice.start(mode=WakeWord, sessionKey=...) ++ VoiceCap->>Coord: Start(VoiceStartArgs) ++ Coord-->>VoiceCap: VoiceStatusInfo(state=ListeningForWakeWord) ++ VoiceCap-->>Gateway: VoiceStatusInfo ++ ++ Gateway->>VoiceCap: voice.status.get ++ VoiceCap-->>Gateway: VoiceStatusInfo ++ ++ Gateway->>VoiceCap: voice.stop(reason=...) 
++ VoiceCap->>Coord: Stop() ++ Coord-->>VoiceCap: VoiceStatusInfo(state=Stopped) ++ VoiceCap-->>Gateway: VoiceStatusInfo ++``` ++ ++## Integration Boundaries ++ ++### Existing Components Reused ++ ++- `NodeService` remains the capability registration and lifecycle owner ++- `SettingsData` remains the persisted JSON settings model ++- `WindowsNodeClient` remains the gateway/node transport ++- existing node capability registration remains the integration pattern ++- current request/response transport remains the v1 control plane ++- `AlwaysOn` parity should reuse existing `chat.send` message flow instead of inventing an audio-upload protocol ++ ++### New Components Expected Later ++ ++- `VoiceCapability` in `OpenClaw.Shared.Capabilities` ++- `AudioCaptureService` in `OpenClaw.Tray.WinUI.Services` ++- `WakeWordService` in `OpenClaw.Tray.WinUI.Services` ++- `VoiceCoordinator` in `OpenClaw.Tray.WinUI.Services` ++- `AudioPlaybackService` in `OpenClaw.Tray.WinUI.Services` ++ ++## Why Provider Support Is Abstracted ++ ++Minimax and ElevenLabs are valid future targets, but binding provider choice into the Windows node now would introduce: ++ ++- duplicated provider integration work already handled by OpenClaw ++- local credential management on Windows ++- tighter coupling between node runtime and vendor APIs ++ ++For the first implementation, the Windows node should manage local audio behavior, local speech recognition, and local playback while reusing existing OpenClaw message flows for conversation. If provider routing becomes a real requirement later, it can be added back without changing the core activation-mode model. 
+diff --git a/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs b/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs +new file mode 100644 +index 0000000..37d98fa +--- /dev/null ++++ b/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs +@@ -0,0 +1,174 @@ ++using System; ++using System.Collections.Generic; ++using System.Text.Json; ++using System.Threading.Tasks; ++ ++namespace OpenClaw.Shared.Capabilities; ++ ++public class VoiceCapability : NodeCapabilityBase ++{ ++ private static readonly JsonSerializerOptions s_jsonOptions = new() ++ { ++ PropertyNameCaseInsensitive = true ++ }; ++ ++ public override string Category => "voice"; ++ ++ public override IReadOnlyList Commands => VoiceCommands.All; ++ ++ public event Func>? ListDevicesRequested; ++ public event Func>? SettingsRequested; ++ public event Func>? SettingsUpdateRequested; ++ public event Func>? StatusRequested; ++ public event Func>? StartRequested; ++ public event Func>? StopRequested; ++ ++ public VoiceCapability(IOpenClawLogger logger) : base(logger) ++ { ++ } ++ ++ public override async Task ExecuteAsync(NodeInvokeRequest request) ++ { ++ return request.Command switch ++ { ++ VoiceCommands.ListDevices => await HandleListDevicesAsync(), ++ VoiceCommands.GetSettings => await HandleGetSettingsAsync(), ++ VoiceCommands.SetSettings => await HandleSetSettingsAsync(request), ++ VoiceCommands.GetStatus => await HandleGetStatusAsync(), ++ VoiceCommands.Start => await HandleStartAsync(request), ++ VoiceCommands.Stop => await HandleStopAsync(request), ++ _ => Error($"Unknown command: {request.Command}") ++ }; ++ } ++ ++ private async Task HandleListDevicesAsync() ++ { ++ Logger.Info(VoiceCommands.ListDevices); ++ ++ if (ListDevicesRequested == null) ++ return Error("Voice device enumeration not available"); ++ ++ try ++ { ++ return Success(await ListDevicesRequested()); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice device enumeration failed", ex); ++ return Error($"Device enumeration failed: 
{ex.Message}"); ++ } ++ } ++ ++ private async Task HandleGetSettingsAsync() ++ { ++ Logger.Info(VoiceCommands.GetSettings); ++ ++ if (SettingsRequested == null) ++ return Error("Voice settings not available"); ++ ++ try ++ { ++ return Success(await SettingsRequested()); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice settings get failed", ex); ++ return Error($"Get settings failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleSetSettingsAsync(NodeInvokeRequest request) ++ { ++ Logger.Info(VoiceCommands.SetSettings); ++ ++ if (SettingsUpdateRequested == null) ++ return Error("Voice settings update not available"); ++ ++ try ++ { ++ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null ++ ? "{}" ++ : request.Args.GetRawText(); ++ VoiceSettingsUpdateArgs? update = null; ++ if (request.Args.ValueKind == JsonValueKind.Object && ++ request.Args.TryGetProperty("update", out var updateEl)) ++ { ++ update = JsonSerializer.Deserialize(updateEl.GetRawText(), s_jsonOptions); ++ } ++ ++ update ??= JsonSerializer.Deserialize(rawArgs, s_jsonOptions); ++ ++ if (update == null) ++ return Error("Missing update payload"); ++ ++ return Success(await SettingsUpdateRequested(update)); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice settings update failed", ex); ++ return Error($"Set settings failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleGetStatusAsync() ++ { ++ Logger.Info(VoiceCommands.GetStatus); ++ ++ if (StatusRequested == null) ++ return Error("Voice status not available"); ++ ++ try ++ { ++ return Success(await StatusRequested()); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice status get failed", ex); ++ return Error($"Get status failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleStartAsync(NodeInvokeRequest request) ++ { ++ Logger.Info(VoiceCommands.Start); ++ ++ if (StartRequested == null) ++ return Error("Voice start not available"); ++ ++ try ++ { ++ var rawArgs = 
request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null ++ ? "{}" ++ : request.Args.GetRawText(); ++ var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? new VoiceStartArgs(); ++ return Success(await StartRequested(args)); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice start failed", ex); ++ return Error($"Start failed: {ex.Message}"); ++ } ++ } ++ ++ private async Task HandleStopAsync(NodeInvokeRequest request) ++ { ++ Logger.Info(VoiceCommands.Stop); ++ ++ if (StopRequested == null) ++ return Error("Voice stop not available"); ++ ++ try ++ { ++ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null ++ ? "{}" ++ : request.Args.GetRawText(); ++ var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? new VoiceStopArgs(); ++ return Success(await StopRequested(args)); ++ } ++ catch (Exception ex) ++ { ++ Logger.Error("Voice stop failed", ex); ++ return Error($"Stop failed: {ex.Message}"); ++ } ++ } ++} +diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs +index 4c7b075..c7af724 100644 +--- a/src/OpenClaw.Shared/SettingsData.cs ++++ b/src/OpenClaw.Shared/SettingsData.cs +@@ -26,6 +26,7 @@ public class SettingsData + public bool NotifyChatResponses { get; set; } = true; + public bool PreferStructuredCategories { get; set; } = true; + public List? 
UserRules { get; set; } ++ public VoiceSettings Voice { get; set; } = new(); + + private static readonly JsonSerializerOptions s_options = new() { WriteIndented = true }; + +diff --git a/src/OpenClaw.Shared/VoiceModeSchema.cs b/src/OpenClaw.Shared/VoiceModeSchema.cs +new file mode 100644 +index 0000000..0dce2a5 +--- /dev/null ++++ b/src/OpenClaw.Shared/VoiceModeSchema.cs +@@ -0,0 +1,144 @@ ++using System.Collections.ObjectModel; ++using System.Text.Json.Serialization; ++ ++namespace OpenClaw.Shared; ++ ++public static class VoiceCommands ++{ ++ public const string ListDevices = "voice.devices.list"; ++ public const string GetSettings = "voice.settings.get"; ++ public const string SetSettings = "voice.settings.set"; ++ public const string GetStatus = "voice.status.get"; ++ public const string Start = "voice.start"; ++ public const string Stop = "voice.stop"; ++ ++ private static readonly ReadOnlyCollection s_all = Array.AsReadOnly( ++ [ ++ ListDevices, ++ GetSettings, ++ SetSettings, ++ GetStatus, ++ Start, ++ Stop ++ ]); ++ ++ public static IReadOnlyList All => s_all; ++} ++ ++[JsonConverter(typeof(JsonStringEnumConverter))] ++public enum VoiceActivationMode ++{ ++ Off, ++ WakeWord, ++ AlwaysOn ++} ++ ++[JsonConverter(typeof(JsonStringEnumConverter))] ++public enum VoiceRuntimeState ++{ ++ Stopped, ++ Idle, ++ Arming, ++ ListeningForWakeWord, ++ ListeningContinuously, ++ RecordingUtterance, ++ SubmittingAudio, ++ AwaitingResponse, ++ PlayingResponse, ++ Error ++} ++ ++public sealed class VoiceSettings ++{ ++ public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off; ++ public bool Enabled { get; set; } ++ public string SpeechToTextProviderId { get; set; } = VoiceProviderIds.Windows; ++ public string TextToSpeechProviderId { get; set; } = VoiceProviderIds.Windows; ++ public string? InputDeviceId { get; set; } ++ public string? 
OutputDeviceId { get; set; } ++ public int SampleRateHz { get; set; } = 16000; ++ public int CaptureChunkMs { get; set; } = 80; ++ public bool BargeInEnabled { get; set; } = true; ++ public VoiceWakeWordSettings WakeWord { get; set; } = new(); ++ public VoiceAlwaysOnSettings AlwaysOn { get; set; } = new(); ++} ++ ++public sealed class VoiceWakeWordSettings ++{ ++ public string Engine { get; set; } = "NanoWakeWord"; ++ public string ModelId { get; set; } = "hey_openclaw"; ++ public float TriggerThreshold { get; set; } = 0.65f; ++ public int TriggerCooldownMs { get; set; } = 2000; ++ public int PreRollMs { get; set; } = 1200; ++ public int EndSilenceMs { get; set; } = 900; ++} ++ ++public sealed class VoiceAlwaysOnSettings ++{ ++ public int MinSpeechMs { get; set; } = 250; ++ public int EndSilenceMs { get; set; } = 900; ++ public int MaxUtteranceMs { get; set; } = 15000; ++ public bool AutoSubmit { get; set; } = true; ++} ++ ++public sealed class VoiceAudioDeviceInfo ++{ ++ public string DeviceId { get; set; } = ""; ++ public string Name { get; set; } = ""; ++ public bool IsDefault { get; set; } ++ public bool IsInput { get; set; } ++ public bool IsOutput { get; set; } ++} ++ ++public sealed class VoiceStatusInfo ++{ ++ public bool Available { get; set; } ++ public bool Running { get; set; } ++ public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off; ++ public VoiceRuntimeState State { get; set; } = VoiceRuntimeState.Stopped; ++ public string? SessionKey { get; set; } ++ public string? InputDeviceId { get; set; } ++ public string? OutputDeviceId { get; set; } ++ public string? WakeWordModelId { get; set; } ++ public bool WakeWordLoaded { get; set; } ++ public DateTime? LastWakeWordUtc { get; set; } ++ public DateTime? LastUtteranceUtc { get; set; } ++ public string? LastError { get; set; } ++} ++ ++public sealed class VoiceStartArgs ++{ ++ public VoiceActivationMode? Mode { get; set; } ++ public string? 
SessionKey { get; set; } ++} ++ ++public sealed class VoiceStopArgs ++{ ++ public string? Reason { get; set; } ++} ++ ++public sealed class VoiceSettingsUpdateArgs ++{ ++ public VoiceSettings Settings { get; set; } = new(); ++ public bool Persist { get; set; } = true; ++} ++ ++public static class VoiceProviderIds ++{ ++ public const string Windows = "windows"; ++} ++ ++public sealed class VoiceProviderOption ++{ ++ public string Id { get; set; } = ""; ++ public string Name { get; set; } = ""; ++ public string Runtime { get; set; } = "windows"; ++ public bool Enabled { get; set; } = true; ++ public string? Description { get; set; } ++} ++ ++public sealed class VoiceProviderCatalog ++{ ++ public List SpeechToTextProviders { get; set; } = []; ++ public List TextToSpeechProviders { get; set; } = []; ++} +diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs +index de0780f..37552b9 100644 +--- a/src/OpenClaw.Tray.WinUI/App.xaml.cs ++++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs +@@ -63,6 +63,7 @@ public partial class App : Application + + // Windows (created on demand) + private SettingsWindow? _settingsWindow; ++ private VoiceModeWindow? _voiceModeWindow; + private WebChatWindow? _webChatWindow; + private StatusDetailWindow? _statusDetailWindow; + private NotificationHistoryWindow? _notificationHistoryWindow; +@@ -72,6 +73,7 @@ public partial class App : Application + + // Node service (optional, enabled in settings) + private NodeService? _nodeService; ++ private VoiceService? _voiceService; + + // Keep-alive window to anchor WinUI runtime (prevents GC/threading issues) + private Window? 
_keepAliveWindow; +@@ -250,6 +252,7 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) + + // Initialize settings + _settings = new SettingsManager(); ++ _voiceService = new VoiceService(new AppLogger(), _settings); + + // First-run check + if (string.IsNullOrWhiteSpace(_settings.Token)) +@@ -514,6 +517,7 @@ private void OnTrayMenuItemClicked(object? sender, string action) + switch (action) + { + case "status": ShowStatusDetail(); break; ++ case "voice-settings": ShowVoiceModeSettings(); break; + case "dashboard": OpenDashboard(); break; + case "webchat": ShowWebChat(); break; + case "quicksend": ShowQuickSend(); break; +@@ -725,6 +729,33 @@ private List GetRecentActivity(int maxItems) + .ToList(); + } + ++ private string GetRunningVoiceModeLabel() ++ { ++ var status = _voiceService?.CurrentStatus; ++ if (status?.Running == true) ++ { ++ return status.Mode switch ++ { ++ VoiceActivationMode.WakeWord => "WakeWord", ++ VoiceActivationMode.AlwaysOn => "AlwaysOn", ++ _ => "Off" ++ }; ++ } ++ ++ return "Off"; ++ } ++ ++ private string GetVoiceDeviceSummary() ++ { ++ var voice = _settings?.Voice; ++ if (voice == null) ++ return "Talk: system default · Listen: system default"; ++ ++ var talk = string.IsNullOrWhiteSpace(voice.OutputDeviceId) ? "system default" : "selected speaker"; ++ var listen = string.IsNullOrWhiteSpace(voice.InputDeviceId) ? 
"system default" : "selected microphone"; ++ return $"Talk: {talk} · Listen: {listen}"; ++ } ++ + private void BuildTrayMenuPopup(TrayMenuWindow menu) + { + // Brand header +@@ -741,6 +772,13 @@ private void BuildTrayMenuPopup(TrayMenuWindow menu) + menu.AddMenuItem(_currentActivity.DisplayText, _currentActivity.Glyph, "", isEnabled: false); + } + ++ menu.AddMenuItem($"Voice Mode: {GetRunningVoiceModeLabel()}", "🎙️", "voice-settings"); ++ menu.AddMenuItem($"↳ {GetVoiceDeviceSummary()}", "", "", isEnabled: false, indent: true); ++ if (_settings?.EnableNodeMode != true) ++ { ++ menu.AddMenuItem("↳ Enable Node Mode to activate voice runtime", "", "", isEnabled: false, indent: true); ++ } ++ + // Usage + if (_lastUsage != null || _lastUsageStatus != null || _lastUsageCost != null) + { +@@ -1126,7 +1164,7 @@ private void InitializeNodeService() + { + Logger.Info("Initializing Windows Node service..."); + +- _nodeService = new NodeService(new AppLogger(), _dispatcherQueue, DataPath); ++ _nodeService = new NodeService(new AppLogger(), _dispatcherQueue, _voiceService!, DataPath); + _nodeService.StatusChanged += OnNodeStatusChanged; + _nodeService.NotificationRequested += OnNodeNotificationRequested; + _nodeService.PairingStatusChanged += OnPairingStatusChanged; +@@ -1601,6 +1639,20 @@ private void ShowSettings() + _settingsWindow.Activate(); + } + ++ private void ShowVoiceModeSettings() ++ { ++ if (_settings == null || _voiceService == null) ++ return; ++ ++ if (_voiceModeWindow == null || _voiceModeWindow.IsClosed) ++ { ++ _voiceModeWindow = new VoiceModeWindow(_settings, _voiceService); ++ _voiceModeWindow.Closed += (s, e) => _voiceModeWindow = null; ++ } ++ ++ _voiceModeWindow.Activate(); ++ } ++ + private void OnSettingsSaved(object? sender, EventArgs e) + { + // Reconnect with new settings — mirror the startup if/else pattern +@@ -1617,6 +1669,7 @@ private void OnSettingsSaved(object? 
sender, EventArgs e) + else + { + InitializeGatewayClient(); ++ _ = _voiceService?.StopAsync(new VoiceStopArgs { Reason = "Node mode disabled" }); + } + + // Update global hotkey +@@ -2070,6 +2123,7 @@ private void ExitApplication() + + // Dispose cancellation token source + _deepLinkCts?.Dispose(); ++ _voiceService?.Dispose(); + + Exit(); + } +diff --git a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs +index 1bd3883..62ea080 100644 +--- a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs ++++ b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs +@@ -21,6 +21,7 @@ public class NodeService : IDisposable + private CanvasWindow? _canvasWindow; + private ScreenCaptureService? _screenCaptureService; + private CameraCaptureService? _cameraCaptureService; ++ private VoiceService? _voiceService; + private DateTime _lastScreenCaptureNotification = DateTime.MinValue; + private string? _a2uiHostUrl; + +@@ -29,6 +30,7 @@ public class NodeService : IDisposable + private CanvasCapability? _canvasCapability; + private ScreenCapability? _screenCapability; + private CameraCapability? _cameraCapability; ++ private VoiceCapability? _voiceCapability; + private readonly string _dataPath; + + // Events +@@ -44,13 +46,14 @@ public class NodeService : IDisposable + public string? FullDeviceId => _nodeClient?.FullDeviceId; + public string? 
GatewayUrl => _nodeClient?.GatewayUrl; + +- public NodeService(IOpenClawLogger logger, DispatcherQueue dispatcherQueue, string dataPath) ++ public NodeService(IOpenClawLogger logger, DispatcherQueue dispatcherQueue, VoiceService voiceService, string dataPath) + { + _logger = logger; + _dispatcherQueue = dispatcherQueue; + _dataPath = dataPath; + _screenCaptureService = new ScreenCaptureService(logger); + _cameraCaptureService = new CameraCaptureService(logger); ++ _voiceService = voiceService; + } + + /// +@@ -79,6 +82,15 @@ public async Task ConnectAsync(string gatewayUrl, string token) + await _nodeClient.ConnectAsync(); + + _a2uiHostUrl = BuildA2UIHostUrl(_nodeClient.GatewayUrl); ++ ++ if (_voiceService != null) ++ { ++ var settings = await _voiceService.GetSettingsAsync(); ++ if (settings.Enabled && settings.Mode != VoiceActivationMode.Off) ++ { ++ await _voiceService.StartAsync(new VoiceStartArgs { Mode = settings.Mode }); ++ } ++ } + } + + /// +@@ -92,6 +104,11 @@ public async Task DisconnectAsync() + _nodeClient.Dispose(); + _nodeClient = null; + } ++ ++ if (_voiceService != null) ++ { ++ await _voiceService.StopAsync(new VoiceStopArgs { Reason = "Node disconnected" }); ++ } + + // Close canvas window + if (_canvasWindow != null && !_canvasWindow.IsClosed) +@@ -134,6 +151,16 @@ private void RegisterCapabilities() + _cameraCapability.ListRequested += OnCameraList; + _cameraCapability.SnapRequested += OnCameraSnap; + _nodeClient.RegisterCapability(_cameraCapability); ++ ++ // Voice capability ++ _voiceCapability = new VoiceCapability(_logger); ++ _voiceCapability.ListDevicesRequested += OnVoiceListDevices; ++ _voiceCapability.SettingsRequested += OnVoiceGetSettings; ++ _voiceCapability.SettingsUpdateRequested += OnVoiceSetSettings; ++ _voiceCapability.StatusRequested += OnVoiceGetStatus; ++ _voiceCapability.StartRequested += OnVoiceStart; ++ _voiceCapability.StopRequested += OnVoiceStop; ++ _nodeClient.RegisterCapability(_voiceCapability); + + 
_logger.Info("All capabilities registered"); + } +@@ -474,6 +501,58 @@ private async Task OnCameraSnap(CameraSnapArgs args) + } + } + ++ #endregion ++ ++ #region Voice Capability Handlers ++ ++ private Task OnVoiceListDevices() ++ { ++ if (_voiceService == null) ++ throw new InvalidOperationException("Voice service not available"); ++ ++ return _voiceService.ListDevicesAsync(); ++ } ++ ++ private Task OnVoiceGetSettings() ++ { ++ if (_voiceService == null) ++ throw new InvalidOperationException("Voice service not available"); ++ ++ return _voiceService.GetSettingsAsync(); ++ } ++ ++ private Task OnVoiceSetSettings(VoiceSettingsUpdateArgs args) ++ { ++ if (_voiceService == null) ++ throw new InvalidOperationException("Voice service not available"); ++ ++ return _voiceService.UpdateSettingsAsync(args); ++ } ++ ++ private Task OnVoiceGetStatus() ++ { ++ if (_voiceService == null) ++ throw new InvalidOperationException("Voice service not available"); ++ ++ return _voiceService.GetStatusAsync(); ++ } ++ ++ private Task OnVoiceStart(VoiceStartArgs args) ++ { ++ if (_voiceService == null) ++ throw new InvalidOperationException("Voice service not available"); ++ ++ return _voiceService.StartAsync(args); ++ } ++ ++ private Task OnVoiceStop(VoiceStopArgs args) ++ { ++ if (_voiceService == null) ++ throw new InvalidOperationException("Voice service not available"); ++ ++ return _voiceService.StopAsync(args); ++ } ++ + #endregion + + public void Dispose() +@@ -483,7 +562,6 @@ public void Dispose() + try { client?.Dispose(); } catch { /* ignore */ } + + try { _cameraCaptureService?.Dispose(); } catch { /* ignore */ } +- + if (_canvasWindow != null && !_canvasWindow.IsClosed) + { + var window = _canvasWindow; +diff --git a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs +index 0c343f1..2fc93d7 100644 +--- a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs ++++ b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs +@@ 
-42,6 +42,7 @@ public class SettingsManager + public bool NotifyChatResponses { get; set; } = true; + public bool PreferStructuredCategories { get; set; } = true; + public List UserRules { get; set; } = new(); ++ public VoiceSettings Voice { get; set; } = new(); + + // Node mode (enables Windows as a node, not just operator) + public bool EnableNodeMode { get; set; } = false; +@@ -82,6 +83,7 @@ public void Load() + PreferStructuredCategories = loaded.PreferStructuredCategories; + if (loaded.UserRules != null) + UserRules = loaded.UserRules; ++ Voice = loaded.Voice ?? new VoiceSettings(); + } + } + } +@@ -117,7 +119,8 @@ public void Save() + HasSeenActivityStreamTip = HasSeenActivityStreamTip, + NotifyChatResponses = NotifyChatResponses, + PreferStructuredCategories = PreferStructuredCategories, +- UserRules = UserRules ++ UserRules = UserRules, ++ Voice = Voice + }; + + var json = data.ToJson(); +diff --git a/src/OpenClaw.Tray.WinUI/Services/VoiceProviderCatalogService.cs b/src/OpenClaw.Tray.WinUI/Services/VoiceProviderCatalogService.cs +new file mode 100644 +index 0000000..705336e +--- /dev/null ++++ b/src/OpenClaw.Tray.WinUI/Services/VoiceProviderCatalogService.cs +@@ -0,0 +1,155 @@ ++using System; ++using System.Collections.Generic; ++using System.IO; ++using System.Linq; ++using System.Text.Json; ++using OpenClaw.Shared; ++ ++namespace OpenClawTray.Services; ++ ++public static class VoiceProviderCatalogService ++{ ++ private static readonly string s_catalogFilePath = Path.Combine( ++ Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData), ++ "OpenClawTray", ++ "voice-providers.json"); ++ ++ private static readonly JsonSerializerOptions s_jsonOptions = new() ++ { ++ PropertyNameCaseInsensitive = true, ++ WriteIndented = true ++ }; ++ ++ public static string CatalogFilePath => s_catalogFilePath; ++ ++ public static VoiceProviderCatalog LoadCatalog(IOpenClawLogger? 
logger = null) ++ { ++ var merged = CreateBuiltInCatalog(); ++ ++ try ++ { ++ if (!File.Exists(s_catalogFilePath)) ++ { ++ return merged; ++ } ++ ++ var json = File.ReadAllText(s_catalogFilePath); ++ var configured = JsonSerializer.Deserialize(json, s_jsonOptions); ++ if (configured == null) ++ { ++ return merged; ++ } ++ ++ merged.SpeechToTextProviders = MergeProviders( ++ merged.SpeechToTextProviders, ++ configured.SpeechToTextProviders); ++ merged.TextToSpeechProviders = MergeProviders( ++ merged.TextToSpeechProviders, ++ configured.TextToSpeechProviders); ++ } ++ catch (Exception ex) ++ { ++ logger?.Warn($"Failed to load voice provider catalog: {ex.Message}"); ++ } ++ ++ return merged; ++ } ++ ++ public static VoiceProviderOption ResolveSpeechToTextProvider(string? providerId, IOpenClawLogger? logger = null) ++ { ++ var catalog = LoadCatalog(logger); ++ return ResolveProvider(catalog.SpeechToTextProviders, providerId); ++ } ++ ++ public static VoiceProviderOption ResolveTextToSpeechProvider(string? providerId, IOpenClawLogger? logger = null) ++ { ++ var catalog = LoadCatalog(logger); ++ return ResolveProvider(catalog.TextToSpeechProviders, providerId); ++ } ++ ++ public static bool SupportsWindowsRuntime(string? providerId) ++ { ++ return string.Equals(providerId, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase); ++ } ++ ++ private static VoiceProviderCatalog CreateBuiltInCatalog() ++ { ++ return new VoiceProviderCatalog ++ { ++ SpeechToTextProviders = ++ [ ++ new VoiceProviderOption ++ { ++ Id = VoiceProviderIds.Windows, ++ Name = "Windows Speech Recognition", ++ Runtime = "windows", ++ Description = "Built-in Windows dictation and speech recognition." ++ } ++ ], ++ TextToSpeechProviders = ++ [ ++ new VoiceProviderOption ++ { ++ Id = VoiceProviderIds.Windows, ++ Name = "Windows Speech Synthesis", ++ Runtime = "windows", ++ Description = "Built-in Windows text-to-speech playback." 
++ } ++ ] ++ }; ++ } ++ ++ private static List MergeProviders( ++ List builtIn, ++ List configured) ++ { ++ var merged = builtIn ++ .Select(Clone) ++ .ToDictionary(p => p.Id, StringComparer.OrdinalIgnoreCase); ++ ++ foreach (var provider in configured.Where(p => !string.IsNullOrWhiteSpace(p.Id))) ++ { ++ merged[provider.Id] = Clone(provider); ++ } ++ ++ return merged.Values ++ .Where(p => p.Enabled) ++ .OrderByDescending(p => string.Equals(p.Id, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase)) ++ .ThenBy(p => p.Name, StringComparer.OrdinalIgnoreCase) ++ .ToList(); ++ } ++ ++ private static VoiceProviderOption ResolveProvider(IEnumerable providers, string? providerId) ++ { ++ if (!string.IsNullOrWhiteSpace(providerId)) ++ { ++ var configured = providers.FirstOrDefault(p => string.Equals(p.Id, providerId, StringComparison.OrdinalIgnoreCase)); ++ if (configured != null) ++ { ++ return Clone(configured); ++ } ++ } ++ ++ return providers ++ .Select(Clone) ++ .FirstOrDefault(p => string.Equals(p.Id, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase)) ++ ?? 
new VoiceProviderOption ++ { ++ Id = VoiceProviderIds.Windows, ++ Name = "Windows Speech", ++ Runtime = "windows" ++ }; ++ } ++ ++ private static VoiceProviderOption Clone(VoiceProviderOption source) ++ { ++ return new VoiceProviderOption ++ { ++ Id = source.Id, ++ Name = source.Name, ++ Runtime = source.Runtime, ++ Enabled = source.Enabled, ++ Description = source.Description ++ }; ++ } ++} +diff --git a/src/OpenClaw.Tray.WinUI/Services/VoiceService.cs b/src/OpenClaw.Tray.WinUI/Services/VoiceService.cs +new file mode 100644 +index 0000000..3d3e982 +--- /dev/null ++++ b/src/OpenClaw.Tray.WinUI/Services/VoiceService.cs +@@ -0,0 +1,1040 @@ ++using System; ++using System.Collections.Generic; ++using System.Linq; ++using System.Threading; ++using System.Threading.Tasks; ++using OpenClaw.Shared; ++using OpenClawTray.Helpers; ++using Windows.Devices.Enumeration; ++using Windows.Foundation; ++using Windows.Media.Capture; ++using Windows.Media.Core; ++using Windows.Media.Devices; ++using Windows.Media.Playback; ++using Windows.Media.SpeechRecognition; ++using Windows.Media.SpeechSynthesis; ++ ++namespace OpenClawTray.Services; ++ ++public sealed class VoiceService : IDisposable ++{ ++ private const int HResultSpeechPrivacyDeclined = unchecked((int)0x80045509); ++ private static readonly TimeSpan TransportConnectTimeout = TimeSpan.FromSeconds(10); ++ private static readonly TimeSpan ReplyTimeout = TimeSpan.FromSeconds(45); ++ private static readonly TimeSpan DuplicateTranscriptWindow = TimeSpan.FromSeconds(2); ++ ++ private readonly IOpenClawLogger _logger; ++ private readonly SettingsManager _settings; ++ private readonly object _gate = new(); ++ ++ private VoiceStatusInfo _status; ++ private VoiceActivationMode? _runtimeModeOverride; ++ private CancellationTokenSource? _runtimeCts; ++ private OpenClawGatewayClient? _chatClient; ++ private ConnectionStatus _chatTransportStatus = ConnectionStatus.Disconnected; ++ private TaskCompletionSource? 
_transportReadyTcs; ++ private SpeechRecognizer? _speechRecognizer; ++ private SpeechSynthesizer? _speechSynthesizer; ++ private MediaPlayer? _mediaPlayer; ++ private bool _recognitionActive; ++ private bool _awaitingReply; ++ private bool _isSpeaking; ++ private string? _lastTranscript; ++ private DateTime _lastTranscriptUtc; ++ private bool _disposed; ++ ++ public VoiceService(IOpenClawLogger logger, SettingsManager settings) ++ { ++ _logger = logger; ++ _settings = settings; ++ _status = new VoiceStatusInfo(); ++ _status = BuildStoppedStatus(null, null); ++ } ++ ++ public VoiceStatusInfo CurrentStatus ++ { ++ get ++ { ++ lock (_gate) ++ { ++ return Clone(_status); ++ } ++ } ++ } ++ ++ public Task GetSettingsAsync() ++ { ++ lock (_gate) ++ { ++ return Task.FromResult(Clone(_settings.Voice)); ++ } ++ } ++ ++ public Task UpdateSettingsAsync(VoiceSettingsUpdateArgs update) ++ { ++ ArgumentNullException.ThrowIfNull(update); ++ ++ lock (_gate) ++ { ++ _settings.Voice = Clone(update.Settings); ++ if (update.Persist) ++ { ++ _settings.Save(); ++ } ++ ++ if (_status.Running) ++ { ++ _status = BuildRunningStatus( ++ _runtimeModeOverride ?? _settings.Voice.Mode, ++ _status.SessionKey, ++ _status.State, ++ _status.LastError); ++ _status.LastUtteranceUtc = _status.LastUtteranceUtc; ++ _status.LastWakeWordUtc = _status.LastWakeWordUtc; ++ } ++ else ++ { ++ _status = BuildStoppedStatus(_status.SessionKey, _status.LastError); ++ } ++ ++ return Task.FromResult(Clone(_settings.Voice)); ++ } ++ } ++ ++ public Task GetStatusAsync() ++ { ++ lock (_gate) ++ { ++ return Task.FromResult(Clone(_status)); ++ } ++ } ++ ++ public async Task StartAsync(VoiceStartArgs args) ++ { ++ ObjectDisposedException.ThrowIf(_disposed, this); ++ ++ args ??= new VoiceStartArgs(); ++ ++ VoiceSettings effectiveSettings; ++ VoiceActivationMode requestedMode; ++ string? sessionKey; ++ ++ lock (_gate) ++ { ++ effectiveSettings = Clone(_settings.Voice); ++ requestedMode = args.Mode ?? 
effectiveSettings.Mode; ++ sessionKey = args.SessionKey ?? _status.SessionKey; ++ ++ if (args.Mode.HasValue && args.Mode.Value != VoiceActivationMode.Off) ++ { ++ effectiveSettings.Enabled = true; ++ effectiveSettings.Mode = args.Mode.Value; ++ _runtimeModeOverride = args.Mode.Value; ++ } ++ else if (args.Mode == VoiceActivationMode.Off) ++ { ++ _runtimeModeOverride = null; ++ } ++ ++ if (!effectiveSettings.Enabled || requestedMode == VoiceActivationMode.Off) ++ { ++ _status = BuildStoppedStatus(sessionKey, "Voice mode is disabled"); ++ return Clone(_status); ++ } ++ } ++ ++ await StopRuntimeResourcesAsync(updateStoppedStatus: false); ++ ++ try ++ { ++ switch (requestedMode) ++ { ++ case VoiceActivationMode.AlwaysOn: ++ await StartAlwaysOnRuntimeAsync(effectiveSettings, sessionKey); ++ break; ++ case VoiceActivationMode.WakeWord: ++ lock (_gate) ++ { ++ _status = BuildRunningStatus( ++ VoiceActivationMode.WakeWord, ++ sessionKey, ++ VoiceRuntimeState.ListeningForWakeWord, ++ "WakeWord capture is not implemented yet"); ++ } ++ _logger.Info("Voice runtime started in mode WakeWord"); ++ break; ++ default: ++ lock (_gate) ++ { ++ _status = BuildStoppedStatus(sessionKey, "Voice mode is disabled"); ++ } ++ break; ++ } ++ } ++ catch (Exception ex) ++ { ++ _logger.Error("Voice runtime start failed", ex); ++ lock (_gate) ++ { ++ _status = BuildErrorStatus(requestedMode, sessionKey, GetUserFacingErrorMessage(ex)); ++ } ++ } ++ ++ return CurrentStatus; ++ } ++ ++ public async Task StopAsync(VoiceStopArgs args) ++ { ++ args ??= new VoiceStopArgs(); ++ ++ await StopRuntimeResourcesAsync(updateStoppedStatus: false); ++ ++ lock (_gate) ++ { ++ _runtimeModeOverride = null; ++ _status = BuildStoppedStatus(_status.SessionKey, args.Reason); ++ _logger.Info($"Voice runtime stopped{(string.IsNullOrWhiteSpace(args.Reason) ? 
string.Empty : $": {args.Reason}")}"); ++ return Clone(_status); ++ } ++ } ++ ++ public async Task ListDevicesAsync() ++ { ++ try ++ { ++ var inputDefaultId = MediaDevice.GetDefaultAudioCaptureId(AudioDeviceRole.Default); ++ var outputDefaultId = MediaDevice.GetDefaultAudioRenderId(AudioDeviceRole.Default); ++ var results = new List(); ++ ++ var inputDevices = await DeviceInformation.FindAllAsync(DeviceClass.AudioCapture); ++ foreach (var device in inputDevices) ++ { ++ results.Add(new VoiceAudioDeviceInfo ++ { ++ DeviceId = device.Id, ++ Name = device.Name, ++ IsDefault = string.Equals(device.Id, inputDefaultId, StringComparison.Ordinal), ++ IsInput = true ++ }); ++ } ++ ++ var outputDevices = await DeviceInformation.FindAllAsync(DeviceClass.AudioRender); ++ foreach (var device in outputDevices) ++ { ++ results.Add(new VoiceAudioDeviceInfo ++ { ++ DeviceId = device.Id, ++ Name = device.Name, ++ IsDefault = string.Equals(device.Id, outputDefaultId, StringComparison.Ordinal), ++ IsOutput = true ++ }); ++ } ++ ++ return results ++ .OrderByDescending(d => d.IsDefault) ++ .ThenBy(d => d.IsInput ? 0 : 1) ++ .ThenBy(d => d.Name, StringComparer.OrdinalIgnoreCase) ++ .ToArray(); ++ } ++ catch (Exception ex) ++ { ++ _logger.Warn($"Voice device enumeration failed: {ex.Message}"); ++ return ++ [ ++ new VoiceAudioDeviceInfo ++ { ++ DeviceId = "default-input", ++ Name = "System default microphone", ++ IsDefault = true, ++ IsInput = true ++ }, ++ new VoiceAudioDeviceInfo ++ { ++ DeviceId = "default-output", ++ Name = "System default speaker", ++ IsDefault = true, ++ IsOutput = true ++ } ++ ]; ++ } ++ } ++ ++ public VoiceProviderCatalog GetProviderCatalog() ++ { ++ return VoiceProviderCatalogService.LoadCatalog(_logger); ++ } ++ ++ public void Dispose() ++ { ++ if (_disposed) ++ { ++ return; ++ } ++ ++ _disposed = true; ++ _ = StopRuntimeResourcesAsync(updateStoppedStatus: true); ++ } ++ ++ private async Task StartAlwaysOnRuntimeAsync(VoiceSettings settings, string? 
sessionKey) ++ { ++ var selectedSpeechToText = VoiceProviderCatalogService.ResolveSpeechToTextProvider( ++ settings.SpeechToTextProviderId, ++ _logger); ++ var selectedTextToSpeech = VoiceProviderCatalogService.ResolveTextToSpeechProvider( ++ settings.TextToSpeechProviderId, ++ _logger); ++ var fallbackMessage = BuildProviderFallbackMessage(selectedSpeechToText, selectedTextToSpeech); ++ ++ await EnsureMicrophoneConsentAsync(); ++ ++ var runtimeCts = new CancellationTokenSource(); ++ var recognizer = await CreateSpeechRecognizerAsync(settings); ++ var synthesizer = new SpeechSynthesizer(); ++ var player = new MediaPlayer(); ++ ++ if (!string.IsNullOrWhiteSpace(settings.InputDeviceId)) ++ { ++ _logger.Warn("Selected input device is saved, but AlwaysOn currently uses the system speech input device."); ++ } ++ ++ if (!string.IsNullOrWhiteSpace(settings.OutputDeviceId)) ++ { ++ _logger.Warn("Selected output device is saved, but AlwaysOn currently uses the default speech output device."); ++ } ++ ++ recognizer.ContinuousRecognitionSession.ResultGenerated += OnSpeechResultGenerated; ++ recognizer.ContinuousRecognitionSession.Completed += OnSpeechRecognitionCompleted; ++ ++ lock (_gate) ++ { ++ _runtimeCts = runtimeCts; ++ _speechRecognizer = recognizer; ++ _speechSynthesizer = synthesizer; ++ _mediaPlayer = player; ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ sessionKey, ++ VoiceRuntimeState.Arming, ++ fallbackMessage); ++ } ++ ++ await EnsureChatTransportAsync(runtimeCts.Token); ++ await StartRecognitionSessionAsync(); ++ ++ lock (_gate) ++ { ++ if (_status.Running) ++ { ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ sessionKey, ++ VoiceRuntimeState.ListeningContinuously, ++ fallbackMessage); ++ } ++ } ++ ++ _logger.Info("Voice runtime started in mode AlwaysOn"); ++ } ++ ++ private async Task CreateSpeechRecognizerAsync(VoiceSettings settings) ++ { ++ var recognizer = new SpeechRecognizer(); ++ 
recognizer.Timeouts.EndSilenceTimeout = TimeSpan.FromMilliseconds(settings.AlwaysOn.EndSilenceMs); ++ recognizer.Timeouts.InitialSilenceTimeout = TimeSpan.FromSeconds(10); ++ recognizer.Timeouts.BabbleTimeout = TimeSpan.FromSeconds(4); ++ recognizer.Constraints.Add(new SpeechRecognitionTopicConstraint(SpeechRecognitionScenario.Dictation, "always-on-dictation")); ++ ++ var compilation = await recognizer.CompileConstraintsAsync(); ++ if (compilation.Status != SpeechRecognitionResultStatus.Success) ++ { ++ recognizer.Dispose(); ++ throw new InvalidOperationException($"Speech recognizer unavailable: {compilation.Status}"); ++ } ++ ++ return recognizer; ++ } ++ ++ private async Task EnsureMicrophoneConsentAsync() ++ { ++ if (!PackageHelper.IsPackaged) ++ { ++ return; ++ } ++ ++ using var capture = new MediaCapture(); ++ var initSettings = new MediaCaptureInitializationSettings ++ { ++ StreamingCaptureMode = StreamingCaptureMode.Audio, ++ SharingMode = MediaCaptureSharingMode.SharedReadOnly, ++ MemoryPreference = MediaCaptureMemoryPreference.Cpu ++ }; ++ ++ await capture.InitializeAsync(initSettings); ++ } ++ ++ private async Task EnsureChatTransportAsync(CancellationToken cancellationToken) ++ { ++ OpenClawGatewayClient? 
existingClient; ++ ConnectionStatus existingStatus; ++ ++ lock (_gate) ++ { ++ existingClient = _chatClient; ++ existingStatus = _chatTransportStatus; ++ if (existingStatus == ConnectionStatus.Connected) ++ { ++ return; ++ } ++ ++ _transportReadyTcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); ++ ++ if (existingClient == null) ++ { ++ _chatClient = new OpenClawGatewayClient(_settings.GatewayUrl, _settings.Token, _logger); ++ _chatClient.StatusChanged += OnChatTransportStatusChanged; ++ _chatClient.NotificationReceived += OnChatNotificationReceived; ++ existingClient = _chatClient; ++ _chatTransportStatus = ConnectionStatus.Connecting; ++ } ++ } ++ ++ if (existingStatus == ConnectionStatus.Disconnected || existingClient != _chatClient) ++ { ++ await existingClient!.ConnectAsync(); ++ } ++ ++ Task readyTask; ++ lock (_gate) ++ { ++ readyTask = _transportReadyTcs?.Task ?? Task.CompletedTask; ++ } ++ ++ using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); ++ timeoutCts.CancelAfter(TransportConnectTimeout); ++ ++ var completed = await Task.WhenAny(readyTask, Task.Delay(Timeout.InfiniteTimeSpan, timeoutCts.Token)); ++ if (completed != readyTask) ++ { ++ throw new TimeoutException("Timed out connecting voice chat transport."); ++ } ++ ++ await readyTask; ++ } ++ ++ private async Task StartRecognitionSessionAsync() ++ { ++ SpeechRecognizer? 
recognizer; ++ ++ lock (_gate) ++ { ++ recognizer = _speechRecognizer; ++ if (recognizer == null || _recognitionActive) ++ { ++ return; ++ } ++ } ++ ++ await recognizer.ContinuousRecognitionSession.StartAsync(); ++ ++ lock (_gate) ++ { ++ _recognitionActive = true; ++ if (_status.Running && !_awaitingReply && !_isSpeaking) ++ { ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ _status.SessionKey, ++ VoiceRuntimeState.ListeningContinuously, ++ null); ++ _status.LastUtteranceUtc = _status.LastUtteranceUtc; ++ } ++ } ++ } ++ ++ private async Task StopRecognitionSessionAsync() ++ { ++ SpeechRecognizer? recognizer; ++ ++ lock (_gate) ++ { ++ recognizer = _speechRecognizer; ++ if (recognizer == null || !_recognitionActive) ++ { ++ return; ++ } ++ ++ _recognitionActive = false; ++ } ++ ++ try ++ { ++ await recognizer.ContinuousRecognitionSession.CancelAsync(); ++ } ++ catch (Exception ex) ++ { ++ _logger.Warn($"Voice recognition stop failed: {ex.Message}"); ++ } ++ } ++ ++ private async void OnSpeechResultGenerated( ++ SpeechContinuousRecognitionSession sender, ++ SpeechContinuousRecognitionResultGeneratedEventArgs args) ++ { ++ try ++ { ++ var result = args.Result; ++ var text = result.Text?.Trim(); ++ if (string.IsNullOrWhiteSpace(text)) ++ { ++ return; ++ } ++ ++ if (result.Status != SpeechRecognitionResultStatus.Success || ++ result.Confidence == SpeechRecognitionConfidence.Rejected) ++ { ++ return; ++ } ++ ++ await HandleRecognizedTextAsync(text); ++ } ++ catch (Exception ex) ++ { ++ _logger.Error("Voice recognition handler failed", ex); ++ lock (_gate) ++ { ++ if (_status.Running) ++ { ++ _status = BuildErrorStatus(VoiceActivationMode.AlwaysOn, _status.SessionKey, GetUserFacingErrorMessage(ex)); ++ } ++ } ++ } ++ } ++ ++ private async Task HandleRecognizedTextAsync(string text) ++ { ++ CancellationToken cancellationToken; ++ ++ lock (_gate) ++ { ++ if (_runtimeCts == null || _status.Mode != VoiceActivationMode.AlwaysOn || !_status.Running) ++ { ++ 
return; ++ } ++ ++ if (_awaitingReply || _isSpeaking) ++ { ++ return; ++ } ++ ++ if (string.Equals(text, _lastTranscript, StringComparison.OrdinalIgnoreCase) && ++ DateTime.UtcNow - _lastTranscriptUtc < DuplicateTranscriptWindow) ++ { ++ return; ++ } ++ ++ _lastTranscript = text; ++ _lastTranscriptUtc = DateTime.UtcNow; ++ _awaitingReply = true; ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ _status.SessionKey, ++ VoiceRuntimeState.AwaitingResponse, ++ _status.LastError); ++ _status.LastUtteranceUtc = DateTime.UtcNow; ++ cancellationToken = _runtimeCts.Token; ++ } ++ ++ await StopRecognitionSessionAsync(); ++ ++ try ++ { ++ await EnsureChatTransportAsync(cancellationToken); ++ ++ OpenClawGatewayClient? client; ++ lock (_gate) ++ { ++ client = _chatClient; ++ } ++ ++ if (client == null) ++ { ++ throw new InvalidOperationException("Voice chat transport is unavailable."); ++ } ++ ++ _logger.Info($"Voice transcript captured: {text}"); ++ await client.SendChatMessageAsync(text); ++ _ = MonitorReplyTimeoutAsync(text, cancellationToken); ++ } ++ catch (Exception ex) ++ { ++ _logger.Error("Voice transcript submit failed", ex); ++ ++ lock (_gate) ++ { ++ _awaitingReply = false; ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ _status.SessionKey, ++ VoiceRuntimeState.ListeningContinuously, ++ GetUserFacingErrorMessage(ex)); ++ } ++ ++ await StartRecognitionSessionAsync(); ++ } ++ } ++ ++ private async Task MonitorReplyTimeoutAsync(string transcript, CancellationToken cancellationToken) ++ { ++ try ++ { ++ await Task.Delay(ReplyTimeout, cancellationToken); ++ ++ var shouldResume = false; ++ lock (_gate) ++ { ++ if (_awaitingReply && ++ string.Equals(_lastTranscript, transcript, StringComparison.OrdinalIgnoreCase)) ++ { ++ _awaitingReply = false; ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ _status.SessionKey, ++ VoiceRuntimeState.ListeningContinuously, ++ "Timed out waiting for an assistant reply."); ++ 
_status.LastUtteranceUtc = _status.LastUtteranceUtc; ++ shouldResume = true; ++ } ++ } ++ ++ if (shouldResume) ++ { ++ await StartRecognitionSessionAsync(); ++ } ++ } ++ catch (OperationCanceledException) ++ { ++ } ++ } ++ ++ private async void OnChatNotificationReceived(object? sender, OpenClawNotification notification) ++ { ++ if (!notification.IsChat || string.IsNullOrWhiteSpace(notification.Message)) ++ { ++ return; ++ } ++ ++ string text; ++ ++ lock (_gate) ++ { ++ if (!_awaitingReply || !_status.Running || _status.Mode != VoiceActivationMode.AlwaysOn) ++ { ++ return; ++ } ++ ++ _awaitingReply = false; ++ _isSpeaking = true; ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ _status.SessionKey, ++ VoiceRuntimeState.PlayingResponse, ++ _status.LastError); ++ text = notification.Message; ++ } ++ ++ try ++ { ++ await SpeakTextAsync(text); ++ } ++ catch (Exception ex) ++ { ++ _logger.Error("Voice reply playback failed", ex); ++ lock (_gate) ++ { ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ _status.SessionKey, ++ VoiceRuntimeState.ListeningContinuously, ++ GetUserFacingErrorMessage(ex)); ++ } ++ } ++ finally ++ { ++ lock (_gate) ++ { ++ _isSpeaking = false; ++ if (_status.Running) ++ { ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ _status.SessionKey, ++ VoiceRuntimeState.ListeningContinuously, ++ _status.LastError); ++ _status.LastUtteranceUtc = _status.LastUtteranceUtc; ++ } ++ } ++ ++ try ++ { ++ await StartRecognitionSessionAsync(); ++ } ++ catch (Exception ex) ++ { ++ _logger.Warn($"Voice recognition resume failed: {ex.Message}"); ++ } ++ } ++ } ++ ++ private async Task SpeakTextAsync(string text) ++ { ++ SpeechSynthesizer? synthesizer; ++ MediaPlayer? 
player; ++ ++ lock (_gate) ++ { ++ synthesizer = _speechSynthesizer; ++ player = _mediaPlayer; ++ } ++ ++ if (synthesizer == null || player == null) ++ { ++ throw new InvalidOperationException("Speech playback is not ready."); ++ } ++ ++ using var stream = await synthesizer.SynthesizeTextToStreamAsync(text); ++ var playbackEnded = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); ++ ++ TypedEventHandler? endedHandler = null; ++ TypedEventHandler? failedHandler = null; ++ ++ endedHandler = (sender, _) => playbackEnded.TrySetResult(true); ++ failedHandler = (sender, args) => playbackEnded.TrySetException(new InvalidOperationException(args.ErrorMessage)); ++ ++ player.MediaEnded += endedHandler; ++ player.MediaFailed += failedHandler; ++ ++ try ++ { ++ player.Source = MediaSource.CreateFromStream(stream, stream.ContentType); ++ player.Play(); ++ await playbackEnded.Task; ++ } ++ finally ++ { ++ player.MediaEnded -= endedHandler; ++ player.MediaFailed -= failedHandler; ++ player.Source = null; ++ } ++ } ++ ++ private async void OnSpeechRecognitionCompleted( ++ SpeechContinuousRecognitionSession sender, ++ SpeechContinuousRecognitionCompletedEventArgs args) ++ { ++ try ++ { ++ CancellationToken token; ++ var shouldRestart = false; ++ ++ lock (_gate) ++ { ++ if (_runtimeCts == null || _runtimeCts.IsCancellationRequested) ++ { ++ return; ++ } ++ ++ _recognitionActive = false; ++ token = _runtimeCts.Token; ++ shouldRestart = _status.Running && ++ _status.Mode == VoiceActivationMode.AlwaysOn && ++ !_awaitingReply && ++ !_isSpeaking; ++ } ++ ++ if (shouldRestart && !token.IsCancellationRequested) ++ { ++ await Task.Delay(250, token); ++ await StartRecognitionSessionAsync(); ++ } ++ } ++ catch (OperationCanceledException) ++ { ++ } ++ catch (Exception ex) ++ { ++ _logger.Warn($"Voice recognition completion handler failed: {ex.Message}"); ++ } ++ } ++ ++ private void OnChatTransportStatusChanged(object? 
sender, ConnectionStatus status) ++ { ++ lock (_gate) ++ { ++ _chatTransportStatus = status; ++ ++ if (status == ConnectionStatus.Connected) ++ { ++ _transportReadyTcs?.TrySetResult(true); ++ ++ if (_status.Running && ++ _status.Mode == VoiceActivationMode.AlwaysOn && ++ !_awaitingReply && ++ !_isSpeaking) ++ { ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ _status.SessionKey, ++ VoiceRuntimeState.ListeningContinuously, ++ _status.LastError); ++ _status.LastUtteranceUtc = _status.LastUtteranceUtc; ++ } ++ } ++ else if (status == ConnectionStatus.Error) ++ { ++ _transportReadyTcs?.TrySetException( ++ new InvalidOperationException("Voice chat transport failed to connect.")); ++ ++ if (_status.Running && _status.Mode == VoiceActivationMode.AlwaysOn) ++ { ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ _status.SessionKey, ++ VoiceRuntimeState.Arming, ++ "Voice chat transport failed."); ++ _status.LastUtteranceUtc = _status.LastUtteranceUtc; ++ } ++ } ++ else if (status == ConnectionStatus.Disconnected) ++ { ++ if (_status.Running && _status.Mode == VoiceActivationMode.AlwaysOn) ++ { ++ _status = BuildRunningStatus( ++ VoiceActivationMode.AlwaysOn, ++ _status.SessionKey, ++ VoiceRuntimeState.Arming, ++ "Voice chat transport disconnected."); ++ _status.LastUtteranceUtc = _status.LastUtteranceUtc; ++ } ++ } ++ } ++ } ++ ++ private async Task StopRuntimeResourcesAsync(bool updateStoppedStatus) ++ { ++ CancellationTokenSource? runtimeCts; ++ OpenClawGatewayClient? chatClient; ++ SpeechRecognizer? recognizer; ++ SpeechSynthesizer? synthesizer; ++ MediaPlayer? 
player; ++ var sessionKey = CurrentStatus.SessionKey; ++ ++ lock (_gate) ++ { ++ runtimeCts = _runtimeCts; ++ _runtimeCts = null; ++ ++ chatClient = _chatClient; ++ _chatClient = null; ++ _chatTransportStatus = ConnectionStatus.Disconnected; ++ _transportReadyTcs = null; ++ ++ recognizer = _speechRecognizer; ++ _speechRecognizer = null; ++ _recognitionActive = false; ++ ++ synthesizer = _speechSynthesizer; ++ _speechSynthesizer = null; ++ ++ player = _mediaPlayer; ++ _mediaPlayer = null; ++ ++ _awaitingReply = false; ++ _isSpeaking = false; ++ } ++ ++ try { runtimeCts?.Cancel(); } catch { } ++ ++ if (recognizer != null) ++ { ++ recognizer.ContinuousRecognitionSession.ResultGenerated -= OnSpeechResultGenerated; ++ recognizer.ContinuousRecognitionSession.Completed -= OnSpeechRecognitionCompleted; ++ ++ try { await recognizer.ContinuousRecognitionSession.CancelAsync(); } catch { } ++ try { recognizer.Dispose(); } catch { } ++ } ++ ++ if (player != null) ++ { ++ try { player.Pause(); } catch { } ++ try { player.Source = null; } catch { } ++ try { player.Dispose(); } catch { } ++ } ++ ++ try { synthesizer?.Dispose(); } catch { } ++ ++ if (chatClient != null) ++ { ++ chatClient.StatusChanged -= OnChatTransportStatusChanged; ++ chatClient.NotificationReceived -= OnChatNotificationReceived; ++ try { await chatClient.DisconnectAsync(); } catch { } ++ try { chatClient.Dispose(); } catch { } ++ } ++ ++ try { runtimeCts?.Dispose(); } catch { } ++ ++ if (updateStoppedStatus) ++ { ++ lock (_gate) ++ { ++ _status = BuildStoppedStatus(sessionKey, "Disposed"); ++ } ++ } ++ } ++ ++ private VoiceStatusInfo BuildRunningStatus( ++ VoiceActivationMode mode, ++ string? sessionKey, ++ VoiceRuntimeState state, ++ string? 
lastError) ++ { ++ var settings = _settings.Voice; ++ return new VoiceStatusInfo ++ { ++ Available = true, ++ Running = true, ++ Mode = mode, ++ State = state, ++ SessionKey = sessionKey, ++ InputDeviceId = settings.InputDeviceId, ++ OutputDeviceId = settings.OutputDeviceId, ++ WakeWordModelId = settings.WakeWord.ModelId, ++ WakeWordLoaded = mode == VoiceActivationMode.WakeWord, ++ LastWakeWordUtc = _status.LastWakeWordUtc, ++ LastUtteranceUtc = _status.LastUtteranceUtc, ++ LastError = lastError ++ }; ++ } ++ ++ private VoiceStatusInfo BuildStoppedStatus(string? sessionKey, string? reason) ++ { ++ var settings = _settings.Voice; ++ return new VoiceStatusInfo ++ { ++ Available = true, ++ Running = false, ++ Mode = _runtimeModeOverride ?? settings.Mode, ++ State = VoiceRuntimeState.Stopped, ++ SessionKey = sessionKey, ++ InputDeviceId = settings.InputDeviceId, ++ OutputDeviceId = settings.OutputDeviceId, ++ WakeWordModelId = settings.WakeWord.ModelId, ++ WakeWordLoaded = false, ++ LastWakeWordUtc = _status.LastWakeWordUtc, ++ LastUtteranceUtc = _status.LastUtteranceUtc, ++ LastError = reason ++ }; ++ } ++ ++ private VoiceStatusInfo BuildErrorStatus(VoiceActivationMode mode, string? sessionKey, string? 
reason) ++ { ++ var status = BuildRunningStatus(mode, sessionKey, VoiceRuntimeState.Error, reason); ++ status.Running = false; ++ return status; ++ } ++ ++ private static VoiceSettings Clone(VoiceSettings source) ++ { ++ return new VoiceSettings ++ { ++ Mode = source.Mode, ++ Enabled = source.Enabled, ++ SpeechToTextProviderId = source.SpeechToTextProviderId, ++ TextToSpeechProviderId = source.TextToSpeechProviderId, ++ InputDeviceId = source.InputDeviceId, ++ OutputDeviceId = source.OutputDeviceId, ++ SampleRateHz = source.SampleRateHz, ++ CaptureChunkMs = source.CaptureChunkMs, ++ BargeInEnabled = source.BargeInEnabled, ++ WakeWord = new VoiceWakeWordSettings ++ { ++ Engine = source.WakeWord.Engine, ++ ModelId = source.WakeWord.ModelId, ++ TriggerThreshold = source.WakeWord.TriggerThreshold, ++ TriggerCooldownMs = source.WakeWord.TriggerCooldownMs, ++ PreRollMs = source.WakeWord.PreRollMs, ++ EndSilenceMs = source.WakeWord.EndSilenceMs ++ }, ++ AlwaysOn = new VoiceAlwaysOnSettings ++ { ++ MinSpeechMs = source.AlwaysOn.MinSpeechMs, ++ EndSilenceMs = source.AlwaysOn.EndSilenceMs, ++ MaxUtteranceMs = source.AlwaysOn.MaxUtteranceMs, ++ AutoSubmit = source.AlwaysOn.AutoSubmit ++ } ++ }; ++ } ++ ++ private static VoiceStatusInfo Clone(VoiceStatusInfo source) ++ { ++ return new VoiceStatusInfo ++ { ++ Available = source.Available, ++ Running = source.Running, ++ Mode = source.Mode, ++ State = source.State, ++ SessionKey = source.SessionKey, ++ InputDeviceId = source.InputDeviceId, ++ OutputDeviceId = source.OutputDeviceId, ++ WakeWordModelId = source.WakeWordModelId, ++ WakeWordLoaded = source.WakeWordLoaded, ++ LastWakeWordUtc = source.LastWakeWordUtc, ++ LastUtteranceUtc = source.LastUtteranceUtc, ++ LastError = source.LastError ++ }; ++ } ++ ++ private static string? 
BuildProviderFallbackMessage( ++ VoiceProviderOption speechToTextProvider, ++ VoiceProviderOption textToSpeechProvider) ++ { ++ var fallbacks = new List(); ++ ++ if (!VoiceProviderCatalogService.SupportsWindowsRuntime(speechToTextProvider.Id)) ++ { ++ fallbacks.Add($"STT '{speechToTextProvider.Name}' is not implemented yet; using Windows Speech Recognition."); ++ } ++ ++ if (!VoiceProviderCatalogService.SupportsWindowsRuntime(textToSpeechProvider.Id)) ++ { ++ fallbacks.Add($"TTS '{textToSpeechProvider.Name}' is not implemented yet; using Windows Speech Synthesis."); ++ } ++ ++ return fallbacks.Count == 0 ? null : string.Join(" ", fallbacks); ++ } ++ ++ private static string GetUserFacingErrorMessage(Exception ex) ++ { ++ if (IsSpeechPrivacyDeclined(ex)) ++ { ++ return "Windows online speech recognition is disabled. Open Settings > Privacy & security > Speech and turn on Online speech recognition, then restart Voice Mode."; ++ } ++ ++ if (ex is UnauthorizedAccessException) ++ { ++ return "Microphone access is blocked. Open Settings > Privacy & security > Microphone and allow desktop apps to use the microphone."; ++ } ++ ++ return ex.Message; ++ } ++ ++ private static bool IsSpeechPrivacyDeclined(Exception ex) ++ { ++ if (ex.HResult == HResultSpeechPrivacyDeclined) ++ { ++ return true; ++ } ++ ++ return ex.Message.Contains("speech privacy policy", StringComparison.OrdinalIgnoreCase) || ++ ex.Message.Contains("online speech recognition", StringComparison.OrdinalIgnoreCase); ++ } ++} +diff --git a/src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml b/src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml +new file mode 100644 +index 0000000..57cb962 +--- /dev/null ++++ b/src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml +@@ -0,0 +1,92 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++