tiann · Shujakuinkuraudo · Apr 14, 2026
diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts
@@ -12,6 +12,19 @@ const tokenRequestSchema = z.object({
     customApiKey: z.string().optional()
 })
 
+const scribeTokenRequestSchema = z.object({ // request body of POST /voice/scribe-token
+    customApiKey: z.string().optional()
+})
+// Model identifiers accepted by POST /voice/transcribe.
+const transcriptionModelSchema = z.enum(['scribe_v1', 'scribe_v2'])
+// Lower-case language codes that normalizeTranscriptionLanguageCode may return.
+const SUPPORTED_ELEVENLABS_LANGUAGE_CODES = new Set([
+    'en', 'ja', 'zh', 'de', 'hi', 'fr', 'ko',
+    'pt', 'pt-br', 'it', 'es', 'id', 'nl', 'tr', 'pl', 'sv', 'bg',
+    'ro', 'ar', 'cs', 'el', 'fi', 'ms', 'da', 'ta', 'uk', 'ru',
+    'hu', 'hr', 'sk', 'no', 'vi', 'tl'
+])
+
 // Cache for auto-created agent IDs (keyed by API key hash)
 const agentIdCache = new Map<string, string>()
 
@@ -20,6 +33,35 @@ interface ElevenLabsAgent {
     name: string
 }
 
+interface ElevenLabsTool { // tool record shape; presumably mirrors an ElevenLabs API payload — TODO confirm
+    id: string
+    tool_config?: { // optional; both nested fields may also be absent
+        name?: string
+        type?: string
+    }
+}
+
+/**
+ * Map a client-supplied locale tag (e.g. `pt_BR`, `en-US`) onto one of the
+ * codes in SUPPORTED_ELEVENLABS_LANGUAGE_CODES.
+ *
+ * Matching is case-insensitive and treats `_` like `-`, so POSIX-style tags
+ * such as `pt_BR` resolve to `pt-br` instead of degrading to plain `pt`.
+ *
+ * @param raw - Locale tag from the request, or null when none was sent.
+ * @returns The supported code, or undefined when nothing matches.
+ */
+function normalizeTranscriptionLanguageCode(raw: string | null): string | undefined {
+    // Canonical form: trimmed, lower-case, hyphen-separated.
+    const normalized = raw?.trim().toLowerCase().replace(/_/g, '-')
+    if (!normalized) return undefined
+    if (SUPPORTED_ELEVENLABS_LANGUAGE_CODES.has(normalized)) return normalized
+    // Regional variants such as `pt-br-x` keep the Brazilian Portuguese code.
+    if (normalized.startsWith('pt-br-')) return 'pt-br'
+    // Otherwise fall back to the primary language subtag (`en-us` -> `en`).
+    const base = normalized.split('-')[0]
+    return base && SUPPORTED_ELEVENLABS_LANGUAGE_CODES.has(base) ? base : undefined
+}
 /**
  * Find an existing "Hapi Voice Assistant" agent
  */
@@ -193,5 +235,113 @@ export function createVoiceRoutes(): Hono<WebAppEnv> {
         }
     })
 
+    // Proxy an uploaded audio clip to the ElevenLabs speech-to-text endpoint
+    // using the server-side API key; replies with `{ text, languageCode }`.
+    app.post('/voice/transcribe', async (c) => {
+        const form = await c.req.formData().catch(() => null)
+        if (!form) {
+            return c.json({ error: 'Invalid form data' }, 400)
+        }
+
+        const audio = form.get('file')
+        if (!(audio instanceof File)) {
+            return c.json({ error: 'Missing audio file' }, 400)
+        }
+
+        // A missing or non-string modelId falls back to scribe_v2.
+        const requestedModel = form.get('modelId')
+        const model = transcriptionModelSchema.safeParse(
+            typeof requestedModel === 'string' ? requestedModel : 'scribe_v2'
+        )
+        if (!model.success) {
+            return c.json({ error: 'Invalid modelId' }, 400)
+        }
+
+        const apiKey = process.env.ELEVENLABS_API_KEY
+        if (!apiKey) {
+            return c.json({ error: 'ElevenLabs API key not configured' }, 400)
+        }
+
+        const requestedLanguage = form.get('languageCode')
+        let languageCode: string | undefined
+        if (typeof requestedLanguage === 'string') {
+            languageCode = normalizeTranscriptionLanguageCode(requestedLanguage)
+        }
+
+        const upstreamBody = new FormData()
+        upstreamBody.set('model_id', model.data)
+        upstreamBody.set('file', audio, audio.name || 'speech.webm')
+        // The language hint is only forwarded when the model is scribe_v2.
+        if (model.data === 'scribe_v2' && languageCode) {
+            upstreamBody.set('language_code', languageCode)
+        }
+
+        try {
+            const upstream = await fetch(`${ELEVENLABS_API_BASE}/speech-to-text`, {
+                method: 'POST',
+                headers: { 'xi-api-key': apiKey, 'Accept': 'application/json' },
+                body: upstreamBody
+            })
+
+            if (upstream.ok) {
+                const payload = await upstream.json() as { text?: string; language_code?: string }
+                return c.json({ text: payload.text ?? '', languageCode: payload.language_code })
+            }
+
+            // Surface the upstream message when present, else the HTTP status.
+            const failure = await upstream.json().catch(() => ({})) as { detail?: { message?: string } | string; error?: string }
+            const detail = failure.detail
+            const message = typeof detail === 'string'
+                ? detail
+                : detail?.message || failure.error || `ElevenLabs API error: ${upstream.status}`
+            return c.json({ error: message }, 500)
+        } catch (error) {
+            const message = error instanceof Error ? error.message : 'Network error'
+            return c.json({ error: message }, 500)
+        }
+    })
+
+    // Mint a single-use ElevenLabs realtime-scribe token. A caller-supplied
+    // customApiKey takes precedence over the ELEVENLABS_API_KEY env variable.
+    app.post('/voice/scribe-token', async (c) => {
+        // An unparseable or absent JSON body is validated as an empty object.
+        const body = await c.req.json().catch(() => null)
+        const request = scribeTokenRequestSchema.safeParse(body ?? {})
+        if (!request.success) {
+            return c.json({ error: 'Invalid request body' }, 400)
+        }
+
+        const apiKey = request.data.customApiKey || process.env.ELEVENLABS_API_KEY
+        if (!apiKey) {
+            return c.json({ error: 'ElevenLabs API key not configured' }, 400)
+        }
+
+        try {
+            const upstream = await fetch(`${ELEVENLABS_API_BASE}/single-use-token/realtime_scribe`, {
+                method: 'POST',
+                headers: { 'xi-api-key': apiKey, 'Accept': 'application/json' }
+            })
+
+            if (!upstream.ok) {
+                const failure = await upstream.json().catch(() => ({})) as { detail?: { message?: string } | string; error?: string }
+                const detail = failure.detail
+                const message = typeof detail === 'string'
+                    ? detail
+                    : detail?.message || failure.error || `ElevenLabs API error: ${upstream.status}`
+                return c.json({ error: message }, 500)
+            }
+
+            const payload = await upstream.json() as { token?: string }
+            if (!payload.token) {
+                return c.json({ error: 'No token in ElevenLabs response' }, 500)
+            }
+
+            return c.json({ token: payload.token })
+        } catch (error) {
+            const message = error instanceof Error ? error.message : 'Network error'
+            return c.json({ error: message }, 500)
+        }
+    })
+
     return app
 }
diff --git a/web/src/api/client.ts b/web/src/api/client.ts
@@ -19,6 +19,8 @@ import type {
     SpawnResponse,
     UploadFileResponse,
     VisibilityPayload,
+    VoiceScribeTokenResponse,
+    VoiceTranscriptionResponse,
     SessionResponse,
     SessionsResponse
 } from '@/types/api'
@@ -94,7 +96,7 @@ export class ApiClient {
         if (authToken) {
             headers.set('authorization', `Bearer ${authToken}`)
         }
-        if (init?.body !== undefined && !headers.has('content-type')) {
+        if (init?.body !== undefined && !(init.body instanceof FormData) && !headers.has('content-type')) {
             headers.set('content-type', 'application/json')
         }
 
@@ -443,4 +445,28 @@ export class ApiClient {
             body: JSON.stringify(options || {})
         })
     }
+
+    /**
+     * POST an audio file to `/api/voice/transcribe` as a FormData body.
+     * `modelId` defaults to `scribe_v2`; `languageCode` is sent only when truthy.
+     */
+    async transcribeVoice(
+        file: File,
+        options?: { modelId?: 'scribe_v1' | 'scribe_v2'; languageCode?: string }
+    ): Promise<VoiceTranscriptionResponse> {
+        const modelId = options?.modelId ?? 'scribe_v2'
+        const languageCode = options?.languageCode
+        const payload = new FormData()
+        payload.set('file', file)
+        payload.set('modelId', modelId)
+        if (languageCode) payload.set('languageCode', languageCode)
+        return await this.request('/api/voice/transcribe', { method: 'POST', body: payload })
+    }
+
+    /**
+     * POST an empty JSON object to `/api/voice/scribe-token` via `this.request`.
+     */
+    async fetchVoiceScribeToken(): Promise<VoiceScribeTokenResponse> {
+        return await this.request('/api/voice/scribe-token', { method: 'POST', body: JSON.stringify({}) })
+    }
 }
diff --git a/web/src/components/AssistantChat/HappyComposer.tsx b/web/src/components/AssistantChat/HappyComposer.tsx
@@ -12,6 +12,7 @@ import {
     useRef,
     useState
 } from 'react'
+import type { ApiClient } from '@/api/client'
 import type { AgentState, CodexCollaborationMode, PermissionMode } from '@/types/api'
 import type { Suggestion } from '@/hooks/useActiveSuggestions'
 import type { ConversationStatus } from '@/realtime/types'
@@ -22,6 +23,10 @@ import { usePlatform } from '@/hooks/usePlatform'
 import { usePWAInstall } from '@/hooks/usePWAInstall'
 import { supportsEffort, supportsModelChange } from '@hapi/protocol'
 import { markSkillUsed } from '@/lib/recent-skills'
+import { useComposerDraft } from '@/hooks/useComposerDraft'
+import { useElevenLabsTranscription } from '@/hooks/useElevenLabsTranscription'
+import { useSpeechToText } from '@/hooks/useSpeechToText'
+import { useVoiceMode } from '@/hooks/useVoiceMode'
 import { FloatingOverlay } from '@/components/ChatInput/FloatingOverlay'
 import { Autocomplete } from '@/components/ChatInput/Autocomplete'
 import { StatusBar } from '@/components/AssistantChat/StatusBar'
@@ -64,6 +69,7 @@ export function HappyComposer(props: {
     terminalUnsupported?: boolean
     autocompletePrefixes?: string[]
     autocompleteSuggestions?: (query: string) => Promise<Suggestion[]>
+    voiceTranscriptionApi?: ApiClient
     // Voice assistant props
     voiceStatus?: ConversationStatus
     voiceMicMuted?: boolean
@@ -96,6 +102,7 @@ export function HappyComposer(props: {
         terminalUnsupported = false,
         autocompletePrefixes = ['@', '/', '$'],
         autocompleteSuggestions = defaultSuggestionHandler,
+        voiceTranscriptionApi,
         voiceStatus = 'disconnected',
         voiceMicMuted = false,
         onVoiceToggle,
@@ -165,6 +172,7 @@ export function HappyComposer(props: {
     }, [controlledByUser])
 
     const { haptic: platformHaptic, isTouch } = usePlatform()
+    const { voiceMode } = useVoiceMode()
     const { isStandalone, isIOS } = usePWAInstall()
     const isIOSPWA = isIOS && isStandalone
     const bottomPaddingClass = isIOSPWA ? 'pb-0' : 'pb-3'
@@ -185,6 +193,34 @@ export function HappyComposer(props: {
         }
     }, [platformHaptic])
 
+    const dictation = useSpeechToText({
+        getCurrentText: () => composerText,
+        onTextChange: (text) => api.composer().setText(text)
+    })
+    const elevenLabsDictation = useElevenLabsTranscription({
+        api: voiceTranscriptionApi ?? null,
+        getCurrentText: () => composerText,
+        onTextChange: (text) => api.composer().setText(text)
+    })
+
+    const effectiveVoiceStatus = voiceMode === 'dictation-local'
+        ? dictation.status
+        : voiceMode === 'dictation-elevenlabs'
+            ? elevenLabsDictation.status
+            : voiceStatus
+    const effectiveVoiceEnabled = voiceMode === 'dictation-local'
+        ? dictation.supported
+        : voiceMode === 'dictation-elevenlabs'
+            ? elevenLabsDictation.supported
+            : Boolean(onVoiceToggle)
+    const effectiveVoiceMicMuted = voiceMode === 'assistant' ? voiceMicMuted : false
+    const effectiveOnVoiceToggle = voiceMode === 'dictation-local'
+        ? dictation.toggle
+        : voiceMode === 'dictation-elevenlabs'
+            ? elevenLabsDictation.toggle
+            : onVoiceToggle
+    const effectiveOnVoiceMicToggle = voiceMode === 'assistant' ? onVoiceMicToggle : undefined
+
     const handleSuggestionSelect = useCallback((index: number) => {
         const suggestion = suggestions[index]
         if (!suggestion || !textareaRef.current) return
@@ -483,7 +519,7 @@ export function HappyComposer(props: {
         || showEffortSettings
     )
     const showAbortButton = true
-    const voiceEnabled = Boolean(onVoiceToggle)
+    const voiceEnabled = effectiveVoiceEnabled
 
     const handleSend = useCallback(() => {
         api.composer().send()
@@ -759,7 +795,7 @@ export function HappyComposer(props: {
                         permissionMode={permissionMode}
                         collaborationMode={collaborationMode}
                         agentFlavor={agentFlavor}
-                        voiceStatus={voiceStatus}
+                        voiceStatus={effectiveVoiceStatus}
                     />
 
                     <div className="overflow-hidden rounded-[20px] bg-[var(--app-secondary-bg)]">
@@ -804,10 +840,10 @@ export function HappyComposer(props: {
                             isSwitching={isSwitching}
                             onSwitch={handleSwitch}
                             voiceEnabled={voiceEnabled}
-                            voiceStatus={voiceStatus}
-                            voiceMicMuted={voiceMicMuted}
-                            onVoiceToggle={onVoiceToggle ?? (() => {})}
-                            onVoiceMicToggle={onVoiceMicToggle}
+                            voiceStatus={effectiveVoiceStatus}
+                            voiceMicMuted={effectiveVoiceMicMuted}
+                            onVoiceToggle={effectiveOnVoiceToggle ?? (() => {})}
+                            onVoiceMicToggle={effectiveOnVoiceMicToggle}
                             onSend={handleSend}
                         />
                     </div>

diff --git a/web/src/components/SessionChat.tsx b/web/src/components/SessionChat.tsx
@@ -419,6 +419,7 @@ export function SessionChat(props: {
                         onTerminal={props.session.active && terminalSupported ? handleViewTerminal : undefined}
                         terminalUnsupported={props.session.active && !terminalSupported}
                         autocompleteSuggestions={props.autocompleteSuggestions}
+                        voiceTranscriptionApi={props.api}
                         voiceStatus={voice?.status}
                         voiceMicMuted={voice?.micMuted}
                         onVoiceToggle={voice ? handleVoiceToggle : undefined}