Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions hub/src/web/routes/voice.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@ const tokenRequestSchema = z.object({
customApiKey: z.string().optional()
})

// Body accepted by POST /voice/scribe-token. When `customApiKey` is present and
// non-empty the route uses it instead of process.env.ELEVENLABS_API_KEY.
const scribeTokenRequestSchema = z.object({
customApiKey: z.string().optional()
})

// Values accepted for the `modelId` form field of POST /voice/transcribe.
// The route substitutes 'scribe_v2' when the field is not a string.
const transcriptionModelSchema = z.enum(['scribe_v1', 'scribe_v2'])

// Language codes that normalizeTranscriptionLanguageCode is allowed to return.
// Entries are lowercase because lookups are made on lowercased input; a tag that
// cannot be reduced to one of these is dropped and no `language_code` is sent.
// NOTE(review): presumably mirrors the list published by ElevenLabs — confirm
// against the upstream documentation when adding or removing entries.
const SUPPORTED_ELEVENLABS_LANGUAGE_CODES = new Set([
'en', 'ja', 'zh', 'de', 'hi', 'fr', 'ko',
'pt', 'pt-br', 'it', 'es', 'id', 'nl', 'tr', 'pl', 'sv', 'bg',
'ro', 'ar', 'cs', 'el', 'fi', 'ms', 'da', 'ta', 'uk', 'ru',
'hu', 'hr', 'sk', 'no', 'vi', 'tl'
])

// Cache for auto-created agent IDs (keyed by API key hash)
const agentIdCache = new Map<string, string>()

Expand All @@ -20,6 +33,35 @@ interface ElevenLabsAgent {
name: string
}

/**
 * A tool record: a required `id` plus an optional `tool_config` carrying the
 * tool's `name` and `type`.
 * NOTE(review): presumably the shape returned by the ElevenLabs tools API;
 * its consumers are outside this hunk — confirm before relying on the optional fields.
 */
interface ElevenLabsTool {
id: string
tool_config?: {
name?: string
type?: string
}
}

/**
 * Reduce a caller-supplied locale tag to a code listed in
 * SUPPORTED_ELEVENLABS_LANGUAGE_CODES.
 *
 * Matching ignores case and surrounding whitespace and treats `_` and `-` as
 * the same subtag separator, so `pt_BR`, `pt-BR` and `PT-br` all resolve alike.
 * Resolution order:
 *   1. the whole tag, if it is supported as-is (e.g. `ja`, `pt-br`);
 *   2. `pt-br` for longer Brazilian-Portuguese tags (e.g. `pt-BR-x-foo`);
 *   3. the primary subtag, if supported (e.g. `en-US` -> `en`).
 *
 * @param raw - Locale tag such as `en-US`, `pt_BR` or `ja`; `null` or blank
 *   means the caller did not specify a language.
 * @returns The supported lowercase code, or `undefined` when nothing matches.
 */
function normalizeTranscriptionLanguageCode(raw: string | null): string | undefined {
    if (!raw) return undefined

    // Canonical form: trimmed, lowercase, hyphen-separated. Folding `_` into `-`
    // here lets underscore locales (`pt_BR`) reach the regional match below
    // instead of collapsing to their primary subtag.
    const normalized = raw.trim().toLowerCase().replace(/_/g, '-')
    if (!normalized) return undefined

    if (SUPPORTED_ELEVENLABS_LANGUAGE_CODES.has(normalized)) {
        return normalized
    }

    // `pt-br` is the only supported code with a region subtag, so longer tags
    // built on it must be matched before falling back to the primary subtag.
    if (normalized.startsWith('pt-br-')) {
        return 'pt-br'
    }

    const base = normalized.split('-')[0]
    if (base && SUPPORTED_ELEVENLABS_LANGUAGE_CODES.has(base)) {
        return base
    }

    return undefined
}
/**
* Find an existing "Hapi Voice Assistant" agent
*/
Expand Down Expand Up @@ -193,5 +235,113 @@ export function createVoiceRoutes(): Hono<WebAppEnv> {
}
})

// POST /voice/transcribe
// Accepts multipart form data (`file`, optional `modelId`, optional
// `languageCode`), forwards the audio to the ElevenLabs speech-to-text endpoint
// using the server-side API key, and replies with `{ text, languageCode }`.
// Validation failures and a missing API key answer 400; upstream and network
// failures answer 500 with the best available message.
app.post('/voice/transcribe', async (c) => {
    const form = await c.req.formData().catch(() => null)
    if (!form) {
        return c.json({ error: 'Invalid form data' }, 400)
    }

    const audio = form.get('file')
    if (!(audio instanceof File)) {
        return c.json({ error: 'Missing audio file' }, 400)
    }

    // A missing or non-string modelId falls back to the default model.
    const requestedModel = form.get('modelId')
    const model = transcriptionModelSchema.safeParse(
        typeof requestedModel === 'string' ? requestedModel : 'scribe_v2'
    )
    if (!model.success) {
        return c.json({ error: 'Invalid modelId' }, 400)
    }

    const apiKey = process.env.ELEVENLABS_API_KEY
    if (!apiKey) {
        return c.json({ error: 'ElevenLabs API key not configured' }, 400)
    }

    const upstreamForm = new FormData()
    upstreamForm.set('model_id', model.data)
    upstreamForm.set('file', audio, audio.name || 'speech.webm')

    // The language hint is forwarded only for scribe_v2, and only when the
    // requested tag normalizes to a supported code.
    const requestedLanguage = form.get('languageCode')
    let languageCode: string | undefined
    if (typeof requestedLanguage === 'string') {
        languageCode = normalizeTranscriptionLanguageCode(requestedLanguage)
    }
    if (model.data === 'scribe_v2' && languageCode) {
        upstreamForm.set('language_code', languageCode)
    }

    try {
        const upstream = await fetch(`${ELEVENLABS_API_BASE}/speech-to-text`, {
            method: 'POST',
            headers: {
                'xi-api-key': apiKey,
                'Accept': 'application/json'
            },
            body: upstreamForm
        })

        if (upstream.ok) {
            const payload = await upstream.json() as { text?: string; language_code?: string }
            return c.json({
                text: payload.text ?? '',
                languageCode: payload.language_code
            })
        }

        // Error bodies may carry `detail` as a string or as `{ message }`, or a
        // plain `error` field; an unparsable body is treated as empty.
        const failure = await upstream.json().catch(() => ({})) as { detail?: { message?: string } | string; error?: string }
        let message: string
        if (typeof failure.detail === 'string') {
            message = failure.detail
        } else {
            message = failure.detail?.message || failure.error || `ElevenLabs API error: ${upstream.status}`
        }
        return c.json({ error: message }, 500)
    } catch (error) {
        const message = error instanceof Error ? error.message : 'Network error'
        return c.json({ error: message }, 500)
    }
})

// POST /voice/scribe-token
// Requests a token from the ElevenLabs `single-use-token/realtime_scribe`
// endpoint and replies with `{ token }`. The key is the caller's
// `customApiKey` when given, otherwise the server-side ELEVENLABS_API_KEY.
// An invalid body or missing key answers 400; upstream and network failures
// answer 500.
app.post('/voice/scribe-token', async (c) => {
    // A missing or unparsable JSON body is treated as an empty object.
    const body = await c.req.json().catch(() => null)
    const request = scribeTokenRequestSchema.safeParse(body ?? {})
    if (!request.success) {
        return c.json({ error: 'Invalid request body' }, 400)
    }

    // `||` rather than `??`: an empty customApiKey also falls back to the env key.
    const apiKey = request.data.customApiKey || process.env.ELEVENLABS_API_KEY
    if (!apiKey) {
        return c.json({ error: 'ElevenLabs API key not configured' }, 400)
    }

    try {
        const upstream = await fetch(`${ELEVENLABS_API_BASE}/single-use-token/realtime_scribe`, {
            method: 'POST',
            headers: {
                'xi-api-key': apiKey,
                'Accept': 'application/json'
            }
        })

        if (upstream.ok) {
            const payload = await upstream.json() as { token?: string }
            if (payload.token) {
                return c.json({ token: payload.token })
            }
            return c.json({ error: 'No token in ElevenLabs response' }, 500)
        }

        // Error bodies may carry `detail` as a string or as `{ message }`, or a
        // plain `error` field; an unparsable body is treated as empty.
        const failure = await upstream.json().catch(() => ({})) as { detail?: { message?: string } | string; error?: string }
        let message: string
        if (typeof failure.detail === 'string') {
            message = failure.detail
        } else {
            message = failure.detail?.message || failure.error || `ElevenLabs API error: ${upstream.status}`
        }
        return c.json({ error: message }, 500)
    } catch (error) {
        const message = error instanceof Error ? error.message : 'Network error'
        return c.json({ error: message }, 500)
    }
})

return app
}
28 changes: 27 additions & 1 deletion web/src/api/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ import type {
SpawnResponse,
UploadFileResponse,
VisibilityPayload,
VoiceScribeTokenResponse,
VoiceTranscriptionResponse,
SessionResponse,
SessionsResponse
} from '@/types/api'
Expand Down Expand Up @@ -94,7 +96,7 @@ export class ApiClient {
if (authToken) {
headers.set('authorization', `Bearer ${authToken}`)
}
if (init?.body !== undefined && !headers.has('content-type')) {
if (init?.body !== undefined && !(init.body instanceof FormData) && !headers.has('content-type')) {
headers.set('content-type', 'application/json')
}

Expand Down Expand Up @@ -443,4 +445,28 @@ export class ApiClient {
body: JSON.stringify(options || {})
})
}

/**
 * Upload an audio file to `/api/voice/transcribe` as multipart form data.
 *
 * @param file - Audio to transcribe; sent as the `file` field.
 * @param options - Optional `modelId` (defaults to `scribe_v2`) and
 *   `languageCode`; the language field is omitted when empty or absent.
 * @returns The parsed transcription response.
 */
async transcribeVoice(
    file: File,
    options?: { modelId?: 'scribe_v1' | 'scribe_v2'; languageCode?: string }
): Promise<VoiceTranscriptionResponse> {
    const modelId = options?.modelId ?? 'scribe_v2'
    const languageCode = options?.languageCode

    const payload = new FormData()
    payload.set('file', file)
    payload.set('modelId', modelId)
    if (languageCode) {
        payload.set('languageCode', languageCode)
    }

    const init = { method: 'POST', body: payload }
    return await this.request('/api/voice/transcribe', init)
}

/**
 * POST an empty JSON object to `/api/voice/scribe-token` and return the
 * parsed response. No custom API key is sent.
 */
async fetchVoiceScribeToken(): Promise<VoiceScribeTokenResponse> {
    const init = { method: 'POST', body: JSON.stringify({}) }
    return await this.request('/api/voice/scribe-token', init)
}
}
48 changes: 42 additions & 6 deletions web/src/components/AssistantChat/HappyComposer.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import {
useRef,
useState
} from 'react'
import type { ApiClient } from '@/api/client'
import type { AgentState, CodexCollaborationMode, PermissionMode } from '@/types/api'
import type { Suggestion } from '@/hooks/useActiveSuggestions'
import type { ConversationStatus } from '@/realtime/types'
Expand All @@ -22,6 +23,10 @@ import { usePlatform } from '@/hooks/usePlatform'
import { usePWAInstall } from '@/hooks/usePWAInstall'
import { supportsEffort, supportsModelChange } from '@hapi/protocol'
import { markSkillUsed } from '@/lib/recent-skills'
import { useComposerDraft } from '@/hooks/useComposerDraft'
import { useElevenLabsTranscription } from '@/hooks/useElevenLabsTranscription'
import { useSpeechToText } from '@/hooks/useSpeechToText'
import { useVoiceMode } from '@/hooks/useVoiceMode'
import { FloatingOverlay } from '@/components/ChatInput/FloatingOverlay'
import { Autocomplete } from '@/components/ChatInput/Autocomplete'
import { StatusBar } from '@/components/AssistantChat/StatusBar'
Expand Down Expand Up @@ -64,6 +69,7 @@ export function HappyComposer(props: {
terminalUnsupported?: boolean
autocompletePrefixes?: string[]
autocompleteSuggestions?: (query: string) => Promise<Suggestion[]>
voiceTranscriptionApi?: ApiClient
// Voice assistant props
voiceStatus?: ConversationStatus
voiceMicMuted?: boolean
Expand Down Expand Up @@ -96,6 +102,7 @@ export function HappyComposer(props: {
terminalUnsupported = false,
autocompletePrefixes = ['@', '/', '$'],
autocompleteSuggestions = defaultSuggestionHandler,
voiceTranscriptionApi,
voiceStatus = 'disconnected',
voiceMicMuted = false,
onVoiceToggle,
Expand Down Expand Up @@ -165,6 +172,7 @@ export function HappyComposer(props: {
}, [controlledByUser])

const { haptic: platformHaptic, isTouch } = usePlatform()
const { voiceMode } = useVoiceMode()
const { isStandalone, isIOS } = usePWAInstall()
const isIOSPWA = isIOS && isStandalone
const bottomPaddingClass = isIOSPWA ? 'pb-0' : 'pb-3'
Expand All @@ -185,6 +193,34 @@ export function HappyComposer(props: {
}
}, [platformHaptic])

// Both dictation hooks are called unconditionally on every render, whatever
// `voiceMode` is; only the selections further down depend on the mode.
// Each hook reads the current composer text and writes updates back through
// the composer API.
const dictation = useSpeechToText({
getCurrentText: () => composerText,
onTextChange: (text) => api.composer().setText(text)
})
// `voiceTranscriptionApi` is an optional prop, so the hook receives `null`
// when the parent did not provide a client.
const elevenLabsDictation = useElevenLabsTranscription({
api: voiceTranscriptionApi ?? null,
getCurrentText: () => composerText,
onTextChange: (text) => api.composer().setText(text)
})

// Pick the status / availability / toggle handler for the active voice mode:
// 'dictation-local' -> useSpeechToText, 'dictation-elevenlabs' ->
// useElevenLabsTranscription, any other mode -> the voice props passed in by
// the parent.
const effectiveVoiceStatus = voiceMode === 'dictation-local'
? dictation.status
: voiceMode === 'dictation-elevenlabs'
? elevenLabsDictation.status
: voiceStatus
const effectiveVoiceEnabled = voiceMode === 'dictation-local'
? dictation.supported
: voiceMode === 'dictation-elevenlabs'
? elevenLabsDictation.supported
: Boolean(onVoiceToggle)
// Mic mute state and its toggle are passed through only when the mode is
// exactly 'assistant'; in every other mode they are false / undefined.
const effectiveVoiceMicMuted = voiceMode === 'assistant' ? voiceMicMuted : false
const effectiveOnVoiceToggle = voiceMode === 'dictation-local'
? dictation.toggle
: voiceMode === 'dictation-elevenlabs'
? elevenLabsDictation.toggle
: onVoiceToggle
const effectiveOnVoiceMicToggle = voiceMode === 'assistant' ? onVoiceMicToggle : undefined

const handleSuggestionSelect = useCallback((index: number) => {
const suggestion = suggestions[index]
if (!suggestion || !textareaRef.current) return
Expand Down Expand Up @@ -483,7 +519,7 @@ export function HappyComposer(props: {
|| showEffortSettings
)
const showAbortButton = true
const voiceEnabled = Boolean(onVoiceToggle)
const voiceEnabled = effectiveVoiceEnabled

const handleSend = useCallback(() => {
api.composer().send()
Expand Down Expand Up @@ -759,7 +795,7 @@ export function HappyComposer(props: {
permissionMode={permissionMode}
collaborationMode={collaborationMode}
agentFlavor={agentFlavor}
voiceStatus={voiceStatus}
voiceStatus={effectiveVoiceStatus}
/>

<div className="overflow-hidden rounded-[20px] bg-[var(--app-secondary-bg)]">
Expand Down Expand Up @@ -804,10 +840,10 @@ export function HappyComposer(props: {
isSwitching={isSwitching}
onSwitch={handleSwitch}
voiceEnabled={voiceEnabled}
voiceStatus={voiceStatus}
voiceMicMuted={voiceMicMuted}
onVoiceToggle={onVoiceToggle ?? (() => {})}
onVoiceMicToggle={onVoiceMicToggle}
voiceStatus={effectiveVoiceStatus}
voiceMicMuted={effectiveVoiceMicMuted}
onVoiceToggle={effectiveOnVoiceToggle ?? (() => {})}
onVoiceMicToggle={effectiveOnVoiceMicToggle}
onSend={handleSend}
/>
</div>
Expand Down
1 change: 1 addition & 0 deletions web/src/components/SessionChat.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ export function SessionChat(props: {
onTerminal={props.session.active && terminalSupported ? handleViewTerminal : undefined}
terminalUnsupported={props.session.active && !terminalSupported}
autocompleteSuggestions={props.autocompleteSuggestions}
voiceTranscriptionApi={props.api}
voiceStatus={voice?.status}
voiceMicMuted={voice?.micMuted}
onVoiceToggle={voice ? handleVoiceToggle : undefined}
Expand Down
Loading