Skip to content

Commit c869d8b

Browse files
authored
Merge pull request #1 from HumeAI/twitchard/streaming-and-pauses
Add support for streaming and pauses
2 parents: acd2f3f + 555e638 · commit c869d8b

File tree

7 files changed

+456
-230
lines changed

7 files changed

+456
-230
lines changed

bun.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@
2626
"bun": "^1.2.2",
2727
"clipanion": "^4.0.0-rc.4",
2828
"debug": "^4.4.0",
29-
"hume": "^0.9.12",
29+
"hume": "^0.9.15",
3030
"open": "^10.1.0",
3131
"typanion": "^3.14.0"
3232
},

src/config.ts

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,9 @@ export type ConfigData = {
2929
lastIndex?: number;
3030
playCommand?: string;
3131
presetVoice?: boolean;
32+
speed?: number;
33+
trailingSilence?: number;
34+
streaming?: boolean;
3235
};
3336
json?: boolean;
3437
pretty?: boolean;
@@ -45,6 +48,9 @@ export const configValidators = {
4548
'tts.format': t.isEnum(['wav', 'mp3', 'pcm'] as const),
4649
'tts.playCommand': t.isString(),
4750
'tts.presetVoice': t.isBoolean(),
51+
'tts.speed': t.cascade(t.isNumber(), t.isInInclusiveRange(0.25, 3.0)),
52+
'tts.trailingSilence': t.cascade(t.isNumber(), t.isInInclusiveRange(0.0, 5.0)),
53+
'tts.streaming': t.isBoolean(),
4854
json: t.isBoolean(),
4955
pretty: t.isBoolean(),
5056
apiKey: t.isString(),
@@ -160,6 +166,10 @@ const parseConfigKV = (name: keyof typeof configValidators, value: string): unkn
160166
validValues = '\nValid values: "all", "first", or "off"';
161167
} else if (name === 'tts.format') {
162168
validValues = '\nValid values: "wav", "mp3", or "pcm"';
169+
} else if (name === 'tts.speed') {
170+
validValues = '\nValid values: number between 0.25 and 3.0';
171+
} else if (name === 'tts.trailingSilence') {
172+
validValues = '\nValid values: number between 0.0 and 5.0';
163173
}
164174

165175
throw new Error(`Invalid value for ${name}: "${value}"${validValues}\n${errors.join('\n')}`);

src/e2e.test.ts

Lines changed: 46 additions & 93 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,17 @@ import { mkdir, mkdtemp, rm } from 'fs/promises';
44
import { join } from 'path';
55
import { tmpdir } from 'os';
66
import { existsSync } from 'fs';
7+
import type { Snippet as Snippet_ } from 'hume/serialization/resources/tts/types';
8+
import type { Hume } from 'hume';
9+
10+
type Snippet = Hume.tts.Snippet;
11+
type RawSnippet = Snippet_.Raw;
712

813
// Test utility function for logging during tests
914
// Only logs when BUN_TEST_VERBOSE=1 is set
10-
function log(message: string): void {
15+
function log(...args: any[]): void {
1116
if (process.env.BUN_TEST_VERBOSE === '1') {
12-
console.log(message);
17+
console.log(...args);
1318
}
1419
}
1520

@@ -24,7 +29,7 @@ class TestEnvironment {
2429

2530
async setup() {
2631
await this.server.start();
27-
this.server.setupDefaultTtsHandler();
32+
this.server.setupDefaultTtsStreamHandler();
2833
this.apiUrl = this.server.getBaseUrl();
2934

3035
// Set up test filesystem
@@ -83,7 +88,7 @@ class TestEnvironment {
8388
* Get the TTS API requests specifically
8489
*/
8590
getTtsRequests() {
86-
return this.server.findRequestsTo('/v0/tts');
91+
return this.server.findRequestsTo('/v0/tts/stream/json');
8792
}
8893

8994
/**
@@ -195,20 +200,7 @@ class TestEnvironment {
195200
}
196201

197202
interface MockTtsOptions {
198-
generations?: Array<{
199-
generation_id?: string;
200-
audio?: string;
201-
duration?: number;
202-
file_size?: number;
203-
format?: { type: string };
204-
sample_rate?: number;
205-
encoding?: {
206-
type: string;
207-
format: string;
208-
sample_rate: number;
209-
};
210-
snippets?: any[];
211-
}>;
203+
snippets?: Array<RawSnippet>;
212204
error?: {
213205
status: number;
214206
message: string;
@@ -332,13 +324,13 @@ class MockHumeServer {
332324

333325
configureTtsResponse(options: MockTtsOptions) {
334326
this.ttsOptions = options;
335-
this.setupDefaultTtsHandler();
327+
this.setupDefaultTtsStreamHandler();
336328
}
337329

338330
// Default TTS response handler
339-
setupDefaultTtsHandler() {
331+
setupDefaultTtsStreamHandler() {
340332
// Handle TTS API requests - actual path used by the client
341-
this.addHandler('/v0/tts', async (req) => {
333+
this.addHandler('/v0/tts/stream/json', async (req) => {
342334
try {
343335
const body = await req.json();
344336

@@ -350,34 +342,28 @@ class MockHumeServer {
350342
});
351343
}
352344

353-
// Determine number of generations to return
354345
const numGenerations = body.numGenerations || 1;
355346

356-
// If specific generations are provided in options, use those
357-
let generations;
358-
if (this.ttsOptions.generations && this.ttsOptions.generations.length > 0) {
359-
generations = this.ttsOptions.generations;
347+
let snippets;
348+
if (this.ttsOptions.snippets && this.ttsOptions.snippets.length > 0) {
349+
snippets = this.ttsOptions.snippets;
360350
} else {
361351
// Otherwise create default mock generations
362352
const mockAudio = Buffer.from('mock-audio-data').toString('base64');
363353

364-
generations = Array.from({ length: numGenerations }, (_, i) => ({
354+
snippets = Array.from({ length: numGenerations }, (_, i) => ({
365355
generation_id: `mock_gen_${i + 1}`,
366356
audio: mockAudio,
367-
duration: 1.5, // in seconds
368-
file_size: 100, // in bytes
369-
format: { type: 'wav' },
370-
sample_rate: 44100,
371-
encoding: {
372-
type: 'base64',
373-
format: 'wav',
374-
sample_rate: 44100,
375-
},
376-
snippets: [],
357+
id: `mock_snippet_${i + 1}`,
358+
text: 'mock text',
359+
utteranceIndex: 0,
377360
}));
378361
}
379362

380-
return Response.json({ generations });
363+
return new Response(snippets!.map((x) => JSON.stringify(x) + '\n').join(''), {
364+
status: 200,
365+
headers: { 'Content-Type': 'text-plain; charset=utf-8' },
366+
});
381367
} catch (error) {
382368
log(`Error in mock handler: ${error}`);
383369
return new Response(JSON.stringify({ error: 'Internal server error' }), {
@@ -467,24 +453,17 @@ describe('CLI End-to-End Tests', () => {
467453

468454
// Helper functions
469455
// Use NonNullable to ensure TypeScript knows we're accessing a valid type
470-
const createGeneration = (
471-
id: string,
472-
options: Partial<NonNullable<MockTtsOptions['generations']>[0]> = {}
473-
) => ({
474-
generation_id: id,
475-
audio: Buffer.from(`audio-data-${id}`).toString('base64'),
476-
duration: 1.5,
477-
file_size: 200,
478-
format: { type: 'wav' },
479-
sample_rate: 44100,
480-
encoding: {
481-
type: 'base64',
482-
format: 'wav',
483-
sample_rate: 44100,
484-
},
485-
snippets: [],
486-
...options,
487-
});
456+
const createSnippet = (partial: Partial<Snippet>): RawSnippet => {
457+
const generationId = partial.generationId ?? 'test_gen_123';
458+
const id = partial.id ?? `${generationId}-0`;
459+
return {
460+
generation_id: generationId,
461+
id,
462+
audio: Buffer.from(`audio-data-${generationId}-${id}`).toString('base64'),
463+
text: partial.text ?? 'test text',
464+
utterance_index: partial.utteranceIndex ?? 0,
465+
};
466+
};
488467

489468
// Helper to check common test failure details
490469
const logFailureDetails = (result: { exitCode: number; stdout: string; stderr: string }) => {
@@ -511,7 +490,7 @@ describe('CLI End-to-End Tests', () => {
511490
test('Basic text-to-speech with description', async () => {
512491
// Configure a custom response
513492
testEnv.configureTtsResponse({
514-
generations: [createGeneration('test_gen_123', { duration: 2.5 })],
493+
snippets: [createSnippet({ generationId: 'test_gen_123' })],
515494
});
516495

517496
const outputDir = await testEnv.createOutputDir('tts-output');
@@ -545,22 +524,10 @@ describe('CLI End-to-End Tests', () => {
545524
test('Multiple generations with specific format', async () => {
546525
// Configure a custom response with multiple generations
547526
testEnv.configureTtsResponse({
548-
generations: [
549-
createGeneration('multi_gen_1', {
550-
format: { type: 'mp3' },
551-
encoding: { type: 'base64', format: 'mp3', sample_rate: 44100 },
552-
duration: 1.0,
553-
}),
554-
createGeneration('multi_gen_2', {
555-
format: { type: 'mp3' },
556-
encoding: { type: 'base64', format: 'mp3', sample_rate: 44100 },
557-
duration: 1.2,
558-
}),
559-
createGeneration('multi_gen_3', {
560-
format: { type: 'mp3' },
561-
encoding: { type: 'base64', format: 'mp3', sample_rate: 44100 },
562-
duration: 1.3,
563-
}),
527+
snippets: [
528+
createSnippet({ generationId: 'multi_gen_1' }),
529+
createSnippet({ generationId: 'multi_gen_2' }),
530+
createSnippet({ generationId: 'multi_gen_3' }),
564531
],
565532
});
566533

@@ -602,7 +569,7 @@ describe('CLI End-to-End Tests', () => {
602569
test('Reading from stdin', async () => {
603570
// Configure a custom response
604571
testEnv.configureTtsResponse({
605-
generations: [createGeneration('stdin_gen_123', { duration: 1.8 })],
572+
snippets: [createSnippet({ generationId: 'stdin_gen_123' })],
606573
});
607574

608575
const inputText = 'This is text from standard input';
@@ -716,19 +683,10 @@ describe('CLI End-to-End Tests', () => {
716683

717684
// Configure the TTS responses for first call with 3 generations
718685
testEnv.configureTtsResponse({
719-
generations: [
720-
createGeneration('config_test_gen_1', {
721-
format: { type: 'mp3' },
722-
encoding: { type: 'base64', format: 'mp3', sample_rate: 44100 },
723-
}),
724-
createGeneration('config_test_gen_2', {
725-
format: { type: 'mp3' },
726-
encoding: { type: 'base64', format: 'mp3', sample_rate: 44100 },
727-
}),
728-
createGeneration('config_test_gen_3', {
729-
format: { type: 'mp3' },
730-
encoding: { type: 'base64', format: 'mp3', sample_rate: 44100 },
731-
}),
686+
snippets: [
687+
createSnippet({ generationId: 'config_test_gen_1' }),
688+
createSnippet({ generationId: 'config_test_gen_2' }),
689+
createSnippet({ generationId: 'config_test_gen_3' }),
732690
],
733691
});
734692

@@ -771,12 +729,7 @@ describe('CLI End-to-End Tests', () => {
771729

772730
// Configure the TTS response for continuation
773731
testEnv.configureTtsResponse({
774-
generations: [
775-
createGeneration('continuation_gen_1', {
776-
format: { type: 'mp3' },
777-
encoding: { type: 'base64', format: 'mp3', sample_rate: 44100 },
778-
}),
779-
],
732+
snippets: [createSnippet({ generationId: 'continuation_gen_1' })],
780733
});
781734

782735
// Step 4: Run TTS with continuation using --last and --last-index

src/index.ts

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,9 @@ const usageDescriptions = {
2121
'tts.playCommand': 'Command to play audio files (uses $AUDIO_FILE as placeholder for file path)',
2222
'tts.format': 'Output audio format',
2323
'tts.presetVoice': "Required to use one of Hume's provided voices",
24+
'tts.speed': 'Speaking speed multiplier (0.25-3.0, default is 1.0)',
25+
'tts.trailingSilence': 'Seconds of silence to add at the end (0.0-5.0, default is 0.35)',
26+
'tts.streaming': 'Use streaming mode for TTS generation (default: true)',
2427
apiKey: 'Override the default API key',
2528
json: 'Output in JSON format',
2629
pretty: 'Output in human-readable format',
@@ -266,6 +269,8 @@ const ttsExamples: Usage['examples'] = [
266269
'Setting a custom audio player for the session',
267270
'hume session set tts.playCommand "vlc $AUDIO_FILE --play-and-exit"',
268271
],
272+
['Adjusting speech speed', '$0 tts "I am speaking very slowly" -v narrator --speed 0.75'],
273+
['Adding trailing silence', '$0 tts "Wait for it..." -v narrator --trailing-silence 3.5'],
269274
];
270275
class TtsCommand extends Command {
271276
static paths = [['tts']];
@@ -339,6 +344,20 @@ class TtsCommand extends Command {
339344
description: usageDescriptions['tts.presetVoice'],
340345
});
341346

347+
speed = Option.String('--speed', {
348+
validator: t.cascade(t.isNumber(), t.isInInclusiveRange(0.25, 3.0)),
349+
description: usageDescriptions['tts.speed'],
350+
});
351+
352+
trailingSilence = Option.String('--trailing-silence', {
353+
validator: t.cascade(t.isNumber(), t.isInInclusiveRange(0.0, 5.0)),
354+
description: usageDescriptions['tts.trailingSilence'],
355+
});
356+
357+
streaming = Option.Boolean('--streaming', {
358+
description: usageDescriptions['tts.streaming'],
359+
});
360+
342361
async execute() {
343362
const tts = new Tts();
344363
await tts.synthesize(this);

0 commit comments

Comments
 (0)