Skip to content

Commit af43ae7

Browse files
fix
1 parent cd7e557 commit af43ae7

File tree

5 files changed

+66
-7
lines changed

5 files changed

+66
-7
lines changed

demos/audio/README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,41 @@ python export_model.py text2speech --source_model microsoft/speecht5_tts --weigh
4747
4848
The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. Run the script with the `--help` argument to check available parameters and see the [T2s calculator documentation](../../docs/speech_generation/reference.md) to learn more about configuration options and limitations.
4949

50+
### Speaker embeddings
51+
52+
Instead of generating speech with the default model voice, you can create speaker embeddings with [this script](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py):
53+
```bash
54+
curl --output create_speaker_embedding.py "https://raw.githubusercontent.com/openvinotoolkit/openvino.genai/refs/heads/master/samples/python/speech_generation/create_speaker_embedding.py"
55+
python create_speaker_embedding.py
56+
mv speaker_embedding.bin models/
57+
```
58+
The script records your speech for 5 seconds (you can adjust the recording duration to achieve better results) and then, using the speechbrain/spkrec-xvect-voxceleb model, creates a `speaker_embedding.bin` file that contains your speaker embedding.
59+
Now you need to add the speaker embedding path to the `graph.pbtxt` file of the text2speech graph:
60+
```
61+
input_stream: "HTTP_REQUEST_PAYLOAD:input"
62+
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
63+
node {
64+
name: "T2sExecutor"
65+
input_side_packet: "TTS_NODE_RESOURCES:t2s_servable"
66+
calculator: "T2sCalculator"
67+
input_stream: "HTTP_REQUEST_PAYLOAD:input"
68+
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
69+
node_options: {
70+
[type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
71+
models_path: "./",
72+
plugin_config: '{ "NUM_STREAMS": "1" }',
73+
target_device: "CPU",
74+
voices: [
75+
{
76+
name: "voice",
77+
path: "/models/speaker_embedding.bin",
78+
}
79+
]
80+
}
81+
}
82+
}
83+
```
84+
5085
### Deployment
5186

5287
**CPU**

src/audio/text_to_speech/t2s_calculator.cc

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,16 +124,27 @@ class T2sCalculator : public CalculatorBase {
124124
if (streamIt != payload.parsedJson->MemberEnd()) {
125125
return absl::InvalidArgumentError("streaming is not supported");
126126
}
127+
std::optional<std::string> voice;
128+
auto voiceIt = payload.parsedJson->FindMember("voice");
129+
if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) {
130+
voice = voiceIt->value.GetString();
131+
}
132+
std::string voiceEmbeddingsPath;
133+
if(voice.has_value()){
134+
if (pipe->voices.find(voice.value()) == pipe->voices.end())
135+
return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", payload.uri));
136+
if (!std::filesystem::exists(pipe->voices[voice.value()]))
137+
return absl::InvalidArgumentError(absl::StrCat("Requested voice speaker embeddings file does not exist: ", pipe->voices[voice.value()]));
138+
voiceEmbeddingsPath = pipe->voices[voice.value()];
139+
}
127140
ov::genai::Text2SpeechDecodedResults generatedSpeech;
128-
std::string voiceEmbeddingsPath = std::string(pipe->parsedModelsPath.c_str()) + std::string("speaker_embedding.bin");
129141
std::unique_lock lock(pipe->ttsPipelineMutex);
130-
if(std::filesystem::exists(voiceEmbeddingsPath)){
131-
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Voice embeddings file found");
142+
143+
if(voice.has_value()){
132144
auto speakerEmbedding = read_speaker_embedding(voiceEmbeddingsPath);
133145
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerEmbedding);
134146
}
135147
else{
136-
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Voice embeddings not found");
137148
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
138149
}
139150
auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();

src/audio/text_to_speech/t2s_calculator.proto

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,13 @@ message T2sCalculatorOptions {
3131
required string models_path = 1;
3232
optional string target_device = 2;
3333
optional string plugin_config = 3;
34+
35+
message SpeakerEmbeddings {
36+
// Speaker name.
37+
required string name = 1;
38+
39+
// Path to speaker embeddings file.
40+
required string path = 2;
41+
}
42+
repeated SpeakerEmbeddings voices = 4;
3443
}

src/audio/text_to_speech/t2s_servable.hpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,10 @@ struct TtsServable {
4242
std::filesystem::path parsedModelsPath;
4343
std::shared_ptr<ov::genai::Text2SpeechPipeline> ttsPipeline;
4444
std::mutex ttsPipelineMutex;
45+
std::unordered_map<std::string, std::string> voices;
4546

46-
TtsServable(const mediapipe::T2sCalculatorOptions& nodeOptions, const std::string& graphPath) {
47-
auto fsModelsPath = std::filesystem::path(nodeOptions.models_path());
47+
TtsServable(const std::string& modelDir, const std::string& targetDevice, const google::protobuf::RepeatedPtrField<mediapipe::T2sCalculatorOptions_SpeakerEmbeddings>& graphVoices, const std::string& graphPath) {
48+
auto fsModelsPath = std::filesystem::path(modelDir);
4849
if (fsModelsPath.is_relative()) {
4950
parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath);
5051
} else {
@@ -57,6 +58,9 @@ struct TtsServable {
5758
throw std::runtime_error("Error during plugin_config option parsing");
5859
}
5960
ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), nodeOptions.target_device(), config);
61+
for(auto voice : graphVoices){
62+
voices[voice.name()] = voice.path();
63+
}
6064
}
6165
};
6266

src/mediapipe_internal/mediapipegraphdefinition.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -616,7 +616,7 @@ Status MediapipeGraphDefinition::initializeNodes() {
616616
SPDLOG_LOGGER_ERROR(modelmanager_logger, "Failed to unpack calculator options");
617617
return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID;
618618
}
619-
std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions, mgconfig.getBasePath());
619+
std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.voices(), mgconfig.getBasePath());
620620
ttsServableMap.insert(std::pair<std::string, std::shared_ptr<TtsServable>>(nodeName, std::move(servable)));
621621
ttsServablesCleaningGuard.disableCleaning();
622622
}

0 commit comments

Comments
 (0)