fix

michalkulakowski · michalkulakowski · commit af43ae75a49b · 2026-01-12T13:30:39.000+01:00
diff --git a/demos/audio/README.md b/demos/audio/README.md
@@ -47,6 +47,41 @@ python export_model.py text2speech --source_model microsoft/speecht5_tts --weigh
 
 The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. Run the script with `--help` argument to check available parameters and see the [T2s calculator documentation](../../docs/speech_generation/reference.md) to learn more about configuration options and limitations.
 
+### Speaker embeddings
+
+Instead of generating speech with the default model voice, you can create your own speaker embeddings with [this script](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py):
+```bash
+curl --output create_speaker_embedding.py "https://raw.githubusercontent.com/openvinotoolkit/openvino.genai/refs/heads/master/samples/python/speech_generation/create_speaker_embedding.py"
+python create_speaker_embedding.py
+mv speaker_embedding.bin models/
+```
+The script records your speech for 5 seconds (you can adjust the recording duration to achieve better results) and then, using the `speechbrain/spkrec-xvect-voxceleb` model, creates a `speaker_embedding.bin` file that contains your speaker embedding.
+Now you need to add the speaker embedding path to the `graph.pbtxt` file of the text2speech graph:
+```
+input_stream: "HTTP_REQUEST_PAYLOAD:input"
+output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+node {
+  name: "T2sExecutor"
+  input_side_packet: "TTS_NODE_RESOURCES:t2s_servable"
+  calculator: "T2sCalculator"
+  input_stream: "HTTP_REQUEST_PAYLOAD:input"
+  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+  node_options: {
+    [type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
+      models_path: "./",
+      plugin_config: '{ "NUM_STREAMS": "1" }',
+      target_device: "CPU",
+      voices: [
+        {
+          name: "voice",
+          path: "/models/speaker_embedding.bin",
+        }
+      ]
+    }
+  }
+}
+```
+
 ### Deployment
 
 **CPU**
diff --git a/src/audio/text_to_speech/t2s_calculator.cc b/src/audio/text_to_speech/t2s_calculator.cc
@@ -124,16 +124,27 @@ class T2sCalculator : public CalculatorBase {
             if (streamIt != payload.parsedJson->MemberEnd()) {
                 return absl::InvalidArgumentError("streaming is not supported");
             }
+            std::optional<std::string> voice;
+            auto voiceIt = payload.parsedJson->FindMember("voice");
+            if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) {
+                voice = voiceIt->value.GetString();
+            }
+            std::string voiceEmbeddingsPath;
+            if(voice.has_value()){
+                if (pipe->voices.find(voice.value()) == pipe->voices.end())
+                    return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", payload.uri));
+                if (!std::filesystem::exists(pipe->voices[voice.value()]))
+                    return absl::InvalidArgumentError(absl::StrCat("Requested voice speaker embeddings file does not exist: ", pipe->voices[voice.value()]));
+                voiceEmbeddingsPath = pipe->voices[voice.value()];
+            }
             ov::genai::Text2SpeechDecodedResults generatedSpeech;
-            std::string voiceEmbeddingsPath = std::string(pipe->parsedModelsPath.c_str()) + std::string("speaker_embedding.bin");
             std::unique_lock lock(pipe->ttsPipelineMutex);
-            if(std::filesystem::exists(voiceEmbeddingsPath)){
-                SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Voice embeddings file found");
+
+            if(voice.has_value()){
                 auto speakerEmbedding = read_speaker_embedding(voiceEmbeddingsPath);
                 generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerEmbedding);
             }
             else{
-                SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Voice embeddings not found");
                 generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
             }
             auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
diff --git a/src/audio/text_to_speech/t2s_calculator.proto b/src/audio/text_to_speech/t2s_calculator.proto
@@ -31,4 +31,13 @@ message T2sCalculatorOptions {
     required string models_path = 1;
     optional string target_device = 2;
     optional string plugin_config = 3;
+
+    message SpeakerEmbeddings {
+      // Speaker name.
+      required string name = 1;
+
+      // Path to speaker embeddings file.
+      required string path = 2;
+    }
+    repeated SpeakerEmbeddings voices = 4;
 }
diff --git a/src/audio/text_to_speech/t2s_servable.hpp b/src/audio/text_to_speech/t2s_servable.hpp
@@ -42,9 +42,10 @@ struct TtsServable {
     std::filesystem::path parsedModelsPath;
     std::shared_ptr<ov::genai::Text2SpeechPipeline> ttsPipeline;
     std::mutex ttsPipelineMutex;
+    std::unordered_map<std::string, std::string> voices;
 
-    TtsServable(const mediapipe::T2sCalculatorOptions& nodeOptions, const std::string& graphPath) {
-        auto fsModelsPath = std::filesystem::path(nodeOptions.models_path());
+    TtsServable(const std::string& modelDir, const std::string& targetDevice, const google::protobuf::RepeatedPtrField<mediapipe::T2sCalculatorOptions_SpeakerEmbeddings>& graphVoices, const std::string& graphPath) {
+        auto fsModelsPath = std::filesystem::path(modelDir);
         if (fsModelsPath.is_relative()) {
             parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath);
         } else {
@@ -57,6 +58,9 @@ struct TtsServable {
             throw std::runtime_error("Error during plugin_config option parsing");
         }
         ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), nodeOptions.target_device(), config);
+        for(auto voice : graphVoices){
+            voices[voice.name()] = voice.path();
+        }
     }
 };
 
diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp
@@ -616,7 +616,7 @@ Status MediapipeGraphDefinition::initializeNodes() {
                 SPDLOG_LOGGER_ERROR(modelmanager_logger, "Failed to unpack calculator options");
                 return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID;
             }
-            std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions, mgconfig.getBasePath());
+            std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.voices(), mgconfig.getBasePath());
             ttsServableMap.insert(std::pair<std::string, std::shared_ptr<TtsServable>>(nodeName, std::move(servable)));
             ttsServablesCleaningGuard.disableCleaning();
         }

Original file line number	Diff line number	Diff line change
`@@ -616,7 +616,7 @@ Status MediapipeGraphDefinition::initializeNodes() {`
`616`	`616`	`SPDLOG_LOGGER_ERROR(modelmanager_logger, "Failed to unpack calculator options");`
`617`	`617`	`return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID;`
`618`	`618`	`}`
`619`		`- std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions, mgconfig.getBasePath());`
	`619`	`+ std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.voices(), mgconfig.getBasePath());`
`620`	`620`	`ttsServableMap.insert(std::pair<std::string, std::shared_ptr<TtsServable>>(nodeName, std::move(servable)));`
`621`	`621`	`ttsServablesCleaningGuard.disableCleaning();`
`622`	`622`	`}`