Skip to content

Commit 4f45830

Browse files
fix
1 parent af43ae7 commit 4f45830

File tree

2 files changed

+26
-29
lines changed

2 files changed

+26
-29
lines changed

src/audio/text_to_speech/t2s_calculator.cc

Lines changed: 1 addition & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -51,27 +51,6 @@ namespace mediapipe {
5151

5252
const std::string TTS_SESSION_SIDE_PACKET_TAG = "TTS_NODE_RESOURCES";
5353

54-
ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) {
55-
std::ifstream input(file_path, std::ios::binary);
56-
OPENVINO_ASSERT(input, "Failed to open file: " + file_path.string());
57-
58-
// Get file size
59-
input.seekg(0, std::ios::end);
60-
size_t buffer_size = static_cast<size_t>(input.tellg());
61-
input.seekg(0, std::ios::beg);
62-
63-
// Check size is multiple of float
64-
OPENVINO_ASSERT(buffer_size % sizeof(float) == 0, "File size is not a multiple of float size.");
65-
size_t num_floats = buffer_size / sizeof(float);
66-
OPENVINO_ASSERT(num_floats == 512, "File must contain speaker embedding including 512 32-bit floats.");
67-
68-
OPENVINO_ASSERT(input, "Failed to read all data from file.");
69-
ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats});
70-
input.read(reinterpret_cast<char*>(floats_tensor.data()), buffer_size);
71-
72-
return floats_tensor;
73-
}
74-
7554
class T2sCalculator : public CalculatorBase {
7655
static const std::string INPUT_TAG_NAME;
7756
static const std::string OUTPUT_TAG_NAME;
@@ -129,20 +108,15 @@ class T2sCalculator : public CalculatorBase {
129108
if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) {
130109
voice = voiceIt->value.GetString();
131110
}
132-
std::string voiceEmbeddingsPath;
133111
if(voice.has_value()){
134112
if (pipe->voices.find(voice.value()) == pipe->voices.end())
135113
return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", payload.uri));
136-
if (!std::filesystem::exists(pipe->voices[voice.value()]))
137-
return absl::InvalidArgumentError(absl::StrCat("Requested voice speaker embeddings file does not exist: ", pipe->voices[voice.value()]));
138-
voiceEmbeddingsPath = pipe->voices[voice.value()];
139114
}
140115
ov::genai::Text2SpeechDecodedResults generatedSpeech;
141116
std::unique_lock lock(pipe->ttsPipelineMutex);
142117

143118
if(voice.has_value()){
144-
auto speakerEmbedding = read_speaker_embedding(voiceEmbeddingsPath);
145-
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerEmbedding);
119+
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), pipe->voices[voice.value()]);
146120
}
147121
else{
148122
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());

src/audio/text_to_speech/t2s_servable.hpp

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,32 @@
3838

3939
namespace ovms {
4040

41+
/// Loads a speaker embedding from a raw binary file into a tensor.
///
/// The file is expected to hold exactly 512 32-bit floats with no header;
/// the bytes are copied verbatim into an f32 tensor of shape {1, 512}.
///
/// @param file_path Path of the binary embedding file.
/// @return Tensor of element type f32 and shape {1, 512} filled from the file.
/// @note Every failure (cannot open, cannot size, wrong size, short read) is
///       reported through OPENVINO_ASSERT.
static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) {
    // Number of floats a speaker embedding file must contain.
    constexpr size_t expected_num_floats = 512;

    std::ifstream input(file_path, std::ios::binary);
    OPENVINO_ASSERT(input, "Failed to open file: " + file_path.string());

    // Determine the file size by seeking to the end. tellg() yields -1 on
    // failure, so validate it before converting to an unsigned size.
    input.seekg(0, std::ios::end);
    const auto end_offset = static_cast<std::streamoff>(input.tellg());
    OPENVINO_ASSERT(end_offset >= 0, "Failed to determine size of file: " + file_path.string());
    const size_t buffer_size = static_cast<size_t>(end_offset);
    input.seekg(0, std::ios::beg);

    // The payload must be a whole number of floats, and exactly the expected count.
    OPENVINO_ASSERT(buffer_size % sizeof(float) == 0, "File size is not a multiple of float size.");
    const size_t num_floats = buffer_size / sizeof(float);
    OPENVINO_ASSERT(num_floats == expected_num_floats, "File must contain speaker embedding including 512 32-bit floats.");

    ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats});
    input.read(reinterpret_cast<char*>(floats_tensor.data()), static_cast<std::streamsize>(buffer_size));

    // Verify the read AFTER performing it: a short read sets failbit and
    // would otherwise leave part of the tensor uninitialised.
    OPENVINO_ASSERT(input && static_cast<size_t>(input.gcount()) == buffer_size, "Failed to read all data from file.");

    return floats_tensor;
}
61+
4162
struct TtsServable {
4263
std::filesystem::path parsedModelsPath;
4364
std::shared_ptr<ov::genai::Text2SpeechPipeline> ttsPipeline;
4465
std::mutex ttsPipelineMutex;
45-
std::unordered_map<std::string, std::string> voices;
66+
std::unordered_map<std::string, ov::Tensor> voices;
4667

4768
TtsServable(const std::string& modelDir, const std::string& targetDevice, const google::protobuf::RepeatedPtrField<mediapipe::T2sCalculatorOptions_SpeakerEmbeddings>& graphVoices, const std::string& graphPath) {
4869
auto fsModelsPath = std::filesystem::path(modelDir);
@@ -59,7 +80,9 @@ struct TtsServable {
5980
}
6081
ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), nodeOptions.target_device(), config);
6182
for(auto voice : graphVoices){
62-
voices[voice.name()] = voice.path();
83+
if (!std::filesystem::exists(voice.path()))
84+
throw std::runtime_error{"Requested voice speaker embeddings file does not exist."};
85+
voices[voice.name()] = read_speaker_embedding(voice.path());
6386
}
6487
}
6588
};

0 commit comments

Comments
 (0)