Commit def35d4

Merge pull request #603 from oreomaker/qnn-aot
feat(qualcomm): Qnn aot runner
2 parents ff5144d + 34895a5

29 files changed: +1214 −140 lines

examples/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -17,6 +17,6 @@ if(MLLM_TRACY_ENABLE)
   add_subdirectory(tracy_example)
 endif()

-if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
+if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE OR MLLM_BUILD_QNN_BACKEND)
   add_subdirectory(qwen3_qnn_aot)
 endif()

examples/qwen3_qnn_aot/CMakeLists.txt

Lines changed: 10 additions & 3 deletions
@@ -1,3 +1,10 @@
-add_executable(mllm-qwen3-aot-c compile.cpp)
-target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
-target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR})
+# AOT targets run on x86
+if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
+  add_executable(mllm-qwen3-aot-c compile.cpp)
+  target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
+  target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR})
+endif()
+
+add_executable(mllm-qwen3-aot-runner aot_run.cpp)
+target_link_libraries(mllm-qwen3-aot-runner PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
+target_include_directories(mllm-qwen3-aot-runner PRIVATE ${MLLM_INCLUDE_DIR})

examples/qwen3_qnn_aot/aot_run.cpp

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+#include <iostream>
+#include <fmt/core.h>
+#include <mllm/mllm.hpp>
+#include <string>
+#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
+#include "mllm/models/qwen3/configuration_qwen3.hpp"
+#include "mllm/models/qwen3/tokenization_qwen3.hpp"
+
+using mllm::Argparse;
+using namespace mllm::qnn::aot;  // NOLINT
+
+MLLM_MAIN({
+  auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
+  auto& model_path = Argparse::add<std::string>("-m|--model").help("Model path").def("qwen3_qnn.mllm");
+  auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json");
+  auto& config_path = Argparse::add<std::string>("-c|--config").help("Config path").required(true);
+  auto& temperature = Argparse::add<float>("--temperature").help("Temperature").def(0.8f);
+  auto& ar_len = Argparse::add<int>("--ar_len").help("Autoregressive length (chunk size)").def(128);
+
+  Argparse::parse(argc, argv);
+
+  mllm::initQnnBackend(model_path.get());
+
+  if (help.isSet()) {
+    Argparse::printHelp();
+    return 0;
+  }
+
+  auto qwen3_cfg = mllm::models::qwen3::Qwen3Config(config_path.get());
+
+  RunnerConfig config;
+  config.model_path = model_path.get();
+  config.temperature = temperature.get();
+  config.num_layers = qwen3_cfg.num_hidden_layers;
+  config.num_heads = qwen3_cfg.num_attention_heads;
+  config.head_dim = qwen3_cfg.head_dim;
+  config.vocab_size = qwen3_cfg.vocab_size;
+  config.context_len = 1024;
+  config.ar_len = ar_len.get();
+
+  auto tokenizer = mllm::models::qwen3::Qwen3Tokenizer(tokenizer_path.get());
+
+  std::string prompt_text;
+  fmt::print("💬 Prompt text (or 'exit/quit'): ");
+  std::getline(std::cin, prompt_text);
+
+  auto input_tensor = tokenizer.convertMessage({.prompt = prompt_text});
+
+  Runner runner(config, &tokenizer);
+  if (!runner.load()) {
+    std::cerr << "Failed to load model\n";
+    return 1;
+  }
+
+  std::vector<uint64_t> prompt_tokens;
+  auto sequence = input_tensor["sequence"];
+  int64_t* ptr = sequence.ptr<int64_t>();
+  for (int i = 0; i < sequence.shape()[1]; ++i) { prompt_tokens.push_back((uint64_t)ptr[i]); }
+
+  runner.generate(prompt_tokens, config.context_len, [](const std::string& token) { std::cout << token << std::flush; });
+  std::cout << "\n";
+
+  return 0;
+});
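
The runner wires the registered flags straight into RunnerConfig, and the same -m/--model path is also handed to initQnnBackend, which (per mllm/backends/qnn/Register.cpp below) treats it as the QNN context binary to load if that file already exists. As a usage note, only -c/--config is required and the other flags fall back to their defaults, so an invocation could plausibly look like this (config.json is a placeholder name, not something this commit ships):

./mllm-qwen3-aot-runner -c config.json -m qwen3_qnn.mllm -t tokenizer.json --ar_len 128 --temperature 0.8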

examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp

Lines changed: 2 additions & 2 deletions
@@ -242,7 +242,7 @@ class Qwen3Attention final : public nn::Module {
                             "k_rope_add_0_output_qdq");

     // De-quantization and quantization again
-    key_states = key_states.to(kFloat16);
+    key_states = key_states.to(kFloat32);
     key_states = key_states.to(kUInt8PerTensorSym);
     key_states = ptq::QDQ_KV(this, key_states, "k_cast_to_int8_qdq");

@@ -251,7 +251,7 @@ class Qwen3Attention final : public nn::Module {

     // Handle KV Cache
     value_states = ptq::QDQ(this, value_states, "v_cast_to_int16_qdq");
-    value_states = value_states.to(kFloat16);
+    value_states = value_states.to(kFloat32);
     value_states = value_states.to(kUInt8PerTensorSym);
     value_states = ptq::QDQ_KV(this, value_states, "v_cast_to_int8_qdq");


mllm/backends/qnn/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -21,6 +21,12 @@ if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
   list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_SRC})
 endif()

+file(GLOB_RECURSE MLLM_QUALCOMM_AOT_RT_SRC
+  ${CMAKE_CURRENT_LIST_DIR}/aot_rt/*.hpp
+  ${CMAKE_CURRENT_LIST_DIR}/aot_rt/*.cpp
+)
+list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_RT_SRC})
+
 add_library(
   MllmQNNBackend
   SHARED

mllm/backends/qnn/QNNBackend.cpp

Lines changed: 24 additions & 125 deletions
@@ -55,24 +55,6 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) {
     MLLM_INFO("QNN backend supports early termination");
   }

-  bool contextStatus = false;
-  // check if the qnn_context.bin file exists
-  if (!std::filesystem::exists("qnn_context.bin")) {
-    contextStatus = runtime_->createContext(context_, nullptr);
-  } else {
-    contextStatus = runtime_->retrieveContext(context_, qnnModels_, nullptr);
-
-    // fill qnnModelIndexMap_ info according to qnnModels_
-    for (size_t i = 0; i < qnnModels_.size(); i++) {
-      auto graphName = qnnModels_[i]->getQnnGraphName();
-      qnnModelIndexMap_.insert(std::make_pair(graphName, i));
-    }
-  }
-  if (!contextStatus) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }
-
-  // init QNN Allocator
-  static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
-
   // set performance parameters for better performance on HTP
   perf_ = QNNPerf::create(&runtime_->qnnInterface);
   perf_->setPowerConfigBurst();

@@ -348,10 +330,10 @@ bool QNNRuntime::createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t
   return true;
 }

-bool QNNRuntime::retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::shared_ptr<QNNModel>>& qnnModels,
-                                 QnnContext_Config_t** contextConfig) {
+bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
+                                 std::vector<std::shared_ptr<QNNModel>>& qnnModels, QnnContext_Config_t** contextConfig) {
   // Read the binary from qnn_context.bin and get the size in byte
-  std::ifstream file(QNN_Context_File, std::ios::binary | std::ios::ate);
+  std::ifstream file(contextBinaryPath, std::ios::binary | std::ios::ate);
   std::streamsize size = file.tellg();
   file.seekg(0, std::ios::beg);

@@ -436,6 +418,25 @@ bool QNNRuntime::retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::
   return true;
 }

+bool QNNBackend::createContext() {
+  if (!runtime_->createContext(context_, nullptr)) { return false; }
+  // init QNN Allocator
+  static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
+  return true;
+}
+
+bool QNNBackend::loadContext(const std::string& contextPath) {
+  if (!runtime_->retrieveContext(contextPath, context_, qnnModels_, nullptr)) { return false; }
+  // fill qnnModelIndexMap_ info according to qnnModels_
+  for (size_t i = 0; i < qnnModels_.size(); i++) {
+    auto graphName = qnnModels_[i]->getQnnGraphName();
+    qnnModelIndexMap_.insert(std::make_pair(graphName, i));
+  }
+  // init QNN Allocator
+  static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
+  return true;
+}
+
 std::shared_ptr<QNNModel> QNNBackend::createQnnGraph(const std::string& graphName) {
   // If the graph already exists, return the existing model
   if (qnnModelIndexMap_.find(graphName) != qnnModelIndexMap_.end()) {

@@ -535,8 +536,6 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
     return;
   }

-  // Prepare QNN input tensors by copying data from runtime inputs to graph input wrappers
-  // This handles the case where input tensor sizes may differ between prefill and decode phases
   std::vector<Qnn_Tensor_t> qnn_inputs;
   std::vector<Qnn_Tensor_t> qnn_outputs;
   for (int i = 0; i < model->getGraphInputTensorWrappers().size(); i++) {

@@ -550,52 +549,8 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
       return;
     }

-    if (wrapper_tensor.isNil()) {
-      MLLM_ERROR("Graph input wrapper {} for graph '{}' has no backing tensor", i, graphName);
-      return;
-    }
-
-    // Check for size mismatches (can occur in decode phase where inputs may be smaller)
-    size_t dst_bytes = wrapper_tensor.bytes();
-    size_t src_bytes = runtime_input.bytes();
-    if (dst_bytes != src_bytes) {
-      MLLM_WARN("Graph '{}' input tensor {} byte-size mismatch: wrapper={} bytes, runtime input={} bytes. Copying "
-                "min(dst, src), but this may truncate data.",
-                graphName, i, dst_bytes, src_bytes);
-    }
-
-    if (dst_bytes > 0) {
-      void* dst_ptr = wrapper_tensor.ptr<void>();
-      if (!dst_ptr) {
-        wrapper_tensor.alloc();
-        dst_ptr = wrapper_tensor.ptr<void>();
-      }
-
-      const void* src_ptr = runtime_input.ptr<void>();
-      size_t bytes_to_copy = std::min(dst_bytes, src_bytes);
-      if (!src_ptr) {
-        MLLM_ERROR("Runtime input tensor {} for graph '{}' has null data pointer", i, graphName);
-        return;
-      }
-      if (dst_ptr && src_ptr && dst_ptr != src_ptr) {
-        // Copy source data to destination buffer
-        // This ensures that the graph input wrapper has the correct data for execution
-        if (bytes_to_copy > 0) { std::memcpy(dst_ptr, src_ptr, bytes_to_copy); }
-
-        // If source is smaller than destination, zero out the remaining bytes
-        // This is important for decode phase where input tensors may be smaller than prefill
-        // For example, decode phase may use [1, 1] input while wrapper expects [1, 128]
-        // Note: In current implementation with full [1, 128] tensor, this should not trigger
-        // but it's kept as a safety measure for future optimizations
-        if (src_bytes < dst_bytes) {
-          size_t remaining_bytes = dst_bytes - src_bytes;
-          std::memset(static_cast<char*>(dst_ptr) + bytes_to_copy, 0, remaining_bytes);
-          // Only log if zero-padding actually occurs (unexpected case)
-          MLLM_WARN("[QNN graphExecute] Graph '{}' input tensor {}: zero-padded {} bytes (src={} bytes, dst={} bytes)",
-                    graphName, i, remaining_bytes, src_bytes, dst_bytes);
-        }
-      }
-    }
+    // input wrapper is empty, set wrapper's dataContainer(mllm::Tensor)
+    if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); }

     // Allocate and register the wrapper tensor with QNN allocator
     // QNNAllocator will handle registered memory descriptor when needed

@@ -617,74 +572,18 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&

   if (ProfilingLevel::OFF != profilingLevel_) { extractBackendProfilingInfo(runtime_->profileHandle); }

-  // Debug: Print last output shape from QNN actual return order (before reordering)
-  // Uncomment below for debugging output order issues
-  // if (!qnn_output_tensors.empty()) {
-  //   const auto& last_output = qnn_output_tensors.back();
-  //   const auto& output_wrappers = model->getGraphOutputTensorWrappers();
-  //   const auto& last_wrapper = output_wrappers.back();
-  //   MLLM_INFO("[QNN Actual Return Order] Last output tensor '{}' shape: {}",
-  //             last_wrapper->getName(), last_output.shape());
-  // }
-
   // Reorder outputs according to MLLM expected order
   const auto& expectedOrder = model->getExpectedOutputOrder();

   // Resize outputs to match QNN output count first
   outputs.resize(qnn_output_tensors.size());  // Ensure outputs has enough space for all QNN outputs
   if (!expectedOrder.empty() && expectedOrder.size() == qnn_output_tensors.size()) {
-    // Debug: Log output order information
-    // Uncomment below for debugging output order issues
-    // MLLM_INFO("QNNBackend::graphExecute: Checking output order for graph '{}'", graphName);
-    // MLLM_INFO("  MLLM Expected Output Order ({} outputs):", expectedOrder.size());
-    // for (size_t i = 0; i < expectedOrder.size(); i++) {
-    //   MLLM_INFO("    [{}] {}", i, expectedOrder[i]);
-    // }
-    // MLLM_INFO("  QNN Output Order ({} outputs):", model->getGraphOutputTensorWrappers().size());
-    // for (size_t i = 0; i < model->getGraphOutputTensorWrappers().size(); i++) {
-    //   auto wrapper = model->getGraphOutputTensorWrappers()[i];
-    //   MLLM_INFO("    [{}] {}", i, wrapper->getName());
-    // }
-
-    // Check if reordering is needed
-    // bool needs_reordering = false;
-    // std::vector<std::pair<size_t, int>> mismatches;
-    // for (size_t i = 0; i < expectedOrder.size(); i++) {
-    //   const std::string& expected_name = expectedOrder[i];
-    //   int qnn_index = model->getQnnOutputIndex(expected_name);
-    //   if (qnn_index >= 0 && qnn_index < static_cast<int>(qnn_output_tensors.size())) {
-    //     if (static_cast<int>(i) != qnn_index) {
-    //       needs_reordering = true;
-    //       mismatches.emplace_back(i, qnn_index);
-    //     }
-    //   }
-    // }
-
-    // Debug: Verification messages
-    // Uncomment below for debugging output order issues
-    // if (needs_reordering) {
-    //   MLLM_INFO("  [VERIFICATION] QNN output order DIFFERS from MLLM expected order - REORDERING REQUIRED");
-    //   for (const auto& [mllm_idx, qnn_idx] : mismatches) {
-    //     MLLM_INFO("    Mismatch: MLLM[{}] expects '{}' but it's at QNN[{}]",
-    //               mllm_idx, expectedOrder[mllm_idx], qnn_idx);
-    //   }
-    // } else {
-    //   MLLM_INFO("  [VERIFICATION] QNN output order MATCHES MLLM expected order - no reordering needed");
-    // }
-
     // Reorder outputs according to expected order
     for (size_t i = 0; i < expectedOrder.size(); i++) {
       const std::string& expected_name = expectedOrder[i];
       int qnn_index = model->getQnnOutputIndex(expected_name);
       if (qnn_index >= 0 && qnn_index < static_cast<int>(qnn_output_tensors.size())) {
         outputs[i] = qnn_output_tensors[qnn_index];
-        // Debug: Mapping information
-        // Uncomment below for debugging output order issues
-        // if (static_cast<int>(i) != qnn_index) {
-        //   MLLM_INFO("    Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [REORDERED]", i, qnn_index, expected_name);
-        // } else {
-        //   MLLM_INFO("    Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [SAME]", i, qnn_index, expected_name);
-        // }
       } else {
         MLLM_ERROR("QNNBackend::graphExecute: Failed to find QNN output index for tensor '{}' in graph '{}'", expected_name,
                    graphName);
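
The net effect of the graphExecute change is that the old per-input memcpy path (with byte-size warnings and zero-padding for decode-sized inputs) is replaced by letting the graph input wrapper adopt the caller's tensor on first use. A minimal sketch of the resulting binding loop; the wrapper and runtime_input lookups are assumed from the surrounding, unchanged code and error handling is omitted:

// Illustrative sketch only, not a verbatim excerpt of the function.
for (int i = 0; i < model->getGraphInputTensorWrappers().size(); i++) {
  auto wrapper = model->getGraphInputTensorWrappers()[i];  // assumed lookup, as in the existing loop
  auto& runtime_input = inputs[i];                         // assumed: the caller-provided mllm::Tensor
  // First execution: the wrapper has no backing tensor yet, so share the runtime
  // tensor instead of copying its bytes into a separately allocated buffer.
  if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); }
  // Allocation and registration with the QNN allocator then proceed as before.
}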

mllm/backends/qnn/QNNBackend.hpp

Lines changed: 5 additions & 2 deletions
@@ -50,8 +50,8 @@ class QNNRuntime {
   }

   bool createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t** contextConfig = nullptr);
-  bool retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::shared_ptr<QNNModel>>& qnnModels,
-                       QnnContext_Config_t** contextConfig = nullptr);
+  bool retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
+                       std::vector<std::shared_ptr<QNNModel>>& qnnModels, QnnContext_Config_t** contextConfig = nullptr);

 private:
  QNN_INTERFACE_VER_TYPE qnnInterface;

@@ -87,6 +87,9 @@ class QNNBackend final : public Backend {
 public:
  QNNBackend();

+ bool loadContext(const std::string& contextPath);
+ bool createContext();
+
 bool isWeightOnDevice() override { return false; }

 // QNN Graph build interfaces
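
Since the constructor no longer creates or restores a context, a caller is now expected to construct the backend and then pick one of the two new entry points. A sketch of the intended usage, mirroring the logic added to Register.cpp further down (context_path is whatever path the application chooses):

// Sketch: deserialize a cached context if one exists, otherwise create a fresh one.
auto backend = std::make_shared<qnn::QNNBackend>();
if (std::filesystem::exists(context_path)) {
  // loadContext() also repopulates qnnModels_ and the graph-name index from the binary.
  if (!backend->loadContext(context_path)) { /* handle error */ }
} else {
  if (!backend->createContext()) { /* handle error */ }
}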

mllm/backends/qnn/QNNUtils.cpp

Lines changed: 1 addition & 4 deletions
@@ -483,10 +483,7 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::createStaticTensor(const std
 }

 void QNNTensorWrapper::alloc() {
-  if (isAlloc_) {
-    MLLM_WARN("Tensor {} has already been allocated.", name_);
-    return;
-  }
+  if (isAlloc_) { MLLM_WARN("Tensor {} has already been allocated.", name_); }
   MLLM_RT_ASSERT(dataContainer_.device() == kQNN);

   // if storage is not allocated, allocate it

mllm/backends/qnn/QNNUtils.hpp

Lines changed: 7 additions & 0 deletions
@@ -205,6 +205,13 @@ class QNNTensorWrapper {
   Tensor& getDataContainer() { return dataContainer_; }
   const std::vector<uint32_t>* getDimension() { return &dimensions_; }

+  bool isAlloc() { return isAlloc_; }
+  void __setDataContainer(const Tensor& tensor) {
+    MLLM_RT_ASSERT(dataContainer_.isNil())
+    dataContainer_ = tensor;
+    if (!tensor.isNil()) { isAlloc_ = true; }
+  }
+
   // Helper to set complex quantization params and manage memory
   void setScaleOffsetQuantization(const std::vector<Qnn_ScaleOffset_t>& scaleOffsets, int32_t axis);
   void setBlockwiseQuantization(const Qnn_BlockwiseExpansion_t& blockwise, const std::vector<Qnn_ScaleOffset_t>& scaleOffsets);

mllm/backends/qnn/Register.cpp

Lines changed: 7 additions & 1 deletion
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.

 #include <memory>
+#include <filesystem>
 #include "mllm/core/BaseOp.hpp"
 #include "mllm/core/DeviceTypes.hpp"
 #include "mllm/engine/Context.hpp"

@@ -13,12 +14,17 @@
 namespace mllm {

 // export initQnnBackend function to initialize QNN backend
-void initQnnBackend() {
+void initQnnBackend(const std::string& context_path) {
   MLLM_RT_ASSERT(isQnnAvailable());
   auto& ctx = Context::instance();

   // 1. Register backend
   auto backend = std::make_shared<qnn::QNNBackend>();
+  if (std::filesystem::exists(context_path)) {
+    if (!backend->loadContext(context_path)) { MLLM_ERROR_EXIT(1, "Failed to load QNN context from {}", context_path); }
+  } else {
+    if (!backend->createContext()) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }
+  }
   ctx.registerBackend(backend);

   // 2. Initialize memory manager
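
At the application level, the previously hard-coded file name can still be reproduced by passing it explicitly; a minimal sketch, assuming the caller wants the old qnn_context.bin behaviour:

// First run: qnn_context.bin does not exist, so a fresh QNN context is created.
// Later runs: the serialized context is retrieved and its graphs are re-registered.
mllm::initQnnBackend("qnn_context.bin");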
