@@ -55,24 +55,6 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) {
     MLLM_INFO("QNN backend supports early termination");
   }
 
-  bool contextStatus = false;
-  // check if the qnn_context.bin file exists
-  if (!std::filesystem::exists("qnn_context.bin")) {
-    contextStatus = runtime_->createContext(context_, nullptr);
-  } else {
-    contextStatus = runtime_->retrieveContext(context_, qnnModels_, nullptr);
-
-    // fill qnnModelIndexMap_ info according to qnnModels_
-    for (size_t i = 0; i < qnnModels_.size(); i++) {
-      auto graphName = qnnModels_[i]->getQnnGraphName();
-      qnnModelIndexMap_.insert(std::make_pair(graphName, i));
-    }
-  }
-  if (!contextStatus) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }
-
-  // init QNN Allocator
-  static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
-
   // set performance parameters for better performance on HTP
   perf_ = QNNPerf::create(&runtime_->qnnInterface);
   perf_->setPowerConfigBurst();
@@ -348,10 +330,10 @@ bool QNNRuntime::createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t
   return true;
 }
 
-bool QNNRuntime::retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::shared_ptr<QNNModel>>& qnnModels,
-                                 QnnContext_Config_t** contextConfig) {
+bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
+                                 std::vector<std::shared_ptr<QNNModel>>& qnnModels, QnnContext_Config_t** contextConfig) {
   // Read the binary from qnn_context.bin and get the size in byte
-  std::ifstream file(QNN_Context_File, std::ios::binary | std::ios::ate);
+  std::ifstream file(contextBinaryPath, std::ios::binary | std::ios::ate);
   std::streamsize size = file.tellg();
   file.seekg(0, std::ios::beg);
@@ -436,6 +418,25 @@ bool QNNRuntime::retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::
   return true;
 }
 
+bool QNNBackend::createContext() {
+  if (!runtime_->createContext(context_, nullptr)) { return false; }
+  // init QNN Allocator
+  static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
+  return true;
+}
+
+bool QNNBackend::loadContext(const std::string& contextPath) {
+  if (!runtime_->retrieveContext(contextPath, context_, qnnModels_, nullptr)) { return false; }
+  // fill qnnModelIndexMap_ info according to qnnModels_
+  for (size_t i = 0; i < qnnModels_.size(); i++) {
+    auto graphName = qnnModels_[i]->getQnnGraphName();
+    qnnModelIndexMap_.insert(std::make_pair(graphName, i));
+  }
+  // init QNN Allocator
+  static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
+  return true;
+}
+
 std::shared_ptr<QNNModel> QNNBackend::createQnnGraph(const std::string& graphName) {
   // If the graph already exists, return the existing model
   if (qnnModelIndexMap_.find(graphName) != qnnModelIndexMap_.end()) {
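
A minimal sketch of how a caller might drive the new split context API. Assumptions: the helper name setUpQnnContext and the backend pointer are hypothetical, and the cache path reuses the "qnn_context.bin" name from the removed constructor logic; the existence check that used to live in the constructor now has to happen at the call site.

#include <filesystem>
#include <string>

// Hypothetical call site (not part of this patch): decide between creating a
// fresh QNN context and loading a cached context binary, as the constructor used to do.
void setUpQnnContext(QNNBackend* backend) {
  const std::string contextPath = "qnn_context.bin";  // assumed cache location
  bool ok = std::filesystem::exists(contextPath) ? backend->loadContext(contextPath)
                                                 : backend->createContext();
  if (!ok) { MLLM_ERROR_EXIT(1, "Failed to create or load QNN context"); }
}
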
@@ -535,8 +536,6 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
     return;
   }
 
-  // Prepare QNN input tensors by copying data from runtime inputs to graph input wrappers
-  // This handles the case where input tensor sizes may differ between prefill and decode phases
   std::vector<Qnn_Tensor_t> qnn_inputs;
   std::vector<Qnn_Tensor_t> qnn_outputs;
   for (int i = 0; i < model->getGraphInputTensorWrappers().size(); i++) {
@@ -550,52 +549,8 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
       return;
     }
 
-    if (wrapper_tensor.isNil()) {
-      MLLM_ERROR("Graph input wrapper {} for graph '{}' has no backing tensor", i, graphName);
-      return;
-    }
-
-    // Check for size mismatches (can occur in decode phase where inputs may be smaller)
-    size_t dst_bytes = wrapper_tensor.bytes();
-    size_t src_bytes = runtime_input.bytes();
-    if (dst_bytes != src_bytes) {
-      MLLM_WARN("Graph '{}' input tensor {} byte-size mismatch: wrapper={} bytes, runtime input={} bytes. Copying "
-                "min(dst, src), but this may truncate data.",
-                graphName, i, dst_bytes, src_bytes);
-    }
-
-    if (dst_bytes > 0) {
-      void* dst_ptr = wrapper_tensor.ptr<void>();
-      if (!dst_ptr) {
-        wrapper_tensor.alloc();
-        dst_ptr = wrapper_tensor.ptr<void>();
-      }
-
-      const void* src_ptr = runtime_input.ptr<void>();
-      size_t bytes_to_copy = std::min(dst_bytes, src_bytes);
-      if (!src_ptr) {
-        MLLM_ERROR("Runtime input tensor {} for graph '{}' has null data pointer", i, graphName);
-        return;
-      }
-      if (dst_ptr && src_ptr && dst_ptr != src_ptr) {
-        // Copy source data to destination buffer
-        // This ensures that the graph input wrapper has the correct data for execution
-        if (bytes_to_copy > 0) { std::memcpy(dst_ptr, src_ptr, bytes_to_copy); }
-
-        // If source is smaller than destination, zero out the remaining bytes
-        // This is important for decode phase where input tensors may be smaller than prefill
-        // For example, decode phase may use [1, 1] input while wrapper expects [1, 128]
-        // Note: In current implementation with full [1, 128] tensor, this should not trigger
-        // but it's kept as a safety measure for future optimizations
-        if (src_bytes < dst_bytes) {
-          size_t remaining_bytes = dst_bytes - src_bytes;
-          std::memset(static_cast<char*>(dst_ptr) + bytes_to_copy, 0, remaining_bytes);
-          // Only log if zero-padding actually occurs (unexpected case)
-          MLLM_WARN("[QNN graphExecute] Graph '{}' input tensor {}: zero-padded {} bytes (src={} bytes, dst={} bytes)",
-                    graphName, i, remaining_bytes, src_bytes, dst_bytes);
-        }
-      }
-    }
+    // input wrapper is empty, set wrapper's dataContainer(mllm::Tensor)
+    if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); }
 
     // Allocate and register the wrapper tensor with QNN allocator
     // QNNAllocator will handle registered memory descriptor when needed
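
The hunk above drops the defensive copy-and-zero-pad path and instead shares the runtime input tensor as the wrapper's backing storage when the wrapper has not been allocated yet. A toy sketch of that design choice follows; ToyTensor and ToyInputWrapper are hypothetical stand-ins for mllm::Tensor and the QNN tensor wrapper, not the actual API.

#include <memory>
#include <vector>

struct ToyTensor { std::vector<float> data; };  // stand-in for mllm::Tensor

struct ToyInputWrapper {
  std::shared_ptr<ToyTensor> container;                   // backing tensor, initially empty
  bool isAlloc() const { return container != nullptr; }   // analogous to wrapper->isAlloc()
  void setDataContainer(std::shared_ptr<ToyTensor> t) {   // analogous to __setDataContainer(...)
    container = std::move(t);                             // share the buffer, no memcpy
  }
};

// Old path: allocate a separate buffer, memcpy, and zero-pad on size mismatch.
// New path: bind the runtime tensor directly, so graph execution reads the caller's data.
void bindInput(ToyInputWrapper& wrapper, const std::shared_ptr<ToyTensor>& runtimeInput) {
  if (!wrapper.isAlloc()) { wrapper.setDataContainer(runtimeInput); }
}
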
@@ -617,74 +572,18 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
 
   if (ProfilingLevel::OFF != profilingLevel_) { extractBackendProfilingInfo(runtime_->profileHandle); }
 
-  // Debug: Print last output shape from QNN actual return order (before reordering)
-  // Uncomment below for debugging output order issues
-  // if (!qnn_output_tensors.empty()) {
-  //   const auto& last_output = qnn_output_tensors.back();
-  //   const auto& output_wrappers = model->getGraphOutputTensorWrappers();
-  //   const auto& last_wrapper = output_wrappers.back();
-  //   MLLM_INFO("[QNN Actual Return Order] Last output tensor '{}' shape: {}",
-  //             last_wrapper->getName(), last_output.shape());
-  // }
-
   // Reorder outputs according to MLLM expected order
   const auto& expectedOrder = model->getExpectedOutputOrder();
 
   // Resize outputs to match QNN output count first
   outputs.resize(qnn_output_tensors.size());  // Ensure outputs has enough space for all QNN outputs
   if (!expectedOrder.empty() && expectedOrder.size() == qnn_output_tensors.size()) {
-    // Debug: Log output order information
-    // Uncomment below for debugging output order issues
-    // MLLM_INFO("QNNBackend::graphExecute: Checking output order for graph '{}'", graphName);
-    // MLLM_INFO("  MLLM Expected Output Order ({} outputs):", expectedOrder.size());
-    // for (size_t i = 0; i < expectedOrder.size(); i++) {
-    //   MLLM_INFO("    [{}] {}", i, expectedOrder[i]);
-    // }
-    // MLLM_INFO("  QNN Output Order ({} outputs):", model->getGraphOutputTensorWrappers().size());
-    // for (size_t i = 0; i < model->getGraphOutputTensorWrappers().size(); i++) {
-    //   auto wrapper = model->getGraphOutputTensorWrappers()[i];
-    //   MLLM_INFO("    [{}] {}", i, wrapper->getName());
-    // }
-
-    // Check if reordering is needed
-    // bool needs_reordering = false;
-    // std::vector<std::pair<size_t, int>> mismatches;
-    // for (size_t i = 0; i < expectedOrder.size(); i++) {
-    //   const std::string& expected_name = expectedOrder[i];
-    //   int qnn_index = model->getQnnOutputIndex(expected_name);
-    //   if (qnn_index >= 0 && qnn_index < static_cast<int>(qnn_output_tensors.size())) {
-    //     if (static_cast<int>(i) != qnn_index) {
-    //       needs_reordering = true;
-    //       mismatches.emplace_back(i, qnn_index);
-    //     }
-    //   }
-    // }
-
-    // Debug: Verification messages
-    // Uncomment below for debugging output order issues
-    // if (needs_reordering) {
-    //   MLLM_INFO("  [VERIFICATION] QNN output order DIFFERS from MLLM expected order - REORDERING REQUIRED");
-    //   for (const auto& [mllm_idx, qnn_idx] : mismatches) {
-    //     MLLM_INFO("    Mismatch: MLLM[{}] expects '{}' but it's at QNN[{}]",
-    //               mllm_idx, expectedOrder[mllm_idx], qnn_idx);
-    //   }
-    // } else {
-    //   MLLM_INFO("  [VERIFICATION] QNN output order MATCHES MLLM expected order - no reordering needed");
-    // }
-
     // Reorder outputs according to expected order
     for (size_t i = 0; i < expectedOrder.size(); i++) {
       const std::string& expected_name = expectedOrder[i];
       int qnn_index = model->getQnnOutputIndex(expected_name);
       if (qnn_index >= 0 && qnn_index < static_cast<int>(qnn_output_tensors.size())) {
         outputs[i] = qnn_output_tensors[qnn_index];
-        // Debug: Mapping information
-        // Uncomment below for debugging output order issues
-        // if (static_cast<int>(i) != qnn_index) {
-        //   MLLM_INFO("    Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [REORDERED]", i, qnn_index, expected_name);
-        // } else {
-        //   MLLM_INFO("    Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [SAME]", i, qnn_index, expected_name);
-        // }
       } else {
         MLLM_ERROR("QNNBackend::graphExecute: Failed to find QNN output index for tensor '{}' in graph '{}'", expected_name,
                    graphName);