-
Notifications
You must be signed in to change notification settings - Fork 908
Description
π Describe the bug
Hi! I tried to build the main.cpp application code of mnist digit recognizer on RISC-V architecture using executorch. The main.cpp code is located inside the executorch/examples/raspberry_pi/pico2 directory. I am building the executorch core libraries inside the executorch/cmake-out-riscv directory. I am trying a baremetal build with the below flags turned on:
cmake .. -DCMAKE_TOOLCHAIN_FILE=../examples/raspberry_pi/pico2/riscv_toolchain.cmake -DEXECUTORCH_ENABLE_LOGGING=ON -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON -DEXECUTORCH_PAL_DEFAULT=minimal -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF -DEXECUTORCH_SELECT_OPS_LIST="aten::permute_copy.out, aten::addmm.out,aten::relu.out"
make -j 4
It successfully built the ExecuTorch core libraries. Its log is shown below:
[ 96%] Building CXX object kernels/portable/CMakeFiles/portable_kernels.dir/cpu/util/stack_util.cpp.obj
[ 96%] Building CXX object kernels/portable/CMakeFiles/portable_kernels.dir/cpu/util/upsample_util.cpp.obj
[ 96%] Linking CXX static library libportable_kernels.a
[ 96%] Built target portable_kernels
[ 96%] Generating selected_operators.yaml for executorch_selected_kernels
[ 96%] Generating selected_operators.yaml for portable_ops_lib
aten::permute_copy.out,\ aten::addmm.out,aten::relu.out
[ 97%] Generating code for kernel registration
[ 98%] Generating code for kernel registration
[ 98%] Building CXX object CMakeFiles/executorch_selected_kernels.dir/executorch_selected_kernels/RegisterCodegenUnboxedKernelsEverything.cpp.obj
[ 98%] Building CXX object kernels/portable/CMakeFiles/portable_ops_lib.dir/portable_ops_lib/RegisterCodegenUnboxedKernelsEverything.cpp.obj
[ 99%] Linking CXX static library libexecutorch_selected_kernels.a
[ 99%] Built target executorch_selected_kernels
[100%] Linking CXX static library libportable_ops_lib.a
[100%] Built target portable_ops_lib
My riscv_toolchain.cmake file is shown below:
set(CMAKE_SYSTEM_NAME Generic)
set(CMAKE_SYSTEM_PROCESSOR riscv)
set(CMAKE_C_COMPILER /opt/riscv/bin/riscv64-unknown-elf-gcc)
set(CMAKE_CXX_COMPILER /opt/riscv/bin/riscv64-unknown-elf-g++)
set(CMAKE_ASM_COMPILER /opt/riscv/bin/riscv64-unknown-elf-gcc)
set(CMAKE_C_FLAGS "-march=rv64imafd_zicsr_zifencei -mabi=lp64d -mcmodel=medany --specs=sim.specs -Os -Og -ggdb -static -std=gnu99 -fno-common -fno-builtin-printf -fno-builtin-memcpy -fno-builtin-memset -lm -lgcc" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS "-march=rv64imafd_zicsr_zifencei -mabi=lp64d -mcmodel=medany --specs=sim.specs -Os -Og -ggdb -static -fno-common -fno-builtin-printf -fno-builtin-memcpy -fno-builtin-memset -lstdc++ -lm" CACHE STRING "" FORCE)My CMakeLists.txt file is located inside the executorch/examples/raspberry_pi/pico2 directory. It is shown below:
cmake_minimum_required(VERSION 3.25)
project(ExecuTorch_RISCV_Runner C CXX ASM)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Path to the ExecuTorch repository
set(HOME_DIRECTORY /home/user_name)
set(EXECUTORCH_ROOT ${HOME_DIRECTORY}/executorch)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(MODEL_PTE_C "${CMAKE_CURRENT_SOURCE_DIR}/model_pte.c")
# Get the parent directory (/home/vaibhav) so the compiler can resolve <executorch/...>
get_filename_component(EXECUTORCH_PARENT_DIR ${EXECUTORCH_ROOT} DIRECTORY)
add_executable(
riscv_mnist_runner main.cpp ${MODEL_PTE_C} # Use the full path to be explicit
)
# 4. Include directories
target_include_directories(riscv_mnist_runner PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${EXECUTORCH_PARENT_DIR}
${EXECUTORCH_ROOT}/third-party
${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
${HOME_DIRECTORY}/SecureIoT_ISP/SecureIoT_Apps/include
)
target_compile_definitions(
riscv_mnist_runner
PRIVATE C10_USING_CUSTOM_GENERATED_MACROS EXECUTORCH_ENABLE_LOGGING=OFF
EXECUTORCH_PAL_DEFAULT=minimal
)
# # Optimization flags
target_compile_options(
riscv_mnist_runner PRIVATE -Os -ffunction-sections -fdata-sections -mcmodel=medany
)
target_link_options(riscv_mnist_runner PRIVATE -mcmodel=medany)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections")
set(EXECUTORCH_BUILD_DIR ${EXECUTORCH_ROOT}/cmake-out-riscv)
#our SDK's .a file
set(SECUREIOT_SDK_LIB "${HOME_DIRECTORY}/SecureIoT_ISP/SecureIoT_SDK/lib/libsecureiot.a")
#our linker script link.ld
target_link_options(riscv_mnist_runner PRIVATE -T${HOME_DIRECTORY}/SecureIoT_ISP/SecureIoT_Apps/scripts/link.ld -nostartfiles)
# 5. Link the core ExecuTorch library
target_link_libraries(riscv_mnist_runner
PRIVATE
${SECUREIOT_SDK_LIB}
${EXECUTORCH_BUILD_DIR}/libexecutorch.a
${EXECUTORCH_BUILD_DIR}/libexecutorch_core.a
${EXECUTORCH_BUILD_DIR}/libexecutorch_selected_kernels.a
#-Wl,--whole-archive
${EXECUTORCH_BUILD_DIR}/kernels/portable/libportable_ops_lib.a
#-Wl,--no-whole-archive
${EXECUTORCH_BUILD_DIR}/kernels/portable/libportable_kernels.a
)
set_target_properties(riscv_mnist_runner PROPERTIES SUFFIX ".elf")
add_custom_command(
TARGET riscv_mnist_runner POST_BUILD
COMMAND riscv64-unknown-elf-elf2bin
--input riscv_mnist_runner.elf
--output riscv_mnist_runner.bin
COMMENT "Converting ELF to BIN using elf2bin"
)
add_custom_command(
TARGET riscv_mnist_runner POST_BUILD
COMMAND riscv64-unknown-elf-objdump
-d riscv_mnist_runner.elf > riscv_mnist_runner.disass
COMMENT "GETTING THE OBJECT DUMP using objdump"
)In the above CMakeLists.txt file I commented out -Wl,--whole-archive flag as it bloats the size of the binary executable.
Inside the executorch/examples/raspberry_pi/pico2 directory, I ran:
cmake -B build/ -DCMAKE_TOOLCHAIN_FILE=riscv_toolchain.cmake Inside the executorch/examples/raspberry_pi/pico2/build directory, I ran:
make -j 4 It successfully built the binary. Its log is shown below:
[ 33%] Building CXX object CMakeFiles/riscv_mnist_runner.dir/main.cpp.obj
[ 66%] Building C object CMakeFiles/riscv_mnist_runner.dir/model_pte.c.obj
[100%] Linking CXX executable riscv_mnist_runner.elf
/opt/riscv/lib/gcc/riscv64-unknown-elf/15.2.0/../../../../riscv64-unknown-elf/bin/ld: warning: riscv_mnist_runner.elf has a LOAD segment with RWX permissions
Converting ELF to BIN using elf2bin
GETTING THE OBJECT DUMP using objdump
The main.cpp application code of mnist digit recognizer for digits 0,1,4,7 inside executorch/examples/raspberry_pi/pico2 directory is shown below:
// Model data
#include "model_pte.h"
// // Standard C/C++ includes
#include <memory>
// // Executorch includes
#include <executorch/extension/data_loader/buffer_data_loader.h>
#include <executorch/runtime/core/portable_type/scalar_type.h>
#include <executorch/runtime/executor/memory_manager.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/runtime.h>
using namespace executorch::runtime;
using executorch::aten::Tensor;
using executorch::aten::TensorImpl;
using ScalarType = executorch::runtime::etensor::ScalarType;
using executorch::runtime::runtime_init;
#include "io.h"
bool load_and_prepare_model(
std::unique_ptr<Program>& program_ptr,
std::unique_ptr<Method>& method_ptr,
MemoryManager& memory_manager) {
printf("Loading model data (%u bytes)...\n", (unsigned int)model_pte_len);
executorch::extension::BufferDataLoader loader(model_pte, model_pte_len);
auto program_result = Program::load(&loader);
if (!program_result.ok()) {
printf("β Failed to load model: error %d\n", (int)program_result.error());
// Print more detailed error info
switch (program_result.error()) {
case Error::InvalidProgram:
printf(" β Invalid program format\n");
break;
case Error::InvalidState:
printf(" β Invalid state\n");
break;
case Error::NotSupported:
printf(" β Feature not supported\n");
break;
case Error::NotFound:
printf(" β Resource not found\n");
break;
case Error::InvalidArgument:
printf(" β Invalid argument\n");
break;
default:
printf(" β Unknown error code: %d\n", (int)program_result.error());
}
return false;
}
program_ptr = std::make_unique<Program>(std::move(*program_result));
printf("β
Program loaded successfully\n");
// Get method count and names
printf("π Program info:\n");
printf(" Method count: %lu \n", (unsigned long)program_ptr->num_methods());
auto method_name_result = program_ptr->get_method_name(0);
if (!method_name_result.ok()) {
printf(
"β Failed to get method name: error %d\n",
(int)method_name_result.error());
return false;
}
printf(" Method 0 name: %s\n", *method_name_result);
// Try to load the method - this is where operator errors usually happen
printf("π Loading method '%s'...\n", *method_name_result);
auto method_result =
program_ptr->load_method(*method_name_result, &memory_manager);
if (!method_result.ok()) {
printf("β Failed to load method: error %d\n", (int)method_result.error());
// More detailed method loading errors
switch (method_result.error()) {
case Error::InvalidProgram:
printf(" β Method has invalid program structure\n");
break;
case Error::InvalidState:
printf(" β Method in invalid state\n");
break;
case Error::NotSupported:
printf(" β Method uses unsupported operators\n");
printf(
" β This usually means missing operators in selective build!\n");
break;
case Error::NotFound:
printf(" β Method resource not found\n");
break;
case Error::MemoryAllocationFailed:
printf(" β Not enough memory to load method\n");
break;
case Error::OperatorMissing:
printf(" β Operator missing\n");
break;
default:
printf(" β Unknown method error: %d\n", (int)method_result.error());
}
return false;
}
method_ptr = std::make_unique<Method>(std::move(*method_result));
printf("β
Method '%s' loaded successfully\n", *method_name_result);
return true;
}
bool run_inference(Method& method) {
printf(
"π₯ ExecuTorch MLP MNIST Demo (Neural network) on Bare-metal RISC-V π₯\n");
// ASCII art for digit '0' (28x28)
const char* ascii_digit_0[28] = {
" ", " ############ ",
" ################## ", " ###################### ",
" ######################## ", " #### #### ",
" #### #### ", " #### #### ",
"#### ####", "#### ####",
"#### ####", "#### ####",
"#### ####", "#### ####",
"#### ####", "#### ####",
"#### ####", "#### ####",
"#### ####", "#### ####",
" #### #### ", " #### #### ",
" #### #### ", " ######################## ",
" ###################### ", " ################## ",
" ############ ", " "};
const char* ascii_digit_1[28] = {
" #### ", " ##### ",
" ###### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" ############ ", " ############ ",
" ############ ", " "};
const char* ascii_digit_4[28] = {
" ", " #### ",
" ##### ", " ###### ",
" ####### ", " #### #### ",
" #### #### ", " #### #### ",
" #### #### ", " #### #### ",
" #### #### ", " #### #### ",
" #### #### ", " #### #### ",
" ###################### ", " ###################### ",
" ###################### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " "};
const char* ascii_digit_7[28] = {
"############################", "############################",
" ####", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
" #### ", " #### ",
"#### ", "### "};
// Test patterns
struct TestCase {
const char** pattern;
const char* name;
int expected_digit;
};
TestCase test_cases[] = {
{ascii_digit_0, "Digit 0", 0},
{ascii_digit_1, "Digit 1", 1},
{ascii_digit_4, "Digit 4", 4},
{ascii_digit_7, "Digit 7", 7}};
printf("π§ͺ Testing all supported digits:\n\n");
for (int test = 0; test < 4; test++) {
const char** ascii_digit = test_cases[test].pattern;
const char* digit_name = test_cases[test].name;
int expected = test_cases[test].expected_digit;
// Display the ASCII digit
printf("=== %s ===\n", digit_name);
for (int i = 0; i < 28; i++) {
printf("%s\n", ascii_digit[i]);
}
printf("\n");
// Convert ASCII to 28x28 float tensor
float input_data[784]; // 28*28 = 784
for (int row = 0; row < 28; row++) {
for (int col = 0; col < 28; col++) {
char pixel = ascii_digit[row][col];
input_data[row * 28 + col] = (pixel == '#') ? 1.0f : 0.0f;
}
}
// Count white pixels
int white_pixels = 0;
for (int i = 0; i < 784; i++) {
if (input_data[i] > 0.5f)
white_pixels++;
}
printf("Input stats: %d white pixels out of 784 total\n", white_pixels);
// Create input tensor: [1, 28, 28]
TensorImpl::SizesType input_sizes[3] = {1, 28, 28};
TensorImpl::DimOrderType dim_order[3] = {0, 1, 2};
TensorImpl input_impl(
ScalarType::Float,
3, // 3 dimensions: [batch, height, width]
input_sizes, // [1, 28, 28]
input_data,
dim_order);
Tensor input(&input_impl);
// Set input and run inference
printf("Running neural network inference...\n");
auto result = method.set_input(input, 0);
if (result != Error::Ok) {
printf("β Failed to set input: error %d\n", (int)result);
return false;
}
result = method.execute();
if (result != Error::Ok) {
printf("β Failed to execute: error %d\n", (int)result);
return false;
}
auto output_evalue = method.get_output(0);
if (!output_evalue.isTensor()) {
printf("β Output is not a tensor\n");
return false;
}
// Extract tensor from EValue
Tensor output = output_evalue.toTensor();
float* output_data = output.mutable_data_ptr<float>();
// Find digit with highest score
int predicted_digit = 0;
float max_score = output_data[0];
for (int i = 1; i < 10; i++) {
if (output_data[i] > max_score) {
max_score = output_data[i];
predicted_digit = i;
}
}
// Display results
printf("β
Neural network results:\n");
for (int i = 0; i < 10; i++) {
printf(" Digit %d: %.3f", i, output_data[i]);
if (i == predicted_digit)
printf(" β PREDICTED");
printf("\n");
}
// Check if correct
printf("\nπ― PREDICTED: %d (Expected: %d) ", predicted_digit, expected);
if (predicted_digit == expected) {
printf("β
CORRECT!\n");
} else {
printf("β WRONG!\n");
}
printf("\n==================================================\n\n");
}
printf(
"π All tests complete! ExecuTorch inference of neural network works on RISC-V!\n");
return true;
}
int main() {
printf("HELLO WORLD FROM EXECUTORCH WITH EXECUTORCH RUNTIME !!!!!!\n");
runtime_init();
// // Allocation memory pools for bare-metal limits
static uint8_t method_allocator_pool[200 * 1024]; // 200KB - plenty for method metadata
static uint8_t activation_pool[200 * 1024]; // 200KB - plenty for activations
MemoryAllocator method_allocator(
sizeof(method_allocator_pool), method_allocator_pool);
method_allocator.enable_profiling("method allocator");
Span<uint8_t> memory_planned_buffers[1]{
{activation_pool, sizeof(activation_pool)}};
HierarchicalAllocator planned_memory({memory_planned_buffers, 1});
MemoryManager memory_manager(&method_allocator, &planned_memory);
printf("PROGRAM HAS GONE PAST MEMORY ALLOCATOR !!! \n");
std::unique_ptr<Program> program_ptr;
std::unique_ptr<Method> method_ptr;
printf("PROGRAM IS USING UNIQUE POINTER !!! \n");
if (!load_and_prepare_model(program_ptr, method_ptr, memory_manager)) {
printf("Failed to load and prepare model\n");
return 1;
}
if (!run_inference(*method_ptr)) {
printf("Failed to run inference\n");
return 1;
}
return 0;
}
I got the below prints after executing the riscv_mnist_runner.elf binary on the FPGA:
HELLO`WORLD FROM EXECUTORCH WITH EXECUTORCH RUNTIME !!!!!!
PROGRAM HAS GONE PAST MEMORY ALLOCATOR !!!
PROGRAM IS USING UNIQUE POINTER !!!
Loading model data (106216 bytes)...
β
Program loaded successfully
π Program info:
Method count: 1
Method 0 name: forward
π Loading method 'forward'...
β Failed to load method: error 20
β Operator missing
Failed to load and prepare model
I don't know why I am getting the above "Operator missing" error.
I tried to build this without the -DEXECUTORCH_SELECT_OPS_LIST flag. I still got the same error.
Versions
Collecting environment information...
PyTorch version: 2.11.0+cu130
Is debug build: False
CUDA used to build PyTorch: 13.0
ROCM used to build PyTorch: N/A
OS: Ubuntu 24.04.3 LTS (x86_64)
GCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0
Clang version: Could not collect
CMake version: version 3.28.3
Libc version: glibc-2.39
Python version: 3.12.13 | packaged by Anaconda, Inc. | (main, Mar 19 2026, 20:20:58) [GCC 14.3.0] (64-bit runtime)
Python platform: Linux-6.17.0-19-generic-x86_64-with-glibc2.39
Is CUDA available: False
CUDA runtime version: No CUDA
CUDA_MODULE_LOADING set to: N/A
GPU models and configuration: No CUDA
Nvidia driver version: No CUDA
cuDNN version: No CUDA
Is XPU available: False
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True
Caching allocator config: N/A
CPU:
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Address sizes: 48 bits physical, 48 bits virtual
Byte Order: Little Endian
CPU(s): 12
On-line CPU(s) list: 0-11
Vendor ID: AuthenticAMD
Model name: AMD Ryzen 5 8600G w/ Radeon 760M Graphics
CPU family: 25
Model: 117
Thread(s) per core: 2
Core(s) per socket: 6
Socket(s): 1
Stepping: 2
Frequency boost: enabled
CPU(s) scaling MHz: 63%
CPU max MHz: 5076.1670
CPU min MHz: 414.3810
BogoMIPS: 8700.70
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good amd_lbr_v2 nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpuid_fault cpb cat_l3 cdp_l3 hw_pstate ssbd mba perfmon_v2 ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local user_shstk avx512_bf16 clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic vgif x2avic v_spec_ctrl vnmi avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca fsrm flush_l1d
Virtualization: AMD-V
L1d cache: 192 KiB (6 instances)
L1i cache: 192 KiB (6 instances)
L2 cache: 6 MiB (6 instances)
L3 cache: 16 MiB (1 instance)
NUMA node(s): 1
NUMA node0 CPU(s): 0-11
Vulnerability Gather data sampling: Not affected
Vulnerability Ghostwrite: Not affected
Vulnerability Indirect target selection: Not affected
Vulnerability Itlb multihit: Not affected
Vulnerability L1tf: Not affected
Vulnerability Mds: Not affected
Vulnerability Meltdown: Not affected
Vulnerability Mmio stale data: Not affected
Vulnerability Old microcode: Not affected
Vulnerability Reg file data sampling: Not affected
Vulnerability Retbleed: Not affected
Vulnerability Spec rstack overflow: Mitigation; Safe RET
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl
Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
Vulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; PBRSB-eIBRS Not affected; BHI Not affected
Vulnerability Srbds: Not affected
Vulnerability Tsa: Vulnerable: No microcode
Vulnerability Tsx async abort: Not affected
Vulnerability Vmscape: Mitigation; IBPB before exit to userspace
Versions of relevant libraries:
[pip3] executorch==1.1.0
[pip3] numpy==2.4.3
[pip3] nvidia-cublas==13.1.0.3
[pip3] nvidia-cuda-cupti==13.0.85
[pip3] nvidia-cuda-nvrtc==13.0.88
[pip3] nvidia-cuda-runtime==13.0.96
[pip3] nvidia-cudnn-cu13==9.19.0.56
[pip3] nvidia-cufft==12.0.0.61
[pip3] nvidia-curand==10.4.0.35
[pip3] nvidia-cusolver==12.0.4.66
[pip3] nvidia-cusparse==12.6.3.3
[pip3] nvidia-cusparselt-cu13==0.8.0
[pip3] nvidia-nccl-cu13==2.28.9
[pip3] nvidia-nvjitlink==13.0.88
[pip3] nvidia-nvtx==13.0.85
[pip3] pytorch_tokenizers==1.1.0
[pip3] torch==2.11.0
[pip3] torchao==0.15.0
[pip3] torchvision==0.26.0
[pip3] triton==3.6.0
[conda] executorch 1.1.0 pypi_0 pypi
[conda] numpy 2.4.3 pypi_0 pypi
[conda] nvidia-cublas 13.1.0.3 pypi_0 pypi
[conda] nvidia-cuda-cupti 13.0.85 pypi_0 pypi
[conda] nvidia-cuda-nvrtc 13.0.88 pypi_0 pypi
[conda] nvidia-cuda-runtime 13.0.96 pypi_0 pypi
[conda] nvidia-cudnn-cu13 9.19.0.56 pypi_0 pypi
[conda] nvidia-cufft 12.0.0.61 pypi_0 pypi
[conda] nvidia-curand 10.4.0.35 pypi_0 pypi
[conda] nvidia-cusolver 12.0.4.66 pypi_0 pypi
[conda] nvidia-cusparse 12.6.3.3 pypi_0 pypi
[conda] nvidia-cusparselt-cu13 0.8.0 pypi_0 pypi
[conda] nvidia-nccl-cu13 2.28.9 pypi_0 pypi
[conda] nvidia-nvjitlink 13.0.88 pypi_0 pypi
[conda] nvidia-nvtx 13.0.85 pypi_0 pypi
[conda] pytorch-tokenizers 1.1.0 pypi_0 pypi
[conda] torch 2.11.0 pypi_0 pypi
[conda] torchao 0.15.0 pypi_0 pypi
[conda] torchvision 0.26.0 pypi_0 pypi
[conda] triton 3.6.0 pypi_0 pypi