Skip to content

Commit 10cb004

Browse files
tqchen and junrushao
authored
[CUDA] Isolate unified api to only in cubin launcher (#408)
This PR isolates out the unified api to be only local to cubin launcher. Background: it is generally error-prone to mix the driver and runtime API. The particular unified api switch was mainly meant to be used in cubin launcher for a narrow set of cuda versions(around 12.8 ish to 13.0). However, we would like the most generic macros like TVM_FFI_CHECK_CUDA_ERROR to be specific to runtime API. We should revisit if we should simply deprecate driver API usages for better maintainability. --------- Co-authored-by: Junru Shao <[email protected]>
1 parent 91fcaa8 commit 10cb004

File tree

13 files changed

+53
-33
lines changed

13 files changed

+53
-33
lines changed

.github/workflows/ci_test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ jobs:
5151
id: cpp_files
5252
run: |
5353
FILES=$(git diff --name-only --diff-filter=ACMR origin/${{ github.base_ref }}...HEAD -- \
54-
'*.c' '*.cc' '*.cpp' '*.cxx' | tr '\n' ' ')
54+
src/ tests/ | grep -E '\.(c|cc|cpp|cxx)$' | tr '\n' ' ')
5555
echo "files=$FILES" >> $GITHUB_OUTPUT
5656
[ -n "$FILES" ] && echo "changed=true" >> $GITHUB_OUTPUT || echo "changed=false" >> $GITHUB_OUTPUT
5757

docs/guides/cubin_launcher.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ To use dynamic shared memory, specify the size in the :cpp:func:`tvm::ffi::Cubin
376376
377377
// Allocate 1KB of dynamic shared memory
378378
uint32_t shared_mem_bytes = 1024;
379-
cudaError_t result = kernel.Launch(args, grid, block, stream, shared_mem_bytes);
379+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(kernel.Launch(args, grid, block, stream, shared_mem_bytes));
380380
381381
Integration with Different Compilers
382382
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

examples/cubin_launcher/dynamic_cubin/src/lib_dynamic.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
8585

8686
// Launch kernel
8787
tvm::ffi::cuda_api::ResultType result = g_add_one_kernel->Launch(args, grid, block, stream);
88-
TVM_FFI_CHECK_CUDA_ERROR(result);
88+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
8989
}
9090

9191
} // namespace cubin_dynamic
@@ -125,7 +125,7 @@ void MulTwo(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
125125

126126
// Launch kernel
127127
tvm::ffi::cuda_api::ResultType result = g_mul_two_kernel->Launch(args, grid, block, stream);
128-
TVM_FFI_CHECK_CUDA_ERROR(result);
128+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
129129
}
130130

131131
// Export TVM-FFI functions

examples/cubin_launcher/embedded_cubin/cpp_embed/src/lib_embedded.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
7373

7474
// Launch kernel
7575
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
76-
TVM_FFI_CHECK_CUDA_ERROR(result);
76+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
7777
}
7878

7979
} // namespace cubin_embedded
@@ -112,7 +112,7 @@ void MulTwo(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
112112

113113
// Launch kernel
114114
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
115-
TVM_FFI_CHECK_CUDA_ERROR(result);
115+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
116116
}
117117

118118
// Export TVM-FFI functions

examples/cubin_launcher/embedded_cubin/embed_with_tvm_ffi/src/lib_embedded.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
7070

7171
// Launch kernel
7272
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
73-
TVM_FFI_CHECK_CUDA_ERROR(result);
73+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
7474
}
7575

7676
} // namespace cubin_embedded
@@ -109,7 +109,7 @@ void MulTwo(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
109109

110110
// Launch kernel
111111
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
112-
TVM_FFI_CHECK_CUDA_ERROR(result);
112+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
113113
}
114114

115115
// Export TVM-FFI functions

examples/cubin_launcher/embedded_cubin/include_bin2c/src/lib_embedded.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
7070

7171
// Launch kernel
7272
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
73-
TVM_FFI_CHECK_CUDA_ERROR(result);
73+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
7474
}
7575

7676
} // namespace cubin_embedded
@@ -109,7 +109,7 @@ void MulTwo(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
109109

110110
// Launch kernel
111111
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
112-
TVM_FFI_CHECK_CUDA_ERROR(result);
112+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
113113
}
114114

115115
// Export TVM-FFI functions

examples/cubin_launcher/example_nvrtc_cubin.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,7 @@ def use_cubin_kernel(cubin_bytes: bytes) -> int:
129129
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetStream(device.device_type, device.device_id));
130130
131131
// Launch kernel
132-
cudaError_t result = kernel.Launch(args, grid, block, stream);
133-
TVM_FFI_CHECK_CUDA_ERROR(result);
132+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(kernel.Launch(args, grid, block, stream));
134133
}
135134
136135
void MulTwo(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
@@ -158,8 +157,7 @@ def use_cubin_kernel(cubin_bytes: bytes) -> int:
158157
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetStream(device.device_type, device.device_id));
159158
160159
// Launch kernel
161-
cudaError_t result = kernel.Launch(args, grid, block, stream);
162-
TVM_FFI_CHECK_CUDA_ERROR(result);
160+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(kernel.Launch(args, grid, block, stream));
163161
}
164162
165163
} // namespace nvrtc_loader

examples/cubin_launcher/example_triton_cubin.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,7 @@ def use_cubin_kernel(cubin_bytes: bytes) -> int:
133133
DLDevice device = x.device();
134134
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetStream(device.device_type, device.device_id));
135135
136-
cudaError_t result = kernel.Launch(args, grid, block, stream);
137-
TVM_FFI_CHECK_CUDA_ERROR(result);
136+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(kernel.Launch(args, grid, block, stream));
138137
}
139138
140139
} // namespace triton_loader

include/tvm/ffi/extra/cuda/base.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,31 @@
2323
#ifndef TVM_FFI_EXTRA_CUDA_BASE_H_
2424
#define TVM_FFI_EXTRA_CUDA_BASE_H_
2525

26+
#include <cuda_runtime.h>
27+
#include <tvm/ffi/error.h>
28+
2629
namespace tvm {
2730
namespace ffi {
2831

32+
/*!
33+
* \brief Macro for checking CUDA runtime API errors.
34+
*
35+
* This macro checks the return value of CUDA runtime API calls and throws
36+
* a RuntimeError with detailed error information if the call fails.
37+
*
38+
* \param stmt The CUDA runtime API call to check.
39+
*/
40+
#define TVM_FFI_CHECK_CUDA_ERROR(stmt) \
41+
do { \
42+
cudaError_t __err = (stmt); \
43+
if (__err != cudaSuccess) { \
44+
const char* __err_name = cudaGetErrorName(__err); \
45+
const char* __err_str = cudaGetErrorString(__err); \
46+
TVM_FFI_THROW(RuntimeError) << "CUDA Runtime Error: " << __err_name << " (" \
47+
<< static_cast<int>(__err) << "): " << __err_str; \
48+
} \
49+
} while (0)
50+
2951
/*!
3052
* \brief A simple 3D dimension type for CUDA kernel launch configuration.
3153
*

include/tvm/ffi/extra/cuda/cubin_launcher.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
#ifndef TVM_FFI_EXTRA_CUDA_CUBIN_LAUNCHER_H_
3030
#define TVM_FFI_EXTRA_CUDA_CUBIN_LAUNCHER_H_
3131

32-
#include <cuda.h>
32+
#include <cuda.h> // NOLINT(clang-diagnostic-error)
3333
#include <cuda_runtime.h>
3434
#include <tvm/ffi/error.h>
3535
#include <tvm/ffi/extra/c_env_api.h>
@@ -234,7 +234,7 @@ namespace ffi {
234234
* TVMFFIEnvGetStream(device.device_type, device.device_id));
235235
*
236236
* cudaError_t result = kernel.Launch(args, grid, block, stream);
237-
* TVM_FFI_CHECK_CUDA_ERROR(result);
237+
* TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
238238
* }
239239
* \endcode
240240
*
@@ -295,7 +295,7 @@ class CubinModule {
295295
* \param bytes CUBIN binary data as a Bytes object.
296296
*/
297297
explicit CubinModule(const Bytes& bytes) {
298-
TVM_FFI_CHECK_CUDA_ERROR(cuda_api::LoadLibrary(&library_, bytes.data()));
298+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(cuda_api::LoadLibrary(&library_, bytes.data()));
299299
}
300300

301301
/*!
@@ -305,7 +305,7 @@ class CubinModule {
305305
* \note The `code` buffer points to an ELF image.
306306
*/
307307
explicit CubinModule(const char* code) {
308-
TVM_FFI_CHECK_CUDA_ERROR(cuda_api::LoadLibrary(&library_, code));
308+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(cuda_api::LoadLibrary(&library_, code));
309309
}
310310

311311
/*!
@@ -315,7 +315,7 @@ class CubinModule {
315315
* \note The `code` buffer points to an ELF image.
316316
*/
317317
explicit CubinModule(const unsigned char* code) {
318-
TVM_FFI_CHECK_CUDA_ERROR(cuda_api::LoadLibrary(&library_, code));
318+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(cuda_api::LoadLibrary(&library_, code));
319319
}
320320

321321
/*! \brief Destructor unloads the library */
@@ -418,7 +418,7 @@ class CubinModule {
418418
* // Launch on stream
419419
* cudaStream_t stream = ...;
420420
* cudaError_t result = kernel.Launch(args, grid, block, stream);
421-
* TVM_FFI_CHECK_CUDA_ERROR(result);
421+
* TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
422422
* \endcode
423423
*
424424
* \note This class is movable but not copyable.
@@ -434,7 +434,7 @@ class CubinKernel {
434434
* \param name Name of the kernel function.
435435
*/
436436
CubinKernel(cuda_api::LibraryHandle library, const char* name) {
437-
TVM_FFI_CHECK_CUDA_ERROR(cuda_api::GetKernel(&kernel_, library, name));
437+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(cuda_api::GetKernel(&kernel_, library, name));
438438
}
439439

440440
/*! \brief Destructor (kernel handle doesn't need explicit cleanup) */
@@ -464,8 +464,7 @@ class CubinKernel {
464464
* \par Error Checking
465465
* Always check the returned cudaError_t:
466466
* \code{.cpp}
467-
* cudaError_t result = kernel.Launch(args, grid, block, stream);
468-
* TVM_FFI_CHECK_CUDA_ERROR(result);
467+
* TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(kernel.Launch(args, grid, block, stream));
469468
* \endcode
470469
*
471470
* \param args Array of pointers to kernel arguments (must point to actual values).

0 commit comments

Comments
 (0)