Skip to content

Commit 10cb004

Browse files
tqchen and junrushao
authored
[CUDA] Isolate unified api to only in cubin launcher (#408)
This PR isolates out the unified api to be only local to cubin launcher. Background: it is generally error-prone to mix the driver and runtime API. The particular unified api switch was mainly meant to be used in cubin launcher for a narrow set of cuda versions(around 12.8 ish to 13.0). However, we would like the most generic macros like TVM_FFI_CHECK_CUDA_ERROR to be specific to runtime API. We should revisit if we should simply deprecate driver API usages for better maintainability. --------- Co-authored-by: Junru Shao <[email protected]>
1 parent 91fcaa8 commit 10cb004

File tree

13 files changed

+53
-33
lines changed

13 files changed

+53
-33
lines changed

.github/workflows/ci_test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ jobs:
5151
id: cpp_files
5252
run: |
5353
FILES=$(git diff --name-only --diff-filter=ACMR origin/${{ github.base_ref }}...HEAD -- \
54-
'*.c' '*.cc' '*.cpp' '*.cxx' | tr '\n' ' ')
54+
src/ tests/ | grep -E '\.(c|cc|cpp|cxx)$' | tr '\n' ' ')
5555
echo "files=$FILES" >> $GITHUB_OUTPUT
5656
[ -n "$FILES" ] && echo "changed=true" >> $GITHUB_OUTPUT || echo "changed=false" >> $GITHUB_OUTPUT
5757

docs/guides/cubin_launcher.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ To use dynamic shared memory, specify the size in the :cpp:func:`tvm::ffi::Cubin
376376
377377
// Allocate 1KB of dynamic shared memory
378378
uint32_t shared_mem_bytes = 1024;
379-
cudaError_t result = kernel.Launch(args, grid, block, stream, shared_mem_bytes);
379+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(kernel.Launch(args, grid, block, stream, shared_mem_bytes));
380380
381381
Integration with Different Compilers
382382
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

examples/cubin_launcher/dynamic_cubin/src/lib_dynamic.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
8585

8686
// Launch kernel
8787
tvm::ffi::cuda_api::ResultType result = g_add_one_kernel->Launch(args, grid, block, stream);
88-
TVM_FFI_CHECK_CUDA_ERROR(result);
88+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
8989
}
9090

9191
} // namespace cubin_dynamic
@@ -125,7 +125,7 @@ void MulTwo(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
125125

126126
// Launch kernel
127127
tvm::ffi::cuda_api::ResultType result = g_mul_two_kernel->Launch(args, grid, block, stream);
128-
TVM_FFI_CHECK_CUDA_ERROR(result);
128+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
129129
}
130130

131131
// Export TVM-FFI functions

examples/cubin_launcher/embedded_cubin/cpp_embed/src/lib_embedded.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
7373

7474
// Launch kernel
7575
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
76-
TVM_FFI_CHECK_CUDA_ERROR(result);
76+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
7777
}
7878

7979
} // namespace cubin_embedded
@@ -112,7 +112,7 @@ void MulTwo(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
112112

113113
// Launch kernel
114114
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
115-
TVM_FFI_CHECK_CUDA_ERROR(result);
115+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
116116
}
117117

118118
// Export TVM-FFI functions

examples/cubin_launcher/embedded_cubin/embed_with_tvm_ffi/src/lib_embedded.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
7070

7171
// Launch kernel
7272
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
73-
TVM_FFI_CHECK_CUDA_ERROR(result);
73+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
7474
}
7575

7676
} // namespace cubin_embedded
@@ -109,7 +109,7 @@ void MulTwo(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
109109

110110
// Launch kernel
111111
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
112-
TVM_FFI_CHECK_CUDA_ERROR(result);
112+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
113113
}
114114

115115
// Export TVM-FFI functions

examples/cubin_launcher/embedded_cubin/include_bin2c/src/lib_embedded.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
7070

7171
// Launch kernel
7272
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
73-
TVM_FFI_CHECK_CUDA_ERROR(result);
73+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
7474
}
7575

7676
} // namespace cubin_embedded
@@ -109,7 +109,7 @@ void MulTwo(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
109109

110110
// Launch kernel
111111
tvm::ffi::cuda_api::ResultType result = kernel.Launch(args, grid, block, stream);
112-
TVM_FFI_CHECK_CUDA_ERROR(result);
112+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
113113
}
114114

115115
// Export TVM-FFI functions

examples/cubin_launcher/example_nvrtc_cubin.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,7 @@ def use_cubin_kernel(cubin_bytes: bytes) -> int:
129129
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetStream(device.device_type, device.device_id));
130130
131131
// Launch kernel
132-
cudaError_t result = kernel.Launch(args, grid, block, stream);
133-
TVM_FFI_CHECK_CUDA_ERROR(result);
132+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(kernel.Launch(args, grid, block, stream));
134133
}
135134
136135
void MulTwo(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
@@ -158,8 +157,7 @@ def use_cubin_kernel(cubin_bytes: bytes) -> int:
158157
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetStream(device.device_type, device.device_id));
159158
160159
// Launch kernel
161-
cudaError_t result = kernel.Launch(args, grid, block, stream);
162-
TVM_FFI_CHECK_CUDA_ERROR(result);
160+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(kernel.Launch(args, grid, block, stream));
163161
}
164162
165163
} // namespace nvrtc_loader

examples/cubin_launcher/example_triton_cubin.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,7 @@ def use_cubin_kernel(cubin_bytes: bytes) -> int:
133133
DLDevice device = x.device();
134134
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetStream(device.device_type, device.device_id));
135135
136-
cudaError_t result = kernel.Launch(args, grid, block, stream);
137-
TVM_FFI_CHECK_CUDA_ERROR(result);
136+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(kernel.Launch(args, grid, block, stream));
138137
}
139138
140139
} // namespace triton_loader

include/tvm/ffi/extra/cuda/base.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,31 @@
2323
#ifndef TVM_FFI_EXTRA_CUDA_BASE_H_
2424
#define TVM_FFI_EXTRA_CUDA_BASE_H_
2525

26+
#include <cuda_runtime.h>
27+
#include <tvm/ffi/error.h>
28+
2629
namespace tvm {
2730
namespace ffi {
2831

32+
/*!
33+
* \brief Macro for checking CUDA runtime API errors.
34+
*
35+
* This macro checks the return value of CUDA runtime API calls and throws
36+
* a RuntimeError with detailed error information if the call fails.
37+
*
38+
* \param stmt The CUDA runtime API call to check.
39+
*/
40+
#define TVM_FFI_CHECK_CUDA_ERROR(stmt) \
41+
do { \
42+
cudaError_t __err = (stmt); \
43+
if (__err != cudaSuccess) { \
44+
const char* __err_name = cudaGetErrorName(__err); \
45+
const char* __err_str = cudaGetErrorString(__err); \
46+
TVM_FFI_THROW(RuntimeError) << "CUDA Runtime Error: " << __err_name << " (" \
47+
<< static_cast<int>(__err) << "): " << __err_str; \
48+
} \
49+
} while (0)
50+
2951
/*!
3052
* \brief A simple 3D dimension type for CUDA kernel launch configuration.
3153
*

include/tvm/ffi/extra/cuda/cubin_launcher.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
#ifndef TVM_FFI_EXTRA_CUDA_CUBIN_LAUNCHER_H_
3030
#define TVM_FFI_EXTRA_CUDA_CUBIN_LAUNCHER_H_
3131

32-
#include <cuda.h>
32+
#include <cuda.h> // NOLINT(clang-diagnostic-error)
3333
#include <cuda_runtime.h>
3434
#include <tvm/ffi/error.h>
3535
#include <tvm/ffi/extra/c_env_api.h>
@@ -234,7 +234,7 @@ namespace ffi {
234234
* TVMFFIEnvGetStream(device.device_type, device.device_id));
235235
*
236236
* cudaError_t result = kernel.Launch(args, grid, block, stream);
237-
* TVM_FFI_CHECK_CUDA_ERROR(result);
237+
* TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
238238
* }
239239
* \endcode
240240
*
@@ -295,7 +295,7 @@ class CubinModule {
295295
* \param bytes CUBIN binary data as a Bytes object.
296296
*/
297297
explicit CubinModule(const Bytes& bytes) {
298-
TVM_FFI_CHECK_CUDA_ERROR(cuda_api::LoadLibrary(&library_, bytes.data()));
298+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(cuda_api::LoadLibrary(&library_, bytes.data()));
299299
}
300300

301301
/*!
@@ -305,7 +305,7 @@ class CubinModule {
305305
* \note The `code` buffer points to an ELF image.
306306
*/
307307
explicit CubinModule(const char* code) {
308-
TVM_FFI_CHECK_CUDA_ERROR(cuda_api::LoadLibrary(&library_, code));
308+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(cuda_api::LoadLibrary(&library_, code));
309309
}
310310

311311
/*!
@@ -315,7 +315,7 @@ class CubinModule {
315315
* \note The `code` buffer points to an ELF image.
316316
*/
317317
explicit CubinModule(const unsigned char* code) {
318-
TVM_FFI_CHECK_CUDA_ERROR(cuda_api::LoadLibrary(&library_, code));
318+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(cuda_api::LoadLibrary(&library_, code));
319319
}
320320

321321
/*! \brief Destructor unloads the library */
@@ -418,7 +418,7 @@ class CubinModule {
418418
* // Launch on stream
419419
* cudaStream_t stream = ...;
420420
* cudaError_t result = kernel.Launch(args, grid, block, stream);
421-
* TVM_FFI_CHECK_CUDA_ERROR(result);
421+
* TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(result);
422422
* \endcode
423423
*
424424
* \note This class is movable but not copyable.
@@ -434,7 +434,7 @@ class CubinKernel {
434434
* \param name Name of the kernel function.
435435
*/
436436
CubinKernel(cuda_api::LibraryHandle library, const char* name) {
437-
TVM_FFI_CHECK_CUDA_ERROR(cuda_api::GetKernel(&kernel_, library, name));
437+
TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(cuda_api::GetKernel(&kernel_, library, name));
438438
}
439439

440440
/*! \brief Destructor (kernel handle doesn't need explicit cleanup) */
@@ -464,8 +464,7 @@ class CubinKernel {
464464
* \par Error Checking
465465
* Always check the returned cudaError_t:
466466
* \code{.cpp}
467-
* cudaError_t result = kernel.Launch(args, grid, block, stream);
468-
* TVM_FFI_CHECK_CUDA_ERROR(result);
467+
* TVM_FFI_CHECK_CUBIN_LAUNCHER_CUDA_ERROR(kernel.Launch(args, grid, block, stream));
469468
* \endcode
470469
*
471470
* \param args Array of pointers to kernel arguments (must point to actual values).

0 commit comments

Comments
 (0)