From b768bdb836c52a668d7e646e8b244c241982fb92 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 25 Mar 2026 14:42:17 -0400 Subject: [PATCH] Doc changes from new generator --- .../cuda/bindings/_bindings/cydriver.pxd.in | 2 +- .../cuda/bindings/_bindings/cydriver.pyx.in | 2 +- .../cuda/bindings/_bindings/cynvrtc.pxd.in | 2 +- .../cuda/bindings/_bindings/cynvrtc.pyx.in | 2 +- .../cuda/bindings/_bindings/cyruntime.pxd.in | 2 +- .../cuda/bindings/_bindings/cyruntime.pyx.in | 2 +- .../bindings/_bindings/cyruntime_ptds.pxd.in | 2 +- .../bindings/_bindings/cyruntime_ptds.pyx.in | 2 +- .../cuda/bindings/_internal/_fast_enum.py | 2 +- .../cuda/bindings/_internal/cufile.pxd | 4 +- .../cuda/bindings/_internal/cufile_linux.pyx | 4 +- .../cuda/bindings/_internal/nvfatbin.pxd | 2 +- .../bindings/_internal/nvfatbin_linux.pyx | 2 +- .../bindings/_internal/nvfatbin_windows.pyx | 2 +- .../cuda/bindings/_internal/nvjitlink.pxd | 4 +- .../bindings/_internal/nvjitlink_linux.pyx | 4 +- .../bindings/_internal/nvjitlink_windows.pyx | 4 +- .../cuda/bindings/_internal/nvml.pxd | 2 +- .../cuda/bindings/_internal/nvml_linux.pyx | 2 +- .../cuda/bindings/_internal/nvml_windows.pyx | 2 +- .../cuda/bindings/_internal/nvvm.pxd | 4 +- .../cuda/bindings/_internal/nvvm_linux.pyx | 4 +- .../cuda/bindings/_internal/nvvm_windows.pyx | 4 +- cuda_bindings/cuda/bindings/cufile.pxd | 4 +- cuda_bindings/cuda/bindings/cufile.pyx | 4 +- cuda_bindings/cuda/bindings/cycufile.pxd | 4 +- cuda_bindings/cuda/bindings/cycufile.pyx | 4 +- cuda_bindings/cuda/bindings/cydriver.pxd.in | 2 +- cuda_bindings/cuda/bindings/cydriver.pyx.in | 2 +- cuda_bindings/cuda/bindings/cynvfatbin.pxd | 2 +- cuda_bindings/cuda/bindings/cynvfatbin.pyx | 2 +- cuda_bindings/cuda/bindings/cynvjitlink.pxd | 4 +- cuda_bindings/cuda/bindings/cynvjitlink.pyx | 4 +- cuda_bindings/cuda/bindings/cynvml.pxd | 2 +- cuda_bindings/cuda/bindings/cynvml.pyx | 2 +- cuda_bindings/cuda/bindings/cynvrtc.pxd.in | 2 +- 
cuda_bindings/cuda/bindings/cynvrtc.pyx.in | 2 +- cuda_bindings/cuda/bindings/cynvvm.pxd | 4 +- cuda_bindings/cuda/bindings/cynvvm.pyx | 4 +- cuda_bindings/cuda/bindings/cyruntime.pxd.in | 2 +- cuda_bindings/cuda/bindings/cyruntime.pyx.in | 2 +- .../cuda/bindings/cyruntime_functions.pxi.in | 2 +- .../cuda/bindings/cyruntime_types.pxi.in | 2 +- cuda_bindings/cuda/bindings/driver.pxd.in | 50 +- cuda_bindings/cuda/bindings/driver.pyx.in | 437 +- cuda_bindings/cuda/bindings/nvfatbin.pxd | 2 +- cuda_bindings/cuda/bindings/nvfatbin.pyx | 2 +- cuda_bindings/cuda/bindings/nvjitlink.pxd | 4 +- cuda_bindings/cuda/bindings/nvjitlink.pyx | 4 +- cuda_bindings/cuda/bindings/nvml.pxd | 2 +- cuda_bindings/cuda/bindings/nvml.pyx | 14 +- cuda_bindings/cuda/bindings/nvrtc.pxd.in | 2 +- cuda_bindings/cuda/bindings/nvrtc.pyx.in | 2 +- cuda_bindings/cuda/bindings/nvvm.pxd | 4 +- cuda_bindings/cuda/bindings/nvvm.pyx | 4 +- cuda_bindings/cuda/bindings/runtime.pxd.in | 112 +- cuda_bindings/cuda/bindings/runtime.pyx.in | 2375 ++++---- cuda_bindings/docs/source/module/driver.rst | 72 +- cuda_bindings/docs/source/module/nvrtc.rst | 6 +- cuda_bindings/docs/source/module/runtime.rst | 5022 ++++++++--------- 60 files changed, 3903 insertions(+), 4329 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in index 98298f6947..0b19de67d0 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
from cuda.bindings.cydriver cimport * {{if 'cuGetErrorString' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in index a5e73b4e99..e2e966daa2 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. {{if 'Windows' == platform.system()}} import os cimport cuda.bindings._lib.windll as windll diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in index ebe43bd922..5a53b926d1 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
from cuda.bindings.cynvrtc cimport * {{if 'nvrtcGetErrorString' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in index 6e2a977347..2e1c7a67cc 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. {{if 'Windows' == platform.system()}} import os cimport cuda.bindings._lib.windll as windll diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in index 6c04ecd8a0..01936ee0e4 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
include "../cyruntime_types.pxi" include "../_lib/cyruntime/cyruntime.pxd" diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in index 1e59279d68..0af0f731ea 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. include "../cyruntime_functions.pxi" import os diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in index f941bbf46f..53b30d026a 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
cdef extern from "": """ #define CUDA_API_PER_THREAD_DEFAULT_STREAM diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in index 273d985575..679722b3cc 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cdef extern from "": """ #define CUDA_API_PER_THREAD_DEFAULT_STREAM diff --git a/cuda_bindings/cuda/bindings/_internal/_fast_enum.py b/cuda_bindings/cuda/bindings/_internal/_fast_enum.py index 9d18d9e50b..556cd33459 100644 --- a/cuda_bindings/cuda/bindings/_internal/_fast_enum.py +++ b/cuda_bindings/cuda/bindings/_internal/_fast_enum.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. """ diff --git a/cuda_bindings/cuda/bindings/_internal/cufile.pxd b/cuda_bindings/cuda/bindings/_internal/cufile.pxd index 1a530f78c0..e642486af1 100644 --- a/cuda_bindings/cuda/bindings/_internal/cufile.pxd +++ b/cuda_bindings/cuda/bindings/_internal/cufile.pxd @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from ..cycufile cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx b/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx index 9297e9b455..a49ba1890f 100644 --- a/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t import threading diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd index 15617f8ad5..c82cc8efb7 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd +++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly. 
+# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from ..cynvfatbin cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx index f5a9bbd218..89e5015bc3 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly. +# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx index add15de561..576a2ca9a6 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly. +# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd index 1fdc2a1af7..2d3f432eeb 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from ..cynvjitlink cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx index f94b0d6aad..8c82349462 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
from libc.stdint cimport intptr_t, uintptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx index 15c536089b..8a5a7661b4 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvml.pxd b/cuda_bindings/cuda/bindings/_internal/nvml.pxd index 697687c46b..bf400e94fb 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvml.pxd +++ b/cuda_bindings/cuda/bindings/_internal/nvml.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
from ..cynvml cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx index 2eab899835..943debd6d3 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx index cedae45ada..a2021424ec 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm.pxd b/cuda_bindings/cuda/bindings/_internal/nvvm.pxd index 0850fa7633..4d9ff45e70 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm.pxd +++ b/cuda_bindings/cuda/bindings/_internal/nvvm.pxd @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from ..cynvvm cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx index 8b851960bb..2a1d155862 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx index 4cb39f55d4..6170157545 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/cufile.pxd b/cuda_bindings/cuda/bindings/cufile.pxd index 843685a22b..4af1265c78 100644 --- a/cuda_bindings/cuda/bindings/cufile.pxd +++ b/cuda_bindings/cuda/bindings/cufile.pxd @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/cufile.pyx b/cuda_bindings/cuda/bindings/cufile.pyx index 87f55ebac7..1045abef95 100644 --- a/cuda_bindings/cuda/bindings/cufile.pyx +++ b/cuda_bindings/cuda/bindings/cufile.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. 
+# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cimport cython # NOQA from libc cimport errno diff --git a/cuda_bindings/cuda/bindings/cycufile.pxd b/cuda_bindings/cuda/bindings/cycufile.pxd index 21be588cff..bcc2e7596d 100644 --- a/cuda_bindings/cuda/bindings/cycufile.pxd +++ b/cuda_bindings/cuda/bindings/cycufile.pxd @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport uint32_t, uint64_t from libc.time cimport time_t diff --git a/cuda_bindings/cuda/bindings/cycufile.pyx b/cuda_bindings/cuda/bindings/cycufile.pyx index 3e959b19d5..4b6e267ada 100644 --- a/cuda_bindings/cuda/bindings/cycufile.pyx +++ b/cuda_bindings/cuda/bindings/cycufile.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
from ._internal cimport cufile as _cufile diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in index 43e09ccd53..6ed16b51ba 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport uint32_t, uint64_t diff --git a/cuda_bindings/cuda/bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/cydriver.pyx.in index d1d7efa23b..527fd10d05 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/cydriver.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cimport cuda.bindings._bindings.cydriver as cydriver {{if 'cuGetErrorString' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pxd b/cuda_bindings/cuda/bindings/cynvfatbin.pxd index 3cf5c542e2..197e0bb67c 100644 --- a/cuda_bindings/cuda/bindings/cynvfatbin.pxd +++ b/cuda_bindings/cuda/bindings/cynvfatbin.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. 
Do not modify it directly. +# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t, uint32_t diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pyx b/cuda_bindings/cuda/bindings/cynvfatbin.pyx index 07492e51a9..d382045a2b 100644 --- a/cuda_bindings/cuda/bindings/cynvfatbin.pyx +++ b/cuda_bindings/cuda/bindings/cynvfatbin.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly. +# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from ._internal cimport nvfatbin as _nvfatbin diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd index 16bc50eeff..8e94647775 100644 --- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd +++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
from libc.stdint cimport intptr_t, uint32_t diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx index ae736eeb20..240520efa6 100644 --- a/cuda_bindings/cuda/bindings/cynvjitlink.pyx +++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from ._internal cimport nvjitlink as _nvjitlink diff --git a/cuda_bindings/cuda/bindings/cynvml.pxd b/cuda_bindings/cuda/bindings/cynvml.pxd index 2a95fc96c8..ece2801de4 100644 --- a/cuda_bindings/cuda/bindings/cynvml.pxd +++ b/cuda_bindings/cuda/bindings/cynvml.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
from libc.stdint cimport int64_t diff --git a/cuda_bindings/cuda/bindings/cynvml.pyx b/cuda_bindings/cuda/bindings/cynvml.pyx index 0066265458..857ef4f96a 100644 --- a/cuda_bindings/cuda/bindings/cynvml.pyx +++ b/cuda_bindings/cuda/bindings/cynvml.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from ._internal cimport nvml as _nvml diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in index 5e5eb68842..209b361c05 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport uint32_t, uint64_t diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in index c9e698ae01..9e64cbf9c1 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. 
+# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cimport cuda.bindings._bindings.cynvrtc as cynvrtc {{if 'nvrtcGetErrorString' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cynvvm.pxd b/cuda_bindings/cuda/bindings/cynvvm.pxd index 9b4348c841..d9d5baf229 100644 --- a/cuda_bindings/cuda/bindings/cynvvm.pxd +++ b/cuda_bindings/cuda/bindings/cynvvm.pxd @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. ############################################################################### diff --git a/cuda_bindings/cuda/bindings/cynvvm.pyx b/cuda_bindings/cuda/bindings/cynvvm.pyx index a779890ce2..ef20c3f122 100644 --- a/cuda_bindings/cuda/bindings/cynvvm.pyx +++ b/cuda_bindings/cuda/bindings/cynvvm.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
from ._internal cimport nvvm as _nvvm diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in index d2773d7d5f..85108b68a9 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport uint32_t, uint64_t diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in index d5c0e5b7ae..4084ed03ab 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cimport cuda.bindings._bindings.cyruntime as cyruntime cimport cython diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in index f52ddcdea0..8cbdb881a7 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cdef extern from "cuda_runtime_api.h": {{if 'cudaDeviceReset' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in index dd24aeb953..fb4a4aabaa 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cdef extern from "vector_types.h": diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in index 0d62335828..a5328c2f5e 100644 --- a/cuda_bindings/cuda/bindings/driver.pxd.in +++ b/cuda_bindings/cuda/bindings/driver.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
cimport cuda.bindings.cydriver as cydriver include "_lib/utils.pxd" @@ -1926,8 +1926,8 @@ cdef class CUlaunchAttributeValue_union: {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}} syncPolicy : CUsynchronizationPolicy Value of launch attribute - CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. - ::CUsynchronizationPolicy for work queued up in this stream + CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy + for work queued up in this stream {{endif}} {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}} clusterDim : anon_struct1 @@ -3794,8 +3794,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st: {{endif}} {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}} flags : unsigned int - Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal - a CUexternalSemaphore of type + Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a + CUexternalSemaphore of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates that while signaling the CUexternalSemaphore, no memory @@ -3937,8 +3937,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st: {{endif}} {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}} flags : unsigned int - Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on - a CUexternalSemaphore of type + Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a + CUexternalSemaphore of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates that while waiting for the CUexternalSemaphore, no memory @@ -6923,8 +6923,8 @@ cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union): {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}} syncPolicy : CUsynchronizationPolicy Value of launch attribute - CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. 
- ::CUsynchronizationPolicy for work queued up in this stream + CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy + for work queued up in this stream {{endif}} {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}} clusterDim : anon_struct1 @@ -7134,8 +7134,8 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue): {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}} syncPolicy : CUsynchronizationPolicy Value of launch attribute - CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. - ::CUsynchronizationPolicy for work queued up in this stream + CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy + for work queued up in this stream {{endif}} {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}} clusterDim : anon_struct1 @@ -7265,8 +7265,8 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1): {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}} syncPolicy : CUsynchronizationPolicy Value of launch attribute - CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. - ::CUsynchronizationPolicy for work queued up in this stream + CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy + for work queued up in this stream {{endif}} {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}} clusterDim : anon_struct1 @@ -7396,8 +7396,8 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue): {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}} syncPolicy : CUsynchronizationPolicy Value of launch attribute - CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. - ::CUsynchronizationPolicy for work queued up in this stream + CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. 
CUsynchronizationPolicy + for work queued up in this stream {{endif}} {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}} clusterDim : anon_struct1 @@ -7527,8 +7527,8 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1): {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}} syncPolicy : CUsynchronizationPolicy Value of launch attribute - CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. - ::CUsynchronizationPolicy for work queued up in this stream + CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy + for work queued up in this stream {{endif}} {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}} clusterDim : anon_struct1 @@ -9528,8 +9528,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1(CUDA_EXTERNAL_SEMAPHORE_SIGN {{endif}} {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}} flags : unsigned int - Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal - a CUexternalSemaphore of type + Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a + CUexternalSemaphore of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates that while signaling the CUexternalSemaphore, no memory @@ -9563,8 +9563,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS(CUDA_EXTERNAL_SEMAPHORE_SIGNAL_ {{endif}} {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}} flags : unsigned int - Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal - a CUexternalSemaphore of type + Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a + CUexternalSemaphore of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates that while signaling the CUexternalSemaphore, no memory @@ -9598,8 +9598,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1(CUDA_EXTERNAL_SEMAPHORE_WAIT_P {{endif}} {{if 
'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}} flags : unsigned int - Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on - a CUexternalSemaphore of type + Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a + CUexternalSemaphore of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates that while waiting for the CUexternalSemaphore, no memory @@ -9633,8 +9633,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS(CUDA_EXTERNAL_SEMAPHORE_WAIT_PARA {{endif}} {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}} flags : unsigned int - Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on - a CUexternalSemaphore of type + Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a + CUexternalSemaphore of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates that while waiting for the CUexternalSemaphore, no memory diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index 70f83f4c93..2c02494148 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from typing import Any, Optional import cython import ctypes @@ -850,7 +850,7 @@ class CUstreamBatchMemOpType(_FastEnum): CU_STREAM_MEM_OP_ATOMIC_REDUCTION = ( cydriver.CUstreamBatchMemOpType_enum.CU_STREAM_MEM_OP_ATOMIC_REDUCTION, 'Perform a atomic reduction. 
See\n' - ':py:obj:`~.CUstreamBatchMemOpParams`::atomicReduction\n' + ':py:obj:`~.CUstreamBatchMemOpParams.atomicReduction`\n' ){{endif}} {{endif}} @@ -4347,16 +4347,15 @@ class CUlaunchAttributeID(_FastEnum): cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE, 'Valid for graph nodes, launches. This attribute is graphs-only, and passing\n' 'it to a launch in a non-capturing stream will result in an error.\n' - ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable\n' + ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable`\n' 'can only be set to 0 or 1. Setting the field to 1 indicates that the\n' 'corresponding kernel node should be device-updatable. On success, a handle\n' 'will be returned via\n' - ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n' - 'which can be passed to the various device-side update functions to update\n' - "the node's kernel parameters from within another kernel. For more\n" - 'information on the types of device updates that can be made, as well as the\n' - 'relevant limitations thereof, see\n' - ':py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n' + ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which\n' + 'can be passed to the various device-side update functions to update the\n' + "node's kernel parameters from within another kernel. For more information\n" + 'on the types of device updates that can be made, as well as the relevant\n' + 'limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n' ' Nodes which are device-updatable have additional restrictions compared to\n' 'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n' 'from their graph via :py:obj:`~.cuGraphDestroyNode`. 
Additionally, once\n' @@ -4406,7 +4405,7 @@ class CUlaunchAttributeID(_FastEnum): 'not improve the performance of either the targeted kernel or the\n' 'encapsulating application.\n' ' Valid values for\n' - ':py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n' + ':py:obj:`~.CUlaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n' '(disabled) and 1 (enabled).\n' ){{endif}} {{if 'CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE' in found_values}} @@ -8306,16 +8305,15 @@ class CUkernelNodeAttrID(_FastEnum): cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE, 'Valid for graph nodes, launches. This attribute is graphs-only, and passing\n' 'it to a launch in a non-capturing stream will result in an error.\n' - ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable\n' + ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable`\n' 'can only be set to 0 or 1. Setting the field to 1 indicates that the\n' 'corresponding kernel node should be device-updatable. On success, a handle\n' 'will be returned via\n' - ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n' - 'which can be passed to the various device-side update functions to update\n' - "the node's kernel parameters from within another kernel. For more\n" - 'information on the types of device updates that can be made, as well as the\n' - 'relevant limitations thereof, see\n' - ':py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n' + ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which\n' + 'can be passed to the various device-side update functions to update the\n' + "node's kernel parameters from within another kernel. 
For more information\n" + 'on the types of device updates that can be made, as well as the relevant\n' + 'limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n' ' Nodes which are device-updatable have additional restrictions compared to\n' 'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n' 'from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once\n' @@ -8365,7 +8363,7 @@ class CUkernelNodeAttrID(_FastEnum): 'not improve the performance of either the targeted kernel or the\n' 'encapsulating application.\n' ' Valid values for\n' - ':py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n' + ':py:obj:`~.CUlaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n' '(disabled) and 1 (enabled).\n' ){{endif}} {{if 'CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE' in found_values}} @@ -8555,16 +8553,15 @@ class CUstreamAttrID(_FastEnum): cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE, 'Valid for graph nodes, launches. This attribute is graphs-only, and passing\n' 'it to a launch in a non-capturing stream will result in an error.\n' - ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable\n' + ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable`\n' 'can only be set to 0 or 1. Setting the field to 1 indicates that the\n' 'corresponding kernel node should be device-updatable. On success, a handle\n' 'will be returned via\n' - ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n' - 'which can be passed to the various device-side update functions to update\n' - "the node's kernel parameters from within another kernel. 
For more\n" - 'information on the types of device updates that can be made, as well as the\n' - 'relevant limitations thereof, see\n' - ':py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n' + ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which\n' + 'can be passed to the various device-side update functions to update the\n' + "node's kernel parameters from within another kernel. For more information\n" + 'on the types of device updates that can be made, as well as the relevant\n' + 'limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n' ' Nodes which are device-updatable have additional restrictions compared to\n' 'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n' 'from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once\n' @@ -8614,7 +8611,7 @@ class CUstreamAttrID(_FastEnum): 'not improve the performance of either the targeted kernel or the\n' 'encapsulating application.\n' ' Valid values for\n' - ':py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n' + ':py:obj:`~.CUlaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n' '(disabled) and 1 (enabled).\n' ){{endif}} {{if 'CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE' in found_values}} @@ -14079,8 +14076,8 @@ cdef class CUlaunchAttributeValue_union: {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}} syncPolicy : CUsynchronizationPolicy Value of launch attribute - CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. - ::CUsynchronizationPolicy for work queued up in this stream + CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. 
CUsynchronizationPolicy + for work queued up in this stream {{endif}} {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}} clusterDim : anon_struct1 @@ -19987,8 +19984,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st: {{endif}} {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}} flags : unsigned int - Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal - a CUexternalSemaphore of type + Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a + CUexternalSemaphore of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates that while signaling the CUexternalSemaphore, no memory @@ -20367,8 +20364,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st: {{endif}} {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}} flags : unsigned int - Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on - a CUexternalSemaphore of type + Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a + CUexternalSemaphore of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates that while waiting for the CUexternalSemaphore, no memory @@ -33878,16 +33875,16 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[CUDA_MEMCPY3D_BA the CUDA array. For CUDA array to CUDA array copies, the element size of the two CUDA arrays must match. - For a given operand, if :py:obj:`~.CUmemcpy3DOperand`::type is - specified as :py:obj:`~.CU_MEMCPY_OPERAND_TYPE_POINTER`, then - :py:obj:`~.CUmemcpy3DOperand`::op::ptr will be used. The - :py:obj:`~.CUmemcpy3DOperand`::op::ptr::ptr field must contain the - pointer where the copy should begin. 
The - :py:obj:`~.CUmemcpy3DOperand`::op::ptr::rowLength field specifies the + For a given operand, if :py:obj:`~.CUmemcpy3DOperand.type` is specified + as :py:obj:`~.CU_MEMCPY_OPERAND_TYPE_POINTER`, then + :py:obj:`~.CUmemcpy3DOperand.op.ptr` will be used. The + :py:obj:`~.CUmemcpy3DOperand.op.ptr.ptr` field must contain the pointer + where the copy should begin. The + :py:obj:`~.CUmemcpy3DOperand.op.ptr.rowLength` field specifies the length of each row in elements and must either be zero or be greater than or equal to the width of the copy specified in :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP`::extent::width. The - :py:obj:`~.CUmemcpy3DOperand`::op::ptr::layerHeight field specifies the + :py:obj:`~.CUmemcpy3DOperand.op.ptr.layerHeight` field specifies the height of each layer and must either be zero or be greater than or equal to the height of the copy specified in :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP`::extent::height. When either of @@ -33897,15 +33894,15 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[CUDA_MEMCPY3D_BA :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` is true or system-allocated pageable memory on devices where :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS` is true, the - :py:obj:`~.CUmemcpy3DOperand`::op::ptr::locHint field can be used to - hint the location of the operand. + :py:obj:`~.CUmemcpy3DOperand.op.ptr.locHint` field can be used to hint + the location of the operand. If an operand's type is specified as :py:obj:`~.CU_MEMCPY_OPERAND_TYPE_ARRAY`, then - :py:obj:`~.CUmemcpy3DOperand`::op::array will be used. The - :py:obj:`~.CUmemcpy3DOperand`::op::array::array field specifies the - CUDA array and :py:obj:`~.CUmemcpy3DOperand`::op::array::offset - specifies the 3D offset into that array where the copy begins. + :py:obj:`~.CUmemcpy3DOperand.op.array` will be used. 
The + :py:obj:`~.CUmemcpy3DOperand.op.array.array` field specifies the CUDA + array and :py:obj:`~.CUmemcpy3DOperand.op.array.offset` specifies the + 3D offset into that array where the copy begins. The :py:obj:`~.CUmemcpyAttributes.srcAccessOrder` indicates the source access ordering to be observed for copies associated with the @@ -35786,7 +35783,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long :py:obj:`~.cuMemGetAllocationGranularity` with the :py:obj:`~.CU_MEM_ALLOC_GRANULARITY_MINIMUM` flag. To create a CPU allocation that doesn't target any specific NUMA nodes, applications - must set :py:obj:`~.CUmemAllocationProp`::CUmemLocation::type to + must set :py:obj:`~.CUmemAllocationProp.CUmemLocation.type` to :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`. :py:obj:`~.CUmemAllocationProp`::CUmemLocation::id is ignored for HOST allocations. HOST allocations are not IPC capable and @@ -35794,7 +35791,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long other value will result in :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. To create a CPU allocation targeting a specific host NUMA node, applications must set - :py:obj:`~.CUmemAllocationProp`::CUmemLocation::type to + :py:obj:`~.CUmemAllocationProp.CUmemLocation.type` to :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` and :py:obj:`~.CUmemAllocationProp`::CUmemLocation::id must specify the NUMA ID of the CPU. On systems where NUMA is not available @@ -35823,7 +35820,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long /proc/devices users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c 0` - If :py:obj:`~.CUmemAllocationProp`::allocFlags::usage contains + If :py:obj:`~.CUmemAllocationProp.allocFlags.usage` contains :py:obj:`~.CU_MEM_CREATE_USAGE_TILE_POOL` flag then the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays and sparse CUDA mipmapped arrays. 
(see @@ -36002,17 +35999,17 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr where :py:obj:`~.CUarrayMapInfo.resourceType` specifies the type of resource to be operated on. If :py:obj:`~.CUarrayMapInfo.resourceType` is set to :py:obj:`~.CUresourcetype`::CU_RESOURCE_TYPE_ARRAY then - :py:obj:`~.CUarrayMapInfo`::resource::array must be set to a valid - sparse CUDA array handle. The CUDA array must be either a 2D, 2D - layered or 3D CUDA array and must have been allocated using - :py:obj:`~.cuArrayCreate` or :py:obj:`~.cuArray3DCreate` with the flag + :py:obj:`~.CUarrayMapInfo.resource.array` must be set to a valid sparse + CUDA array handle. The CUDA array must be either a 2D, 2D layered or 3D + CUDA array and must have been allocated using :py:obj:`~.cuArrayCreate` + or :py:obj:`~.cuArray3DCreate` with the flag :py:obj:`~.CUDA_ARRAY3D_SPARSE` or :py:obj:`~.CUDA_ARRAY3D_DEFERRED_MAPPING`. For CUDA arrays obtained using :py:obj:`~.cuMipmappedArrayGetLevel`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned. If :py:obj:`~.CUarrayMapInfo.resourceType` is set to :py:obj:`~.CUresourcetype`::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY then - :py:obj:`~.CUarrayMapInfo`::resource::mipmap must be set to a valid + :py:obj:`~.CUarrayMapInfo.resource.mipmap` must be set to a valid sparse CUDA mipmapped array handle. The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been allocated using :py:obj:`~.cuMipmappedArrayCreate` with the flag @@ -36036,26 +36033,25 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr If :py:obj:`~.CUarrayMapInfo.subresourceType` is set to :py:obj:`~.CUarraySparseSubresourceType`::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL - then :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel struct must + then :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel` struct must contain valid array subregion offsets and extents. 
The - :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::offsetX, - :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::offsetY and - :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::offsetZ must - specify valid X, Y and Z offsets respectively. The - :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::extentWidth, - :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::extentHeight and - :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::extentDepth must + :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetX`, + :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetY` and + :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetZ` must specify + valid X, Y and Z offsets respectively. The + :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentWidth`, + :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentHeight` and + :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentDepth` must specify valid width, height and depth extents respectively. These offsets and extents must be aligned to the corresponding tile dimension. For CUDA mipmapped arrays - :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::level must - specify a valid mip level index. Otherwise, must be zero. For layered - CUDA arrays and layered CUDA mipmapped arrays - :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::layer must - specify a valid layer index. Otherwise, must be zero. - :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::offsetZ must be - zero and - :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::extentDepth must + :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.level` must specify a + valid mip level index. Otherwise, must be zero. For layered CUDA arrays + and layered CUDA mipmapped arrays + :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.layer` must specify a + valid layer index. Otherwise, must be zero. 
+ :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetZ` must be zero + and :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentDepth` must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays. Tile extents can be obtained by calling :py:obj:`~.cuArrayGetSparseProperties` and @@ -36063,23 +36059,23 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr If :py:obj:`~.CUarrayMapInfo.subresourceType` is set to :py:obj:`~.CUarraySparseSubresourceType`::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL - then :py:obj:`~.CUarrayMapInfo`::subresource::miptail struct must - contain valid mip tail offset in - :py:obj:`~.CUarrayMapInfo`::subresource::miptail::offset and size in - :py:obj:`~.CUarrayMapInfo`::subresource::miptail::size. Both, mip tail + then :py:obj:`~.CUarrayMapInfo.subresource.miptail` struct must contain + valid mip tail offset in + :py:obj:`~.CUarrayMapInfo.subresource.miptail.offset` and size in + :py:obj:`~.CUarrayMapInfo.subresource.miptail.size`. Both, mip tail offset and mip tail size must be aligned to the tile size. For layered CUDA mipmapped arrays which don't have the flag :py:obj:`~.CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL` set in :py:obj:`~.CUDA_ARRAY_SPARSE_PROPERTIES.flags` as returned by :py:obj:`~.cuMipmappedArrayGetSparseProperties`, - :py:obj:`~.CUarrayMapInfo`::subresource::miptail::layer must specify a + :py:obj:`~.CUarrayMapInfo.subresource.miptail.layer` must specify a valid layer index. Otherwise, must be zero. - If :py:obj:`~.CUarrayMapInfo`::resource::array or - :py:obj:`~.CUarrayMapInfo`::resource::mipmap was created with + If :py:obj:`~.CUarrayMapInfo.resource.array` or + :py:obj:`~.CUarrayMapInfo.resource.mipmap` was created with :py:obj:`~.CUDA_ARRAY3D_DEFERRED_MAPPING` flag set the :py:obj:`~.CUarrayMapInfo.subresourceType` and the contents of - :py:obj:`~.CUarrayMapInfo`::subresource will be ignored. + :py:obj:`~.CUarrayMapInfo.subresource` will be ignored. 
:py:obj:`~.CUarrayMapInfo.memOperationType` specifies the type of operation. :py:obj:`~.CUmemOperationType` is defined as: @@ -36089,7 +36085,7 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr If :py:obj:`~.CUarrayMapInfo.memOperationType` is set to :py:obj:`~.CUmemOperationType`::CU_MEM_OPERATION_TYPE_MAP then the subresource will be mapped onto the tile pool memory specified by - :py:obj:`~.CUarrayMapInfo`::memHandle at offset + :py:obj:`~.CUarrayMapInfo.memHandle` at offset :py:obj:`~.CUarrayMapInfo.offset`. The tile pool allocation has to be created by specifying the :py:obj:`~.CU_MEM_CREATE_USAGE_TILE_POOL` flag when calling :py:obj:`~.cuMemCreate`. Also, @@ -36098,7 +36094,7 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr If :py:obj:`~.CUarrayMapInfo.memOperationType` is set to :py:obj:`~.CUmemOperationType`::CU_MEM_OPERATION_TYPE_UNMAP then an - unmapping operation is performed. :py:obj:`~.CUarrayMapInfo`::memHandle + unmapping operation is performed. :py:obj:`~.CUarrayMapInfo.memHandle` must be NULL. :py:obj:`~.CUarrayMapInfo.deviceBitMask` specifies the list of devices @@ -36108,7 +36104,7 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr :py:obj:`~.CUarrayMapInfo.memOperationType` is set to :py:obj:`~.CUmemOperationType`::CU_MEM_OPERATION_TYPE_MAP, the device must also match the device associated with the tile pool memory - allocation as specified by :py:obj:`~.CUarrayMapInfo`::memHandle. + allocation as specified by :py:obj:`~.CUarrayMapInfo.memHandle`. 
:py:obj:`~.CUarrayMapInfo.flags` and :py:obj:`~.CUarrayMapInfo.reserved`[] are unused and must be set to @@ -38374,7 +38370,7 @@ def cuMemPrefetchAsync(devPtr, size_t count, location not None : CUmemLocation, Specifying :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` for :py:obj:`~.CUmemLocation.type` will prefetch memory to GPU specified by - device ordinal :py:obj:`~.CUmemLocation`::id which must have non-zero + device ordinal :py:obj:`~.CUmemLocation.id` which must have non-zero value for the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`. Additionally, `hStream` must be associated with a device that has a @@ -38385,14 +38381,14 @@ def cuMemPrefetchAsync(devPtr, size_t count, location not None : CUmemLocation, memory to a specific host NUMA node by specifying :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` for :py:obj:`~.CUmemLocation.type` and a valid host NUMA node id in - :py:obj:`~.CUmemLocation`::id Users can also request prefetching memory + :py:obj:`~.CUmemLocation.id` Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT` for :py:obj:`~.CUmemLocation.type`. Note when :py:obj:`~.CUmemLocation.type` is etiher :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` OR :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT`, - :py:obj:`~.CUmemLocation`::id will be ignored. + :py:obj:`~.CUmemLocation.id` will be ignored. The start address and end address of the memory range will be rounded down and rounded up respectively to be aligned to CPU page size before @@ -38545,19 +38541,19 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n - :py:obj:`~.CU_MEM_ADVISE_SET_PREFERRED_LOCATION`: This advice sets the preferred location for the data to be the memory belonging to `location`. 
When :py:obj:`~.CUmemLocation.type` is - :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`, :py:obj:`~.CUmemLocation`::id + :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`, :py:obj:`~.CUmemLocation.id` is ignored and the preferred location is set to be host memory. To set the preferred location to a specific host NUMA node, applications must set :py:obj:`~.CUmemLocation.type` to :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` and - :py:obj:`~.CUmemLocation`::id must specify the NUMA ID of the host + :py:obj:`~.CUmemLocation.id` must specify the NUMA ID of the host NUMA node. If :py:obj:`~.CUmemLocation.type` is set to :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT`, - :py:obj:`~.CUmemLocation`::id will be ignored and the the host NUMA + :py:obj:`~.CUmemLocation.id` will be ignored and the the host NUMA node closest to the calling thread's CPU will be used as the preferred location. If :py:obj:`~.CUmemLocation.type` is a :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`, then - :py:obj:`~.CUmemLocation`::id must be a valid device ordinal and the + :py:obj:`~.CUmemLocation.id` must be a valid device ordinal and the device must have a non-zero value for the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`. Setting the preferred location does not cause data to migrate to that @@ -38584,7 +38580,7 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY`. If the memory region refers to valid system-allocated pageable memory, and :py:obj:`~.CUmemLocation.type` is CU_MEM_LOCATION_TYPE_DEVICE then - :py:obj:`~.CUmemLocation`::id must be a valid device that has a non- + :py:obj:`~.CUmemLocation.id` must be a valid device that has a non- zero alue for the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. @@ -38597,11 +38593,11 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n the data will be accessed by processor `location`. 
The :py:obj:`~.CUmemLocation.type` must be either :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` with - :py:obj:`~.CUmemLocation`::id representing a valid device ordinal or + :py:obj:`~.CUmemLocation.id` representing a valid device ordinal or :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` and - :py:obj:`~.CUmemLocation`::id will be ignored. All other location - types are invalid. If :py:obj:`~.CUmemLocation`::id is a GPU, then - the device attribute + :py:obj:`~.CUmemLocation.id` will be ignored. All other location + types are invalid. If :py:obj:`~.CUmemLocation.id` is a GPU, then the + device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` must be non-zero. This advice does not cause data migration and has no impact on the location of the data per se. Instead, it causes the data to @@ -38630,10 +38626,10 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n policies of this advice. If the memory region refers to valid system- allocated pageable memory, and :py:obj:`~.CUmemLocation.type` is :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` then device in - :py:obj:`~.CUmemLocation`::id must have a non-zero value for the + :py:obj:`~.CUmemLocation.id` must have a non-zero value for the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. Additionally, - if :py:obj:`~.CUmemLocation`::id has a non-zero value for the device + if :py:obj:`~.CUmemLocation.id` has a non-zero value for the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES`, then this call has no effect. @@ -38644,10 +38640,10 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n in non-fatal page faults. 
If the memory region refers to valid system-allocated pageable memory, and :py:obj:`~.CUmemLocation.type` is :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` then device in - :py:obj:`~.CUmemLocation`::id must have a non-zero value for the + :py:obj:`~.CUmemLocation.id` must have a non-zero value for the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. Additionally, - if :py:obj:`~.CUmemLocation`::id has a non-zero value for the device + if :py:obj:`~.CUmemLocation.id` has a non-zero value for the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES`, then this call has no effect. @@ -41463,89 +41459,84 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_ If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD`, then - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::fd must be a - valid file descriptor referencing a memory object. Ownership of the - file descriptor is transferred to the CUDA driver when the handle is + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.fd` must be a valid + file descriptor referencing a memory object. Ownership of the file + descriptor is transferred to the CUDA driver when the handle is imported successfully. Performing any operations on the file descriptor after it is imported results in undefined behavior. If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32`, then exactly - one of - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle and - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name must + one of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` + and :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must not be NULL. 
If - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle is - not NULL, then it must represent a valid shared NT handle that - references a memory object. Ownership of this handle is not transferred - to CUDA after the import operation, so the application must release the - handle using the appropriate system call. If - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name is - not NULL, then it must point to a NULL-terminated array of UTF-16 + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` is not + NULL, then it must represent a valid shared NT handle that references a + memory object. Ownership of this handle is not transferred to CUDA + after the import operation, so the application must release the handle + using the appropriate system call. If + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not + NULL, then it must point to a NULL-terminated array of UTF-16 characters that refers to a memory object. If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT`, then - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle - must be non-NULL and - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name must - be NULL. The handle specified must be a globally shared KMT handle. - This handle does not hold a reference to the underlying object, and - thus will be invalid when all references to the memory object are - destroyed. + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` must + be non-NULL and + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must be + NULL. The handle specified must be a globally shared KMT handle. This + handle does not hold a reference to the underlying object, and thus + will be invalid when all references to the memory object are destroyed. 
If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP`, then exactly one - of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle - and :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name - must not be NULL. If - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle is - not NULL, then it must represent a valid shared NT handle that is - returned by ID3D12Device::CreateSharedHandle when referring to a - ID3D12Heap object. This handle holds a reference to the underlying - object. If - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name is - not NULL, then it must point to a NULL-terminated array of UTF-16 + of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` and + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must not + be NULL. If + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` is not + NULL, then it must represent a valid shared NT handle that is returned + by ID3D12Device::CreateSharedHandle when referring to a ID3D12Heap + object. This handle holds a reference to the underlying object. If + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not + NULL, then it must point to a NULL-terminated array of UTF-16 characters that refers to a ID3D12Heap object. If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE`, then exactly - one of - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle and - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name must + one of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` + and :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must not be NULL. 
If - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle is - not NULL, then it must represent a valid shared NT handle that is - returned by ID3D12Device::CreateSharedHandle when referring to a - ID3D12Resource object. This handle holds a reference to the underlying - object. If - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name is - not NULL, then it must point to a NULL-terminated array of UTF-16 + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` is not + NULL, then it must represent a valid shared NT handle that is returned + by ID3D12Device::CreateSharedHandle when referring to a ID3D12Resource + object. This handle holds a reference to the underlying object. If + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not + NULL, then it must point to a NULL-terminated array of UTF-16 characters that refers to a ID3D12Resource object. If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE`, then - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle - must represent a valid shared NT handle that is returned by + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` must + represent a valid shared NT handle that is returned by IDXGIResource1::CreateSharedHandle when referring to a ID3D11Resource object. If - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name is - not NULL, then it must point to a NULL-terminated array of UTF-16 + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not + NULL, then it must point to a NULL-terminated array of UTF-16 characters that refers to a ID3D11Resource object. 
If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT`, then - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle - must represent a valid shared KMT handle that is returned by + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` must + represent a valid shared KMT handle that is returned by IDXGIResource::GetSharedHandle when referring to a ID3D11Resource object and - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name must - be NULL. + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must be + NULL. If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, then - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::nvSciBufObject - must be non-NULL and reference a valid NvSciBuf object. If the NvSciBuf + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.nvSciBufObject` must + be non-NULL and reference a valid NvSciBuf object. If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the application must use :py:obj:`~.cuWaitExternalSemaphoresAsync` or :py:obj:`~.cuSignalExternalSemaphoresAsync` as appropriate barriers to @@ -41556,8 +41547,8 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_ If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_DMABUF_FD`, then - :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::fd must be a - valid file descriptor referencing a dma_buf object and + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.fd` must be a valid + file descriptor referencing a dma_buf object and :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.flags` must be zero. Importing a dma_buf object is supported only on Tegra Jetson platform starting with Thor series. 
Mapping an imported dma_buf object as CUDA @@ -41813,7 +41804,7 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD`, then - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::fd must be a + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.fd` must be a valid file descriptor referencing a synchronization object. Ownership of the file descriptor is transferred to the CUDA driver when the handle is imported successfully. Performing any operations on the file @@ -41822,98 +41813,95 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32`, then exactly one of - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle - and - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name - must not be NULL. If - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle - is not NULL, then it must represent a valid shared NT handle that + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` and + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must + not be NULL. If + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` is + not NULL, then it must represent a valid shared NT handle that references a synchronization object. Ownership of this handle is not transferred to CUDA after the import operation, so the application must release the handle using the appropriate system call. If - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name is + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is not NULL, then it must name a valid synchronization object. 
If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT`, then - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` must be non-NULL and - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name - must be NULL. The handle specified must be a globally shared KMT - handle. This handle does not hold a reference to the underlying object, - and thus will be invalid when all references to the synchronization - object are destroyed. + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must + be NULL. The handle specified must be a globally shared KMT handle. + This handle does not hold a reference to the underlying object, and + thus will be invalid when all references to the synchronization object + are destroyed. If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE`, then exactly one of - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle - and - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name - must not be NULL. If - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle - is not NULL, then it must represent a valid shared NT handle that is + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` and + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must + not be NULL. If + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` is + not NULL, then it must represent a valid shared NT handle that is returned by ID3D12Device::CreateSharedHandle when referring to a ID3D12Fence object. This handle holds a reference to the underlying object. 
If - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name is + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is not NULL, then it must name a valid synchronization object that refers to a valid ID3D12Fence object. If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE`, then - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` represents a valid shared NT handle that is returned by ID3D11Fence::CreateSharedHandle. If - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name is + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is not NULL, then it must name a valid synchronization object that refers to a valid ID3D11Fence object. If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, then - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::nvSciSyncObj + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.nvSciSyncObj` represents a valid NvSciSyncObj. :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX`, then - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` represents a valid shared NT handle that is returned by IDXGIResource1::CreateSharedHandle when referring to a IDXGIKeyedMutex object. If - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name is + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is not NULL, then it must name a valid synchronization object that refers to a valid IDXGIKeyedMutex object. 
If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT`, then - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` represents a valid shared KMT handle that is returned by IDXGIResource::GetSharedHandle when referring to a IDXGIKeyedMutex object and - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name - must be NULL. + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must + be NULL. If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD`, - then :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::fd must - be a valid file descriptor referencing a synchronization object. - Ownership of the file descriptor is transferred to the CUDA driver when - the handle is imported successfully. Performing any operations on the - file descriptor after it is imported results in undefined behavior. + then :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.fd` must be + a valid file descriptor referencing a synchronization object. Ownership + of the file descriptor is transferred to the CUDA driver when the + handle is imported successfully. Performing any operations on the file + descriptor after it is imported results in undefined behavior. If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32`, then exactly one of - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle - and - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name - must not be NULL. 
If - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle - is not NULL, then it must represent a valid shared NT handle that + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` and + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must + not be NULL. If + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` is + not NULL, then it must represent a valid shared NT handle that references a synchronization object. Ownership of this handle is not transferred to CUDA after the import operation, so the application must release the handle using the appropriate system call. If - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name is + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is not NULL, then it must name a valid synchronization object. Parameters @@ -41966,15 +41954,15 @@ def cuSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemap :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD`, :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32` then the semaphore will be set to the value specified in - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`::params::fence::value. + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.fence.value`. If the semaphore object is of the type :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC` this API sets - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`::params::nvSciSync::fence + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence` to a value that can be used by subsequent waiters of the same NvSciSync object to order operations with those currently submitted in `stream`. Such an update will overwrite previous contents of - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`::params::nvSciSync::fence. + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`. 
By default, signaling such an external semaphore object causes appropriate memory synchronization operations to be performed over all external memory objects that are imported as @@ -42123,12 +42111,12 @@ def cuWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemapho :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32` then waiting on the semaphore will wait until the value of the semaphore is greater than or equal to - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`::params::fence::value. + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.fence.value`. If the semaphore object is of the type :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC` then, waiting on the semaphore will wait until the - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`::params::nvSciSync::fence + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence` is signaled by the signaler of the NvSciSyncObj that was associated with this semaphore object. By default, waiting on such an external semaphore object causes appropriate memory synchronization operations @@ -42152,9 +42140,9 @@ def cuWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemapho :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT` then the keyed mutex will be acquired when it is released with the key specified in - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`::params::keyedmutex::key + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.keyedmutex.key` or until the timeout specified by - :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`::params::keyedmutex::timeoutMs + :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.keyedmutex.timeoutMs` has lapsed. The timeout interval can either be a finite value specified in milliseconds or an infinite value. In case an infinite value is specified the timeout never elapses. 
The windows INFINITE macro must be @@ -43424,7 +43412,7 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt other than 0 or 1 is not allowed. On success, a handle will be returned via - :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode + :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as @@ -44433,7 +44421,7 @@ def cuLaunchGridAsync(f, int grid_width, int grid_height, hStream): Notes ----- - In certain cases where cubins are created with no ABI (i.e., using `ptxas` `None` `no`), this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by growing the per-thread stack as needed per launch and not shrinking it afterwards. + In certain cases where cubins are created with no ABI (i.e., using `ptxas` `--abi-compile` `no`), this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by growing the per-thread stack as needed per launch and not shrinking it afterwards. """ cdef cydriver.CUstream cyhStream if hStream is None: @@ -51772,23 +51760,23 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to :py:obj:`~.CU_RESOURCE_TYPE_ARRAY`, - :py:obj:`~.CUDA_RESOURCE_DESC`::res::array::hArray must be set to a - valid CUDA array handle. + :py:obj:`~.CUDA_RESOURCE_DESC.res.array.hArray` must be set to a valid + CUDA array handle. If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to :py:obj:`~.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY`, - :py:obj:`~.CUDA_RESOURCE_DESC`::res::mipmap::hMipmappedArray must be - set to a valid CUDA mipmapped array handle. + :py:obj:`~.CUDA_RESOURCE_DESC.res.mipmap.hMipmappedArray` must be set + to a valid CUDA mipmapped array handle. 
If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to :py:obj:`~.CU_RESOURCE_TYPE_LINEAR`, - :py:obj:`~.CUDA_RESOURCE_DESC`::res::linear::devPtr must be set to a - valid device pointer, that is aligned to + :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.devPtr` must be set to a valid + device pointer, that is aligned to :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT`. - :py:obj:`~.CUDA_RESOURCE_DESC`::res::linear::format and - :py:obj:`~.CUDA_RESOURCE_DESC`::res::linear::numChannels describe the + :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.format` and + :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.numChannels` describe the format of each component and the number of components per array - element. :py:obj:`~.CUDA_RESOURCE_DESC`::res::linear::sizeInBytes + element. :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.sizeInBytes` specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH`. The @@ -51797,20 +51785,19 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to :py:obj:`~.CU_RESOURCE_TYPE_PITCH2D`, - :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::devPtr must be set to a + :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.devPtr` must be set to a valid device pointer, that is aligned to :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT`. - :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::format and - :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::numChannels describe the + :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.format` and + :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.numChannels` describe the format of each component and the number of components per array - element. :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::width and - :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::height specify the width - and height of the array in elements, and cannot exceed + element. 
:py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.width` and + :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.height` specify the width and + height of the array in elements, and cannot exceed :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH` and :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT` - respectively. - :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::pitchInBytes specifies - the pitch between two rows in bytes and has to be aligned to + respectively. :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.pitchInBytes` + specifies the pitch between two rows in bytes and has to be aligned to :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT`. Pitch cannot exceed :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH`. @@ -52149,9 +52136,9 @@ def cuSurfObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC]): describes the data to perform surface load/stores on. :py:obj:`~.CUDA_RESOURCE_DESC.resType` must be :py:obj:`~.CU_RESOURCE_TYPE_ARRAY` and - :py:obj:`~.CUDA_RESOURCE_DESC`::res::array::hArray must be set to a - valid CUDA array handle. :py:obj:`~.CUDA_RESOURCE_DESC.flags` must be - set to zero. + :py:obj:`~.CUDA_RESOURCE_DESC.res.array.hArray` must be set to a valid + CUDA array handle. :py:obj:`~.CUDA_RESOURCE_DESC.flags` must be set to + zero. Surface objects are only supported on devices of compute capability 3.0 or higher. Additionally, a surface object is an opaque value, and, as @@ -55185,8 +55172,8 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe CUresult :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION` result : list[:py:obj:`~.CUdevResource`] - Output array of `None` resources. Can be NULL to query the number - of groups. + Output array of `CUdevResource` resources. 
Can be NULL to query the + number of groups. nbGroups : unsigned int This is a pointer, specifying the number of groups that would be or should be created as described below. @@ -55258,8 +55245,8 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource] For a valid call: - - `result` should point to a `None` array of size `nbGroups`, or - alternatively, may be NULL, if the developer wishes for only the + - `result` should point to a `CUdevResource` array of size `nbGroups`, + or alternatively, may be NULL, if the developer wishes for only the groupParams entries to be updated - `input` should be a valid :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM` @@ -55350,8 +55337,8 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource] CUresult :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION` result : list[:py:obj:`~.CUdevResource`] - Output array of `None` resources. Can be NULL, alongside an smCount - of 0, for discovery purpose. + Output array of `CUdevResource` resources. Can be NULL, alongside + an smCount of 0, for discovery purpose. remainder : :py:obj:`~.CUdevResource` If splitting the input resource leaves any SMs, the remainder is placed in here. diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pxd b/cuda_bindings/cuda/bindings/nvfatbin.pxd index b117da600c..b9836e831e 100644 --- a/cuda_bindings/cuda/bindings/nvfatbin.pxd +++ b/cuda_bindings/cuda/bindings/nvfatbin.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly. 
+# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t, uint32_t diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx index d11f737874..6e02502dbb 100644 --- a/cuda_bindings/cuda/bindings/nvfatbin.pyx +++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly. +# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cimport cython # NOQA diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd index c1705420b2..7da795a45f 100644 --- a/cuda_bindings/cuda/bindings/nvjitlink.pxd +++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. 
from libc.stdint cimport intptr_t, uint32_t diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx index aa8ce232d4..33482377b4 100644 --- a/cuda_bindings/cuda/bindings/nvjitlink.pyx +++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cimport cython # NOQA diff --git a/cuda_bindings/cuda/bindings/nvml.pxd b/cuda_bindings/cuda/bindings/nvml.pxd index f75b26edeb..48f7751e50 100644 --- a/cuda_bindings/cuda/bindings/nvml.pxd +++ b/cuda_bindings/cuda/bindings/nvml.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/nvml.pyx b/cuda_bindings/cuda/bindings/nvml.pyx index 95c408e7f6..916941c746 100644 --- a/cuda_bindings/cuda/bindings/nvml.pyx +++ b/cuda_bindings/cuda/bindings/nvml.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. 
Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cimport cython # NOQA @@ -528,7 +528,7 @@ class Return(_FastEnum): See `nvmlReturn_t`. """ SUCCESS = (NVML_SUCCESS, 'The operation was successful.') - ERROR_UNINITIALIZED = (NVML_ERROR_UNINITIALIZED, 'NVML was not first initialized with nvmlInit()') + ERROR_UNINITIALIZED = (NVML_ERROR_UNINITIALIZED, 'NVML was not first initialized with `nvmlInit()`') ERROR_INVALID_ARGUMENT = (NVML_ERROR_INVALID_ARGUMENT, 'A supplied argument is invalid.') ERROR_NOT_SUPPORTED = (NVML_ERROR_NOT_SUPPORTED, 'The requested operation is not available on target device.') ERROR_NO_PERMISSION = (NVML_ERROR_NO_PERMISSION, 'The current user does not have permission for operation.') @@ -759,7 +759,7 @@ class FBCSessionType(_FastEnum): class DetachGpuState(_FastEnum): """ Is the GPU device to be removed from the kernel by - nvmlDeviceRemoveGpu() + `nvmlDeviceRemoveGpu()` See `nvmlDetachGpuState_t`. """ @@ -768,7 +768,7 @@ class DetachGpuState(_FastEnum): class PcieLinkState(_FastEnum): """ - Parent bridge PCIe link state requested by nvmlDeviceRemoveGpu() + Parent bridge PCIe link state requested by `nvmlDeviceRemoveGpu()` See `nvmlPcieLinkState_t`. """ @@ -19542,7 +19542,7 @@ cdef class PRMCounter_v1: @property def counter_id(self): - """Union[~_numpy.uint32, int]: Counter ID, one of nvmlPRMCounterId_t.""" + """Union[~_numpy.uint32, int]: Counter ID, one of `nvmlPRMCounterId_t`.""" if self._data.size == 1: return int(self._data.counter_id[0]) return self._data.counter_id @@ -20868,7 +20868,7 @@ cpdef init_v2(): cpdef init_with_flags(unsigned int flags): - """nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values modifying the behaviour of nvmlInit(). Other than the "flags" parameter it is completely similar to ``nvmlInit_v2``. 
+ """nvmlInitWithFlags is a variant of ``nvmlInit()``, that allows passing a set of boolean values modifying the behaviour of ``nvmlInit()``. Other than the "flags" parameter it is completely similar to ``nvmlInit_v2``. Args: flags (unsigned int): behaviour modifier flags. @@ -24411,7 +24411,7 @@ cpdef unsigned int device_get_vgpu_capabilities(intptr_t device, int capability) cpdef str vgpu_type_get_class(unsigned int vgpu_type_id): - """Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). See ``nvmlConstants.NVML_DEVICE_NAME_BUFFER_SIZE``. + """Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). See nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. Args: vgpu_type_id (unsigned int): Handle to vGPU type. diff --git a/cuda_bindings/cuda/bindings/nvrtc.pxd.in b/cuda_bindings/cuda/bindings/nvrtc.pxd.in index f6cd88a3c9..394bcccff0 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cimport cuda.bindings.cynvrtc as cynvrtc include "_lib/utils.pxd" diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx.in b/cuda_bindings/cuda/bindings/nvrtc.pyx.in index 7874cf7bc0..2f50afa726 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from typing import Any, Optional import cython import ctypes diff --git a/cuda_bindings/cuda/bindings/nvvm.pxd b/cuda_bindings/cuda/bindings/nvvm.pxd index 87980f0f0f..4cf37a3464 100644 --- a/cuda_bindings/cuda/bindings/nvvm.pxd +++ b/cuda_bindings/cuda/bindings/nvvm.pxd @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/nvvm.pyx b/cuda_bindings/cuda/bindings/nvvm.pyx index bcce619351..3cf4254561 100644 --- a/cuda_bindings/cuda/bindings/nvvm.pyx +++ b/cuda_bindings/cuda/bindings/nvvm.pyx @@ -1,8 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1406+gd8426ea19.d20260316. Do not modify it directly. 
+# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cimport cython # NOQA diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in index 2a54d3f675..5cccc06e6f 100644 --- a/cuda_bindings/cuda/bindings/runtime.pxd.in +++ b/cuda_bindings/cuda/bindings/runtime.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.2.0, generator version 8797618. Do not modify it directly. +# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly. cimport cuda.bindings.cyruntime as cyruntime include "_lib/utils.pxd" @@ -511,7 +511,7 @@ cdef class cudaArrayMemoryRequirements: cdef class cudaPitchedPtr: """ - CUDA Pitched memory pointer ::make_cudaPitchedPtr + CUDA Pitched memory pointer make_cudaPitchedPtr Attributes ---------- @@ -547,7 +547,7 @@ cdef class cudaPitchedPtr: cdef class cudaExtent: """ - CUDA extent ::make_cudaExtent + CUDA extent make_cudaExtent Attributes ---------- @@ -577,7 +577,7 @@ cdef class cudaExtent: cdef class cudaPos: """ - CUDA 3D position ::make_cudaPos + CUDA 3D position make_cudaPos Attributes ---------- @@ -3817,9 +3817,9 @@ cdef class cudaGraphEdgeData_st: {{endif}} {{if 'cudaGraphEdgeData_st.type' in found_struct}} type : bytes - This should be populated with a value from - ::cudaGraphDependencyType. (It is typed as char due to compiler- - specific layout of bitfields.) See ::cudaGraphDependencyType. + This should be populated with a value from cudaGraphDependencyType. + (It is typed as char due to compiler-specific layout of bitfields.) + See cudaGraphDependencyType. 
{{endif}} {{if 'cudaGraphEdgeData_st.reserved' in found_struct}} reserved : bytes @@ -4017,8 +4017,8 @@ cdef class cudaLaunchMemSyncDomainMap_st: Memory Synchronization Domain map See cudaLaunchMemSyncDomain. By default, kernels are launched in domain 0. Kernel launched with cudaLaunchMemSyncDomainRemote will have a different domain ID. User - may also alter the domain ID with ::cudaLaunchMemSyncDomainMap for - a specific stream / graph node / kernel launch. See + may also alter the domain ID with cudaLaunchMemSyncDomainMap for a + specific stream / graph node / kernel launch. See cudaLaunchAttributeMemSyncDomainMap. Domain ID range is available through cudaDevAttrMemSyncDomainCount. @@ -4176,8 +4176,7 @@ cdef class anon_struct21: cdef class cudaLaunchAttributeValue: """ - Launch attributes union; used as value field of - ::cudaLaunchAttribute + Launch attributes union; used as value field of cudaLaunchAttribute Attributes ---------- @@ -4197,7 +4196,7 @@ cdef class cudaLaunchAttributeValue: {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}} syncPolicy : cudaSynchronizationPolicy Value of launch attribute cudaLaunchAttributeSynchronizationPolicy. - ::cudaSynchronizationPolicy for work queued up in this stream. + cudaSynchronizationPolicy for work queued up in this stream. {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} clusterDim : anon_struct17 @@ -4225,9 +4224,10 @@ cdef class cudaLaunchAttributeValue: Value of launch attribute cudaLaunchAttributeProgrammaticEvent with the following fields: - `cudaEvent_t` event - Event to fire when all blocks trigger it. - `int` flags; - Event record flags, see - cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal. - - `int` triggerAtBlockStart - If this is set to non-0, each block - launch will automatically trigger the event. + ::cudaEventRecordWithFlags. Does not accept + cudaEventRecordExternal. 
- `int` triggerAtBlockStart - If this + is set to non-0, each block launch will automatically trigger the + event. {{endif}} {{if 'cudaLaunchAttributeValue.priority' in found_struct}} priority : int @@ -4237,7 +4237,7 @@ cdef class cudaLaunchAttributeValue: {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}} memSyncDomainMap : cudaLaunchMemSyncDomainMap Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See - ::cudaLaunchMemSyncDomainMap. + cudaLaunchMemSyncDomainMap. {{endif}} {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}} memSyncDomain : cudaLaunchMemSyncDomain @@ -4252,19 +4252,19 @@ cdef class cudaLaunchAttributeValue: with the following fields: - `x` - The X dimension of the preferred cluster, in blocks. Must be a divisor of the grid X dimension, and must be a multiple of the `x` field of - cudaLaunchAttributeValue::clusterDim. - `y` - The Y dimension of - the preferred cluster, in blocks. Must be a divisor of the grid Y - dimension, and must be a multiple of the `y` field of - cudaLaunchAttributeValue::clusterDim. - `z` - The Z dimension of - the preferred cluster, in blocks. Must be equal to the `z` field of - cudaLaunchAttributeValue::clusterDim. + ::cudaLaunchAttributeValue::clusterDim. - `y` - The Y dimension + of the preferred cluster, in blocks. Must be a divisor of the grid + Y dimension, and must be a multiple of the `y` field of + ::cudaLaunchAttributeValue::clusterDim. - `z` - The Z dimension + of the preferred cluster, in blocks. Must be equal to the `z` field + of ::cudaLaunchAttributeValue::clusterDim. {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} launchCompletionEvent : anon_struct20 Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent with the following fields: - `cudaEvent_t` event - Event to fire when the last block launches. - `int` flags - Event record - flags, see cudaEventRecordWithFlags. Does not accept + flags, see ::cudaEventRecordWithFlags. 
Does not accept cudaEventRecordExternal. {{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} @@ -4835,9 +4835,9 @@ cdef class cudaGraphEdgeData(cudaGraphEdgeData_st): {{endif}} {{if 'cudaGraphEdgeData_st.type' in found_struct}} type : bytes - This should be populated with a value from - ::cudaGraphDependencyType. (It is typed as char due to compiler- - specific layout of bitfields.) See ::cudaGraphDependencyType. + This should be populated with a value from cudaGraphDependencyType. + (It is typed as char due to compiler-specific layout of bitfields.) + See cudaGraphDependencyType. {{endif}} {{if 'cudaGraphEdgeData_st.reserved' in found_struct}} reserved : bytes @@ -4922,8 +4922,8 @@ cdef class cudaLaunchMemSyncDomainMap(cudaLaunchMemSyncDomainMap_st): Memory Synchronization Domain map See cudaLaunchMemSyncDomain. By default, kernels are launched in domain 0. Kernel launched with cudaLaunchMemSyncDomainRemote will have a different domain ID. User - may also alter the domain ID with ::cudaLaunchMemSyncDomainMap for - a specific stream / graph node / kernel launch. See + may also alter the domain ID with cudaLaunchMemSyncDomainMap for a + specific stream / graph node / kernel launch. See cudaLaunchAttributeMemSyncDomainMap. Domain ID range is available through cudaDevAttrMemSyncDomainCount. @@ -4998,8 +4998,7 @@ cdef class cudaAsyncNotificationInfo_t(cudaAsyncNotificationInfo): cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): """ - Launch attributes union; used as value field of - ::cudaLaunchAttribute + Launch attributes union; used as value field of cudaLaunchAttribute Attributes ---------- @@ -5019,7 +5018,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}} syncPolicy : cudaSynchronizationPolicy Value of launch attribute cudaLaunchAttributeSynchronizationPolicy. - ::cudaSynchronizationPolicy for work queued up in this stream. 
+ cudaSynchronizationPolicy for work queued up in this stream. {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} clusterDim : anon_struct17 @@ -5047,9 +5046,10 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): Value of launch attribute cudaLaunchAttributeProgrammaticEvent with the following fields: - `cudaEvent_t` event - Event to fire when all blocks trigger it. - `int` flags; - Event record flags, see - cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal. - - `int` triggerAtBlockStart - If this is set to non-0, each block - launch will automatically trigger the event. + ::cudaEventRecordWithFlags. Does not accept + cudaEventRecordExternal. - `int` triggerAtBlockStart - If this + is set to non-0, each block launch will automatically trigger the + event. {{endif}} {{if 'cudaLaunchAttributeValue.priority' in found_struct}} priority : int @@ -5059,7 +5059,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}} memSyncDomainMap : cudaLaunchMemSyncDomainMap Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See - ::cudaLaunchMemSyncDomainMap. + cudaLaunchMemSyncDomainMap. {{endif}} {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}} memSyncDomain : cudaLaunchMemSyncDomain @@ -5074,19 +5074,19 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): with the following fields: - `x` - The X dimension of the preferred cluster, in blocks. Must be a divisor of the grid X dimension, and must be a multiple of the `x` field of - cudaLaunchAttributeValue::clusterDim. - `y` - The Y dimension of - the preferred cluster, in blocks. Must be a divisor of the grid Y - dimension, and must be a multiple of the `y` field of - cudaLaunchAttributeValue::clusterDim. - `z` - The Z dimension of - the preferred cluster, in blocks. Must be equal to the `z` field of - cudaLaunchAttributeValue::clusterDim. + ::cudaLaunchAttributeValue::clusterDim. 
- `y` - The Y dimension + of the preferred cluster, in blocks. Must be a divisor of the grid + Y dimension, and must be a multiple of the `y` field of + ::cudaLaunchAttributeValue::clusterDim. - `z` - The Z dimension + of the preferred cluster, in blocks. Must be equal to the `z` field + of ::cudaLaunchAttributeValue::clusterDim. {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} launchCompletionEvent : anon_struct20 Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent with the following fields: - `cudaEvent_t` event - Event to fire when the last block launches. - `int` flags - Event record - flags, see cudaEventRecordWithFlags. Does not accept + flags, see ::cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal. {{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} @@ -5130,8 +5130,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): """ - Launch attributes union; used as value field of - ::cudaLaunchAttribute + Launch attributes union; used as value field of cudaLaunchAttribute Attributes ---------- @@ -5151,7 +5150,7 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}} syncPolicy : cudaSynchronizationPolicy Value of launch attribute cudaLaunchAttributeSynchronizationPolicy. - ::cudaSynchronizationPolicy for work queued up in this stream. + cudaSynchronizationPolicy for work queued up in this stream. {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} clusterDim : anon_struct17 @@ -5179,9 +5178,10 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): Value of launch attribute cudaLaunchAttributeProgrammaticEvent with the following fields: - `cudaEvent_t` event - Event to fire when all blocks trigger it. - `int` flags; - Event record flags, see - cudaEventRecordWithFlags. 
Does not accept cudaEventRecordExternal. - - `int` triggerAtBlockStart - If this is set to non-0, each block - launch will automatically trigger the event. + ::cudaEventRecordWithFlags. Does not accept + cudaEventRecordExternal. - `int` triggerAtBlockStart - If this + is set to non-0, each block launch will automatically trigger the + event. {{endif}} {{if 'cudaLaunchAttributeValue.priority' in found_struct}} priority : int @@ -5191,7 +5191,7 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}} memSyncDomainMap : cudaLaunchMemSyncDomainMap Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See - ::cudaLaunchMemSyncDomainMap. + cudaLaunchMemSyncDomainMap. {{endif}} {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}} memSyncDomain : cudaLaunchMemSyncDomain @@ -5206,19 +5206,19 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): with the following fields: - `x` - The X dimension of the preferred cluster, in blocks. Must be a divisor of the grid X dimension, and must be a multiple of the `x` field of - cudaLaunchAttributeValue::clusterDim. - `y` - The Y dimension of - the preferred cluster, in blocks. Must be a divisor of the grid Y - dimension, and must be a multiple of the `y` field of - cudaLaunchAttributeValue::clusterDim. - `z` - The Z dimension of - the preferred cluster, in blocks. Must be equal to the `z` field of - cudaLaunchAttributeValue::clusterDim. + ::cudaLaunchAttributeValue::clusterDim. - `y` - The Y dimension + of the preferred cluster, in blocks. Must be a divisor of the grid + Y dimension, and must be a multiple of the `y` field of + ::cudaLaunchAttributeValue::clusterDim. - `z` - The Z dimension + of the preferred cluster, in blocks. Must be equal to the `z` field + of ::cudaLaunchAttributeValue::clusterDim. 
{{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} launchCompletionEvent : anon_struct20 Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent with the following fields: - `cudaEvent_t` event - Event to fire when the last block launches. - `int` flags - Event record - flags, see cudaEventRecordWithFlags. Does not accept + flags, see ::cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal. {{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index 88909a4e55..279a8c1759 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -1599,41 +1599,41 @@ class cudaLaunchAttributeID(_FastEnum): cudaLaunchAttributeAccessPolicyWindow = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n' ){{endif}} {{if 'cudaLaunchAttributeCooperative' in found_values}} cudaLaunchAttributeCooperative = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative, 'Valid for graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n' ){{endif}} {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}} cudaLaunchAttributeSynchronizationPolicy = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy, - 'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n' + 'Valid for streams. 
See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n' ){{endif}} {{if 'cudaLaunchAttributeClusterDimension' in found_values}} cudaLaunchAttributeClusterDimension = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension, 'Valid for graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n' ){{endif}} {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}} cudaLaunchAttributeClusterSchedulingPolicyPreference = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference, 'Valid for graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n' ){{endif}} {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}} cudaLaunchAttributeProgrammaticStreamSerialization = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization, 'Valid for launches. Setting\n' - ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n' + ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n' 'to non-0 signals that the kernel will use programmatic means to resolve its\n' 'stream dependency, so that the CUDA runtime should opportunistically allow\n' "the grid's execution to overlap with the previous kernel in the stream, if\n" @@ -1646,11 +1646,11 @@ class cudaLaunchAttributeID(_FastEnum): cudaLaunchAttributeProgrammaticEvent = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent, 'Valid for launches. Set\n' - ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n' - 'Event recorded through this launch attribute is guaranteed to only trigger\n' - 'after all block in the associated kernel trigger the event. A block can\n' - 'trigger the event programmatically in a future CUDA release. 
A trigger can\n' - "also be inserted at the beginning of each block's execution if\n" + ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n' + 'event. Event recorded through this launch attribute is guaranteed to only\n' + 'trigger after all block in the associated kernel trigger the event. A block\n' + 'can trigger the event programmatically in a future CUDA release. A trigger\n' + "can also be inserted at the beginning of each block's execution if\n" 'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n' 'wait on the dependency using the programmatic sync\n' '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n' @@ -1671,28 +1671,28 @@ class cudaLaunchAttributeID(_FastEnum): cudaLaunchAttributePriority = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n' ){{endif}} {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}} cudaLaunchAttributeMemSyncDomainMap = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n' ){{endif}} {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}} cudaLaunchAttributeMemSyncDomain = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n' ){{endif}} {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}} cudaLaunchAttributePreferredClusterDimension = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension, 'Valid for graph nodes and launches. 
Set\n' - ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n' + ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n' 'kernel launch to specify a preferred substitute cluster dimension. Blocks\n' 'may be grouped according to either the dimensions specified with this\n' 'attribute (grouped into a "preferred substitute cluster"), or the one\n' @@ -1726,7 +1726,7 @@ class cudaLaunchAttributeID(_FastEnum): cudaLaunchAttributeLaunchCompletionEvent = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent, 'Valid for launches. Set\n' - ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n' + ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n' 'event.\n' ' Nominally, the event is triggered once all blocks of the kernel have begun\n' 'execution. Currently this is a best effort. If a kernel B has a launch\n' @@ -1781,7 +1781,7 @@ class cudaLaunchAttributeID(_FastEnum): cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout, 'Valid for launches. On devices where the L1 cache and shared memory use the\n' 'same hardware resources, setting\n' - ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n' + ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n' 'between 0-100 signals sets the shared memory carveout preference in percent\n' 'of the total shared memory for that kernel launch. 
This attribute takes\n' 'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n' @@ -1807,7 +1807,7 @@ class cudaLaunchAttributeID(_FastEnum): 'not improve the performance of either the targeted kernel or the\n' 'encapsulating application.\n' ' Valid values for\n' - ':py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n' + ':py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n' '(disabled) and 1 (enabled).\n' ){{endif}} {{if 'cudaLaunchAttributePortableClusterSizeMode' in found_values}} @@ -1816,8 +1816,8 @@ class cudaLaunchAttributeID(_FastEnum): cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode, 'Valid for graph nodes, launches. This indicates whether the kernel launch\n' 'is allowed to use a non-portable cluster size. Valid values for\n' - ':py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for\n' - ':py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n' + ':py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values\n' + 'for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n' 'return :py:obj:`~.cudaErrorInvalidValue`\n' ){{endif}} {{if 'cudaLaunchAttributeSharedMemoryMode' in found_values}} @@ -6542,41 +6542,41 @@ class cudaStreamAttrID(_FastEnum): cudaLaunchAttributeAccessPolicyWindow = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n' ){{endif}} {{if 'cudaLaunchAttributeCooperative' in found_values}} cudaLaunchAttributeCooperative = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative, 'Valid for graph nodes, launches. 
See\n' - ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n' ){{endif}} {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}} cudaLaunchAttributeSynchronizationPolicy = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy, - 'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n' + 'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n' ){{endif}} {{if 'cudaLaunchAttributeClusterDimension' in found_values}} cudaLaunchAttributeClusterDimension = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension, 'Valid for graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n' ){{endif}} {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}} cudaLaunchAttributeClusterSchedulingPolicyPreference = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference, 'Valid for graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n' ){{endif}} {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}} cudaLaunchAttributeProgrammaticStreamSerialization = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization, 'Valid for launches. 
Setting\n' - ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n' + ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n' 'to non-0 signals that the kernel will use programmatic means to resolve its\n' 'stream dependency, so that the CUDA runtime should opportunistically allow\n' "the grid's execution to overlap with the previous kernel in the stream, if\n" @@ -6589,11 +6589,11 @@ class cudaStreamAttrID(_FastEnum): cudaLaunchAttributeProgrammaticEvent = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent, 'Valid for launches. Set\n' - ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n' - 'Event recorded through this launch attribute is guaranteed to only trigger\n' - 'after all block in the associated kernel trigger the event. A block can\n' - 'trigger the event programmatically in a future CUDA release. A trigger can\n' - "also be inserted at the beginning of each block's execution if\n" + ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n' + 'event. Event recorded through this launch attribute is guaranteed to only\n' + 'trigger after all block in the associated kernel trigger the event. A block\n' + 'can trigger the event programmatically in a future CUDA release. A trigger\n' + "can also be inserted at the beginning of each block's execution if\n" 'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n' 'wait on the dependency using the programmatic sync\n' '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n' @@ -6614,28 +6614,28 @@ class cudaStreamAttrID(_FastEnum): cudaLaunchAttributePriority = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority, 'Valid for streams, graph nodes, launches. 
See\n' - ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n' ){{endif}} {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}} cudaLaunchAttributeMemSyncDomainMap = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n' ){{endif}} {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}} cudaLaunchAttributeMemSyncDomain = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n' ){{endif}} {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}} cudaLaunchAttributePreferredClusterDimension = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension, 'Valid for graph nodes and launches. Set\n' - ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n' + ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n' 'kernel launch to specify a preferred substitute cluster dimension. Blocks\n' 'may be grouped according to either the dimensions specified with this\n' 'attribute (grouped into a "preferred substitute cluster"), or the one\n' @@ -6669,7 +6669,7 @@ class cudaStreamAttrID(_FastEnum): cudaLaunchAttributeLaunchCompletionEvent = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent, 'Valid for launches. Set\n' - ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n' + ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n' 'event.\n' ' Nominally, the event is triggered once all blocks of the kernel have begun\n' 'execution. Currently this is a best effort. 
If a kernel B has a launch\n' @@ -6724,7 +6724,7 @@ class cudaStreamAttrID(_FastEnum): cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout, 'Valid for launches. On devices where the L1 cache and shared memory use the\n' 'same hardware resources, setting\n' - ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n' + ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n' 'between 0-100 signals sets the shared memory carveout preference in percent\n' 'of the total shared memory for that kernel launch. This attribute takes\n' 'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n' @@ -6750,7 +6750,7 @@ class cudaStreamAttrID(_FastEnum): 'not improve the performance of either the targeted kernel or the\n' 'encapsulating application.\n' ' Valid values for\n' - ':py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n' + ':py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n' '(disabled) and 1 (enabled).\n' ){{endif}} {{if 'cudaLaunchAttributePortableClusterSizeMode' in found_values}} @@ -6759,8 +6759,8 @@ class cudaStreamAttrID(_FastEnum): cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode, 'Valid for graph nodes, launches. This indicates whether the kernel launch\n' 'is allowed to use a non-portable cluster size. 
Valid values for\n' - ':py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for\n' - ':py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n' + ':py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values\n' + 'for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n' 'return :py:obj:`~.cudaErrorInvalidValue`\n' ){{endif}} {{if 'cudaLaunchAttributeSharedMemoryMode' in found_values}} @@ -6790,41 +6790,41 @@ class cudaKernelNodeAttrID(_FastEnum): cudaLaunchAttributeAccessPolicyWindow = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n' ){{endif}} {{if 'cudaLaunchAttributeCooperative' in found_values}} cudaLaunchAttributeCooperative = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative, 'Valid for graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n' ){{endif}} {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}} cudaLaunchAttributeSynchronizationPolicy = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy, - 'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n' + 'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n' ){{endif}} {{if 'cudaLaunchAttributeClusterDimension' in found_values}} cudaLaunchAttributeClusterDimension = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension, 'Valid for graph nodes, launches. 
See\n' - ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n' ){{endif}} {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}} cudaLaunchAttributeClusterSchedulingPolicyPreference = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference, 'Valid for graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n' ){{endif}} {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}} cudaLaunchAttributeProgrammaticStreamSerialization = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization, 'Valid for launches. Setting\n' - ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n' + ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n' 'to non-0 signals that the kernel will use programmatic means to resolve its\n' 'stream dependency, so that the CUDA runtime should opportunistically allow\n' "the grid's execution to overlap with the previous kernel in the stream, if\n" @@ -6837,11 +6837,11 @@ class cudaKernelNodeAttrID(_FastEnum): cudaLaunchAttributeProgrammaticEvent = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent, 'Valid for launches. Set\n' - ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n' - 'Event recorded through this launch attribute is guaranteed to only trigger\n' - 'after all block in the associated kernel trigger the event. A block can\n' - 'trigger the event programmatically in a future CUDA release. A trigger can\n' - "also be inserted at the beginning of each block's execution if\n" + ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n' + 'event. 
Event recorded through this launch attribute is guaranteed to only\n' + 'trigger after all block in the associated kernel trigger the event. A block\n' + 'can trigger the event programmatically in a future CUDA release. A trigger\n' + "can also be inserted at the beginning of each block's execution if\n" 'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n' 'wait on the dependency using the programmatic sync\n' '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n' @@ -6862,28 +6862,28 @@ class cudaKernelNodeAttrID(_FastEnum): cudaLaunchAttributePriority = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n' ){{endif}} {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}} cudaLaunchAttributeMemSyncDomainMap = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n' ){{endif}} {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}} cudaLaunchAttributeMemSyncDomain = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain, 'Valid for streams, graph nodes, launches. See\n' - ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n' + ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n' ){{endif}} {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}} cudaLaunchAttributePreferredClusterDimension = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension, 'Valid for graph nodes and launches. Set\n' - ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n' + ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n' 'kernel launch to specify a preferred substitute cluster dimension. 
Blocks\n' 'may be grouped according to either the dimensions specified with this\n' 'attribute (grouped into a "preferred substitute cluster"), or the one\n' @@ -6917,7 +6917,7 @@ class cudaKernelNodeAttrID(_FastEnum): cudaLaunchAttributeLaunchCompletionEvent = ( cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent, 'Valid for launches. Set\n' - ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n' + ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n' 'event.\n' ' Nominally, the event is triggered once all blocks of the kernel have begun\n' 'execution. Currently this is a best effort. If a kernel B has a launch\n' @@ -6972,7 +6972,7 @@ class cudaKernelNodeAttrID(_FastEnum): cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout, 'Valid for launches. On devices where the L1 cache and shared memory use the\n' 'same hardware resources, setting\n' - ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n' + ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n' 'between 0-100 signals sets the shared memory carveout preference in percent\n' 'of the total shared memory for that kernel launch. This attribute takes\n' 'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n' @@ -6998,7 +6998,7 @@ class cudaKernelNodeAttrID(_FastEnum): 'not improve the performance of either the targeted kernel or the\n' 'encapsulating application.\n' ' Valid values for\n' - ':py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n' + ':py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n' '(disabled) and 1 (enabled).\n' ){{endif}} {{if 'cudaLaunchAttributePortableClusterSizeMode' in found_values}} @@ -7007,8 +7007,8 @@ class cudaKernelNodeAttrID(_FastEnum): cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode, 'Valid for graph nodes, launches. 
This indicates whether the kernel launch\n' 'is allowed to use a non-portable cluster size. Valid values for\n' - ':py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for\n' - ':py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n' + ':py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values\n' + 'for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n' 'return :py:obj:`~.cudaErrorInvalidValue`\n' ){{endif}} {{if 'cudaLaunchAttributeSharedMemoryMode' in found_values}} @@ -8260,7 +8260,7 @@ cdef class cudaArrayMemoryRequirements: cdef class cudaPitchedPtr: """ - CUDA Pitched memory pointer ::make_cudaPitchedPtr + CUDA Pitched memory pointer make_cudaPitchedPtr Attributes ---------- @@ -8365,7 +8365,7 @@ cdef class cudaPitchedPtr: cdef class cudaExtent: """ - CUDA extent ::make_cudaExtent + CUDA extent make_cudaExtent Attributes ---------- @@ -8452,7 +8452,7 @@ cdef class cudaExtent: cdef class cudaPos: """ - CUDA 3D position ::make_cudaPos + CUDA 3D position make_cudaPos Attributes ---------- @@ -18278,9 +18278,9 @@ cdef class cudaGraphEdgeData_st: {{endif}} {{if 'cudaGraphEdgeData_st.type' in found_struct}} type : bytes - This should be populated with a value from - ::cudaGraphDependencyType. (It is typed as char due to compiler- - specific layout of bitfields.) See ::cudaGraphDependencyType. + This should be populated with a value from cudaGraphDependencyType. + (It is typed as char due to compiler-specific layout of bitfields.) + See cudaGraphDependencyType. {{endif}} {{if 'cudaGraphEdgeData_st.reserved' in found_struct}} reserved : bytes @@ -18894,8 +18894,8 @@ cdef class cudaLaunchMemSyncDomainMap_st: Memory Synchronization Domain map See cudaLaunchMemSyncDomain. By default, kernels are launched in domain 0. Kernel launched with cudaLaunchMemSyncDomainRemote will have a different domain ID. 
User - may also alter the domain ID with ::cudaLaunchMemSyncDomainMap for - a specific stream / graph node / kernel launch. See + may also alter the domain ID with cudaLaunchMemSyncDomainMap for a + specific stream / graph node / kernel launch. See cudaLaunchAttributeMemSyncDomainMap. Domain ID range is available through cudaDevAttrMemSyncDomainCount. @@ -19375,8 +19375,7 @@ cdef class anon_struct21: cdef class cudaLaunchAttributeValue: """ - Launch attributes union; used as value field of - ::cudaLaunchAttribute + Launch attributes union; used as value field of cudaLaunchAttribute Attributes ---------- @@ -19396,7 +19395,7 @@ cdef class cudaLaunchAttributeValue: {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}} syncPolicy : cudaSynchronizationPolicy Value of launch attribute cudaLaunchAttributeSynchronizationPolicy. - ::cudaSynchronizationPolicy for work queued up in this stream. + cudaSynchronizationPolicy for work queued up in this stream. {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} clusterDim : anon_struct17 @@ -19424,9 +19423,10 @@ cdef class cudaLaunchAttributeValue: Value of launch attribute cudaLaunchAttributeProgrammaticEvent with the following fields: - `cudaEvent_t` event - Event to fire when all blocks trigger it. - `int` flags; - Event record flags, see - cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal. - - `int` triggerAtBlockStart - If this is set to non-0, each block - launch will automatically trigger the event. + ::cudaEventRecordWithFlags. Does not accept + cudaEventRecordExternal. - `int` triggerAtBlockStart - If this + is set to non-0, each block launch will automatically trigger the + event. 
{{endif}} {{if 'cudaLaunchAttributeValue.priority' in found_struct}} priority : int @@ -19436,7 +19436,7 @@ cdef class cudaLaunchAttributeValue: {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}} memSyncDomainMap : cudaLaunchMemSyncDomainMap Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See - ::cudaLaunchMemSyncDomainMap. + cudaLaunchMemSyncDomainMap. {{endif}} {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}} memSyncDomain : cudaLaunchMemSyncDomain @@ -19451,19 +19451,19 @@ cdef class cudaLaunchAttributeValue: with the following fields: - `x` - The X dimension of the preferred cluster, in blocks. Must be a divisor of the grid X dimension, and must be a multiple of the `x` field of - cudaLaunchAttributeValue::clusterDim. - `y` - The Y dimension of - the preferred cluster, in blocks. Must be a divisor of the grid Y - dimension, and must be a multiple of the `y` field of - cudaLaunchAttributeValue::clusterDim. - `z` - The Z dimension of - the preferred cluster, in blocks. Must be equal to the `z` field of - cudaLaunchAttributeValue::clusterDim. + ::cudaLaunchAttributeValue::clusterDim. - `y` - The Y dimension + of the preferred cluster, in blocks. Must be a divisor of the grid + Y dimension, and must be a multiple of the `y` field of + ::cudaLaunchAttributeValue::clusterDim. - `z` - The Z dimension + of the preferred cluster, in blocks. Must be equal to the `z` field + of ::cudaLaunchAttributeValue::clusterDim. {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} launchCompletionEvent : anon_struct20 Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent with the following fields: - `cudaEvent_t` event - Event to fire when the last block launches. - `int` flags - Event record - flags, see cudaEventRecordWithFlags. Does not accept + flags, see ::cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal. 
{{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} @@ -21243,31 +21243,7 @@ def cudaDeviceGetLimit(limit not None : cudaLimit): @cython.embedsignature(True) def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDesc], int device): - """ Returns the maximum number of elements allocatable in a 1D linear texture for a given element size. - - Returns in `maxWidthInElements` the maximum number of elements - allocatable in a 1D linear texture for given format descriptor - `fmtDesc`. - - Parameters - ---------- - fmtDesc : :py:obj:`~.cudaChannelFormatDesc` - Texture format description. - None : int - None - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorUnsupportedLimit`, :py:obj:`~.cudaErrorInvalidValue` - maxWidthInElements : int - Returns maximum number of texture elements allocatable for given - `fmtDesc`. - - See Also - -------- - :py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth` - """ + """""" cdef size_t maxWidthInElements = 0 cdef cyruntime.cudaChannelFormatDesc* cyfmtDesc_ptr = fmtDesc._pvt_ptr if fmtDesc is not None else NULL with nogil: @@ -21281,7 +21257,13 @@ def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDes @cython.embedsignature(True) def cudaDeviceGetCacheConfig(): - """ Returns the preferred cache configuration for the current device. + """ Returns the maximum number of elements allocatable in a 1D linear texture for a given element size. + + Returns in `maxWidthInElements` the maximum number of elements + allocatable in a 1D linear texture for given format descriptor + `fmtDesc`. + + Returns the preferred cache configuration for the current device. 
On devices where the L1 cache and shared memory use the same hardware resources, this returns through `pCacheConfig` the preferred cache @@ -21311,12 +21293,16 @@ def cudaDeviceGetCacheConfig(): Returns ------- cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorUnsupportedLimit`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaSuccess` - pCacheConfig : :py:obj:`~.cudaFuncCache` - Returned cache configuration + maxWidthInElements : :py:obj:`~.cudaFuncCache` + Returns maximum number of texture elements allocatable for given + `fmtDesc`. See Also -------- + :py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth` + :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig (C API)`, cudaFuncSetCacheConfig (C++ API), :py:obj:`~.cuCtxGetCacheConfig` """ cdef cyruntime.cudaFuncCache pCacheConfig @@ -21788,38 +21774,7 @@ def cudaIpcCloseMemHandle(devPtr): @cython.embedsignature(True) def cudaDeviceFlushGPUDirectRDMAWrites(target not None : cudaFlushGPUDirectRDMAWritesTarget, scope not None : cudaFlushGPUDirectRDMAWritesScope): - """ Blocks until remote writes are visible to the specified scope. - - Blocks until remote writes to the target context via mappings created - through GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see - https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are - visible to the specified scope. - - If the scope equals or lies within the scope indicated by - :py:obj:`~.cudaDevAttrGPUDirectRDMAWritesOrdering`, the call will be a - no-op and can be safely omitted for performance. This can be determined - by comparing the numerical values between the two enums, with smaller - scopes having smaller values. - - Users may query support for this API via - :py:obj:`~.cudaDevAttrGPUDirectRDMAFlushWritesOptions`. 
- - Parameters - ---------- - target : :py:obj:`~.cudaFlushGPUDirectRDMAWritesTarget` - The target of the operation, see cudaFlushGPUDirectRDMAWritesTarget - scope : :py:obj:`~.cudaFlushGPUDirectRDMAWritesScope` - The scope of the operation, see cudaFlushGPUDirectRDMAWritesScope - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotSupported`, - - See Also - -------- - :py:obj:`~.cuFlushGPUDirectRDMAWrites` - """ + """""" cdef cyruntime.cudaFlushGPUDirectRDMAWritesTarget cytarget = int(target) cdef cyruntime.cudaFlushGPUDirectRDMAWritesScope cyscope = int(scope) with nogil: @@ -21843,7 +21798,23 @@ cdef void cudaAsyncNotificationCallbackWrapper(cyruntime.cudaAsyncNotificationIn @cython.embedsignature(True) def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData): - """ Registers a callback function to receive async notifications. + """ Blocks until remote writes are visible to the specified scope. + + Blocks until remote writes to the target context via mappings created + through GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see + https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are + visible to the specified scope. + + If the scope equals or lies within the scope indicated by + :py:obj:`~.cudaDevAttrGPUDirectRDMAWritesOrdering`, the call will be a + no-op and can be safely omitted for performance. This can be determined + by comparing the numerical values between the two enums, with smaller + scopes having smaller values. + + Users may query support for this API via + :py:obj:`~.cudaDevAttrGPUDirectRDMAFlushWritesOptions`. + + Registers a callback function to receive async notifications Registers `callbackFunc` to receive async notifications. 
@@ -21865,23 +21836,25 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData): Parameters ---------- - device : int + target : int + The target of the operation, see cudaFlushGPUDirectRDMAWritesTarget + scope : :py:obj:`~.cudaAsyncCallback` + The scope of the operation, see cudaFlushGPUDirectRDMAWritesScope + device : Any The device on which to register the callback - callbackFunc : :py:obj:`~.cudaAsyncCallback` - The function to register as a callback - userData : Any - A generic pointer to user data. This is passed into the callback - function. Returns ------- cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorNotSupported` :py:obj:`~.cudaErrorInvalidDevice` :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorNotPermitted` :py:obj:`~.cudaErrorUnknown` - callback : :py:obj:`~.cudaAsyncCallbackHandle_t` - A handle representing the registered callback instance + callbackFunc : :py:obj:`~.cudaAsyncCallbackHandle_t` + The function to register as a callback See Also -------- + :py:obj:`~.cuFlushGPUDirectRDMAWrites` + :py:obj:`~.cudaDeviceUnregisterAsyncNotification` """ cdef cyruntime.cudaAsyncCallback cycallbackFunc @@ -22507,8 +22480,8 @@ def cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, int device, int flags): - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if `device` is GA10X+. NvSciSyncAttrKey_GpuId is set to the same UUID that is - returned in `None` from :py:obj:`~.cudaDeviceGetProperties` for this - `device`. + returned in `cudaDeviceProp.uuid` from + :py:obj:`~.cudaDeviceGetProperties` for this `device`. :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorDeviceUninitialized`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidHandle`, @@ -24529,49 +24502,7 @@ def cudaEventRecord(event, stream): @cython.embedsignature(True) def cudaEventRecordWithFlags(event, stream, unsigned int flags): - """ Records an event. 
- - Captures in `event` the contents of `stream` at the time of this call. - `event` and `stream` must be on the same CUDA context. Calls such as - :py:obj:`~.cudaEventQuery()` or :py:obj:`~.cudaStreamWaitEvent()` will - then examine or wait for completion of the work that was captured. Uses - of `stream` after this call do not modify `event`. See note on default - stream behavior for what is captured in the default case. - - :py:obj:`~.cudaEventRecordWithFlags()` can be called multiple times on - the same event and will overwrite the previously captured state. Other - APIs such as :py:obj:`~.cudaStreamWaitEvent()` use the most recently - captured state at the time of the API call, and are not affected by - later calls to :py:obj:`~.cudaEventRecordWithFlags()`. Before the first - call to :py:obj:`~.cudaEventRecordWithFlags()`, an event represents an - empty set of work, so for example :py:obj:`~.cudaEventQuery()` would - return :py:obj:`~.cudaSuccess`. - - flags include: - - - :py:obj:`~.cudaEventRecordDefault`: Default event creation flag. - - - :py:obj:`~.cudaEventRecordExternal`: Event is captured in the graph - as an external event node when performing stream capture. 
- - Parameters - ---------- - event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Event to record - stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` - Stream in which to record event - flags : unsigned int - Parameters for the operation(See above) - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure` - - See Also - -------- - :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cuEventRecord`, - """ + """""" cdef cyruntime.cudaStream_t cystream if stream is None: pstream = 0 @@ -24597,7 +24528,32 @@ def cudaEventRecordWithFlags(event, stream, unsigned int flags): @cython.embedsignature(True) def cudaEventQuery(event): - """ Queries an event's status. + """ Records an event. + + Captures in `event` the contents of `stream` at the time of this call. + `event` and `stream` must be on the same CUDA context. Calls such as + :py:obj:`~.cudaEventQuery()` or :py:obj:`~.cudaStreamWaitEvent()` will + then examine or wait for completion of the work that was captured. Uses + of `stream` after this call do not modify `event`. See note on default + stream behavior for what is captured in the default case. + + :py:obj:`~.cudaEventRecordWithFlags()` can be called multiple times on + the same event and will overwrite the previously captured state. Other + APIs such as :py:obj:`~.cudaStreamWaitEvent()` use the most recently + captured state at the time of the API call, and are not affected by + later calls to :py:obj:`~.cudaEventRecordWithFlags()`. 
Before the first + call to :py:obj:`~.cudaEventRecordWithFlags()`, an event represents an + empty set of work, so for example :py:obj:`~.cudaEventQuery()` would + return :py:obj:`~.cudaSuccess`. + + flags include: + + - :py:obj:`~.cudaEventRecordDefault`: Default event creation flag. + + - :py:obj:`~.cudaEventRecordExternal`: Event is captured in the graph + as an external event node when performing stream capture. + + Queries an event's status Queries the status of all work currently captured by `event`. See :py:obj:`~.cudaEventRecord()` for details on what is captured by an @@ -24614,15 +24570,18 @@ def cudaEventQuery(event): Parameters ---------- event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Event to query + Event to record Returns ------- cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure` :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotReady`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure` See Also -------- + :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cuEventRecord`, + :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cuEventQuery` """ cdef cyruntime.cudaEvent_t cyevent @@ -25612,8 +25571,8 @@ def cudaFuncSetCacheConfig(func, cacheConfig not None : cudaFuncCache): possible, but it is free to choose a different configuration if required to execute `func`. - `func` is a device function symbol and must be declared as a `None` - function. 
If the specified function does not exist, then + `func` is a device function symbol and must be declared as a + `__global__` function. If the specified function does not exist, then :py:obj:`~.cudaErrorInvalidDeviceFunction` is returned. For templated functions, pass the function symbol as follows: func_name @@ -25675,8 +25634,8 @@ def cudaFuncGetAttributes(func): This function obtains the attributes of a function specified via `func`. `func` is a device function symbol and must be declared as a - `None` function. The fetched attributes are placed in `attr`. If the - specified function does not exist, then it is assumed to be a + `__global__` function. The fetched attributes are placed in `attr`. If + the specified function does not exist, then it is assumed to be a :py:obj:`~.cudaKernel_t` and used as is. For templated functions, pass the function symbol as follows: func_name @@ -25721,11 +25680,11 @@ def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value): This function sets the attributes of a function specified via `func`. The parameter `func` must be a pointer to a function that executes on the device. The parameter specified by `func` must be declared as a - `None` function. The enumeration defined by `attr` is set to the value - defined by `value`. If the specified function does not exist, then it - is assumed to be a :py:obj:`~.cudaKernel_t` and used as is. If the - specified attribute cannot be written, or if the value is incorrect, - then :py:obj:`~.cudaErrorInvalidValue` is returned. + `__global__` function. The enumeration defined by `attr` is set to the + value defined by `value`. If the specified function does not exist, + then it is assumed to be a :py:obj:`~.cudaKernel_t` and used as is. If + the specified attribute cannot be written, or if the value is + incorrect, then :py:obj:`~.cudaErrorInvalidValue` is returned. 
Valid values for `attr` are: @@ -27125,7 +27084,7 @@ def cudaMalloc3D(extent not None : cudaExtent): See Also -------- - :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMemAllocPitch` + :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaPitchedPtr, make_cudaExtent, :py:obj:`~.cuMemAllocPitch` """ cdef cudaPitchedPtr pitchedDevPtr = cudaPitchedPtr() with nogil: @@ -27247,7 +27206,7 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None : See Also -------- - :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuArray3DCreate` + :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuArray3DCreate` """ cdef cudaArray_t array = cudaArray_t() cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc is not None else NULL @@ -27373,7 +27332,7 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not See Also -------- - :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, 
:py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayCreate` + :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuMipmappedArrayCreate` """ cdef cudaMipmappedArray_t mipmappedArray = cudaMipmappedArray_t() cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc is not None else NULL @@ -27415,7 +27374,7 @@ def cudaGetMipmappedArrayLevel(mipmappedArray, unsigned int level): See Also -------- - :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayGetLevel` + :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuMipmappedArrayGetLevel` """ cdef cyruntime.cudaMipmappedArray_const_t cymipmappedArray if mipmappedArray is None: @@ -27509,7 +27468,7 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]): See Also -------- - :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, 
:py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3D` + :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, make_cudaExtent, make_cudaPos, :py:obj:`~.cuMemcpy3D` """ cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p is not None else NULL with nogil: @@ -27643,7 +27602,7 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream): See Also -------- - :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, ::::py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3DAsync` + :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, 
:py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, make_cudaExtent, make_cudaPos, :py:obj:`~.cuMemcpy3DAsync` """ cdef cyruntime.cudaStream_t cystream if stream is None: @@ -27953,42 +27912,7 @@ def cudaMipmappedArrayGetMemoryRequirements(mipmap, int device): @cython.embedsignature(True) def cudaArrayGetSparseProperties(array): - """ Returns the layout properties of a sparse CUDA array. - - Returns the layout properties of a sparse CUDA array in - `sparseProperties`. If the CUDA array is not allocated with flag - :py:obj:`~.cudaArraySparse` :py:obj:`~.cudaErrorInvalidValue` will be - returned. - - If the returned value in :py:obj:`~.cudaArraySparseProperties.flags` - contains :py:obj:`~.cudaArraySparsePropertiesSingleMipTail`, then - :py:obj:`~.cudaArraySparseProperties.miptailSize` represents the total - size of the array. Otherwise, it will be zero. Also, the returned value - in :py:obj:`~.cudaArraySparseProperties.miptailFirstLevel` is always - zero. Note that the `array` must have been allocated using - :py:obj:`~.cudaMallocArray` or :py:obj:`~.cudaMalloc3DArray`. For CUDA - arrays obtained using :py:obj:`~.cudaMipmappedArrayGetLevel`, - :py:obj:`~.cudaErrorInvalidValue` will be returned. Instead, - :py:obj:`~.cudaMipmappedArrayGetSparseProperties` must be used to - obtain the sparse properties of the entire CUDA mipmapped array to - which `array` belongs to. 
- - Parameters - ---------- - array : :py:obj:`~.cudaArray_t` - The CUDA array to get the sparse properties of - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue` - sparseProperties : :py:obj:`~.cudaArraySparseProperties` - Pointer to return the :py:obj:`~.cudaArraySparseProperties` - - See Also - -------- - :py:obj:`~.cudaMipmappedArrayGetSparseProperties`, :py:obj:`~.cuMemMapArrayAsync` - """ + """""" cdef cyruntime.cudaArray_t cyarray if array is None: parray = 0 @@ -28009,42 +27933,7 @@ def cudaArrayGetSparseProperties(array): @cython.embedsignature(True) def cudaMipmappedArrayGetSparseProperties(mipmap): - """ Returns the layout properties of a sparse CUDA mipmapped array. - - Returns the sparse array layout properties in `sparseProperties`. If - the CUDA mipmapped array is not allocated with flag - :py:obj:`~.cudaArraySparse` :py:obj:`~.cudaErrorInvalidValue` will be - returned. - - For non-layered CUDA mipmapped arrays, - :py:obj:`~.cudaArraySparseProperties.miptailSize` returns the size of - the mip tail region. The mip tail region includes all mip levels whose - width, height or depth is less than that of the tile. For layered CUDA - mipmapped arrays, if :py:obj:`~.cudaArraySparseProperties.flags` - contains :py:obj:`~.cudaArraySparsePropertiesSingleMipTail`, then - :py:obj:`~.cudaArraySparseProperties.miptailSize` specifies the size of - the mip tail of all layers combined. Otherwise, - :py:obj:`~.cudaArraySparseProperties.miptailSize` specifies mip tail - size per layer. The returned value of - :py:obj:`~.cudaArraySparseProperties.miptailFirstLevel` is valid only - if :py:obj:`~.cudaArraySparseProperties.miptailSize` is non-zero. 
- - Parameters - ---------- - mipmap : :py:obj:`~.cudaMipmappedArray_t` - The CUDA mipmapped array to get the sparse properties of - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue` - sparseProperties : :py:obj:`~.cudaArraySparseProperties` - Pointer to return :py:obj:`~.cudaArraySparseProperties` - - See Also - -------- - :py:obj:`~.cudaArrayGetSparseProperties`, :py:obj:`~.cuMemMapArrayAsync` - """ + """""" cdef cyruntime.cudaMipmappedArray_t cymipmap if mipmap is None: pmipmap = 0 @@ -28065,7 +27954,47 @@ def cudaMipmappedArrayGetSparseProperties(mipmap): @cython.embedsignature(True) def cudaMemcpy(dst, src, size_t count, kind not None : cudaMemcpyKind): - """ Copies data between host and device. + """ Returns the layout properties of a sparse CUDA array. + + Returns the layout properties of a sparse CUDA array in + `sparseProperties`. If the CUDA array is not allocated with flag + :py:obj:`~.cudaArraySparse` :py:obj:`~.cudaErrorInvalidValue` will be + returned. + + If the returned value in :py:obj:`~.cudaArraySparseProperties.flags` + contains :py:obj:`~.cudaArraySparsePropertiesSingleMipTail`, then + :py:obj:`~.cudaArraySparseProperties.miptailSize` represents the total + size of the array. Otherwise, it will be zero. Also, the returned value + in :py:obj:`~.cudaArraySparseProperties.miptailFirstLevel` is always + zero. Note that the `array` must have been allocated using + :py:obj:`~.cudaMallocArray` or :py:obj:`~.cudaMalloc3DArray`. For CUDA + arrays obtained using :py:obj:`~.cudaMipmappedArrayGetLevel`, + :py:obj:`~.cudaErrorInvalidValue` will be returned. Instead, + :py:obj:`~.cudaMipmappedArrayGetSparseProperties` must be used to + obtain the sparse properties of the entire CUDA mipmapped array to + which `array` belongs to. + + Returns the layout properties of a sparse CUDA mipmapped array + + Returns the sparse array layout properties in `sparseProperties`. 
If + the CUDA mipmapped array is not allocated with flag + :py:obj:`~.cudaArraySparse` :py:obj:`~.cudaErrorInvalidValue` will be + returned. + + For non-layered CUDA mipmapped arrays, + :py:obj:`~.cudaArraySparseProperties.miptailSize` returns the size of + the mip tail region. The mip tail region includes all mip levels whose + width, height or depth is less than that of the tile. For layered CUDA + mipmapped arrays, if :py:obj:`~.cudaArraySparseProperties.flags` + contains :py:obj:`~.cudaArraySparsePropertiesSingleMipTail`, then + :py:obj:`~.cudaArraySparseProperties.miptailSize` specifies the size of + the mip tail of all layers combined. Otherwise, + :py:obj:`~.cudaArraySparseProperties.miptailSize` specifies mip tail + size per layer. The returned value of + :py:obj:`~.cudaArraySparseProperties.miptailFirstLevel` is valid only + if :py:obj:`~.cudaArraySparseProperties.miptailSize` is non-zero. + + Copies data between host and device Copies `count` bytes from the memory area pointed to by `src` to the memory area pointed to by `dst`, where `kind` specifies the direction @@ -28083,22 +28012,28 @@ def cudaMemcpy(dst, src, size_t count, kind not None : cudaMemcpyKind): Parameters ---------- - dst : Any - Destination memory address - src : Any - Source memory address - count : size_t - Size in bytes to copy - kind : :py:obj:`~.cudaMemcpyKind` - Type of transfer + sparseProperties : Any + Pointer to return the :py:obj:`~.cudaArraySparseProperties` + array : Any + The CUDA array to get the sparse properties of + sparseProperties : size_t + Pointer to return :py:obj:`~.cudaArraySparseProperties` + mipmap : :py:obj:`~.cudaMemcpyKind` + The CUDA mipmapped array to get the sparse properties of Returns ------- cudaError_t + :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection` See Also -------- + 
:py:obj:`~.cudaMipmappedArrayGetSparseProperties`, :py:obj:`~.cuMemMapArrayAsync` + + :py:obj:`~.cudaArrayGetSparseProperties`, :py:obj:`~.cuMemMapArrayAsync` + :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpy` """ cdef _HelperInputVoidPtrStruct cydstHelper @@ -28740,7 +28675,7 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[cudaMemcpy3DBa CUDA array. For CUDA array to CUDA array copies, the element size of the two CUDA arrays must match. - For a given operand, if :py:obj:`~.cudaMemcpy3DOperand`::type is + For a given operand, if :py:obj:`~.cudaMemcpy3DOperand.type` is specified as :py:obj:`~.cudaMemcpyOperandTypePointer`, then :py:obj:`~.cudaMemcpy3DOperand`::op::ptr will be used. 
The :py:obj:`~.cudaMemcpy3DOperand`::op::ptr::ptr field must contain the @@ -29342,7 +29277,7 @@ def cudaMemset3D(pitchedDevPtr not None : cudaPitchedPtr, int value, extent not See Also -------- - :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent` + :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, make_cudaPitchedPtr, make_cudaExtent """ with nogil: err = cyruntime.cudaMemset3D(pitchedDevPtr._pvt_ptr[0], value, extent._pvt_ptr[0]) @@ -29519,7 +29454,7 @@ def cudaMemset3DAsync(pitchedDevPtr not None : cudaPitchedPtr, int value, extent See Also -------- - :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent` + :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMalloc3D`, make_cudaPitchedPtr, make_cudaExtent """ cdef cyruntime.cudaStream_t cystream if stream is None: @@ -33621,61 +33556,7 @@ def cudaGraphAddMemcpyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t @cython.embedsignature(True) def cudaGraphAddMemcpyNode1D(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, dst, src, size_t count, kind not None : cudaMemcpyKind): - """ Creates a 1D memcpy node and adds it to a graph. - - Creates a new 1D memcpy node and adds it to `graph` with - `numDependencies` dependencies specified via `pDependencies`. It is - possible for `numDependencies` to be 0, in which case the node will be - placed at the root of the graph. 
`pDependencies` may not have any - duplicate entries. A handle to the new node will be returned in - `pGraphNode`. - - When the graph is launched, the node will copy `count` bytes from the - memory area pointed to by `src` to the memory area pointed to by `dst`, - where `kind` specifies the direction of the copy, and must be one of - :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`, - :py:obj:`~.cudaMemcpyDeviceToHost`, - :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`. - Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the - type of transfer is inferred from the pointer values. However, - :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support - unified virtual addressing. Launching a memcpy node with dst and src - pointers that do not match the direction of the copy results in an - undefined behavior. - - Memcpy nodes have some additional restrictions with regards to managed - memory, if the system contains at least one device which has a zero - value for the device attribute - :py:obj:`~.cudaDevAttrConcurrentManagedAccess`. 
- - Parameters - ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which to add the node - pDependencies : list[:py:obj:`~.cudaGraphNode_t`] - Dependencies of the node - numDependencies : size_t - Number of dependencies - dst : Any - Destination memory address - src : Any - Source memory address - count : size_t - Size in bytes to copy - kind : :py:obj:`~.cudaMemcpyKind` - Type of transfer - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - pGraphNode : :py:obj:`~.cudaGraphNode_t` - Returns newly created node - - See Also - -------- - :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams1D`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemsetNode` - """ + """""" pDependencies = [] if pDependencies is None else pDependencies if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies): raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") @@ -33718,24 +33599,111 @@ def cudaGraphAddMemcpyNode1D(graph, pDependencies : Optional[tuple[cudaGraphNode @cython.embedsignature(True) def cudaGraphMemcpyNodeGetParams(node): - """ Returns a memcpy node's parameters. + """ Creates a memcpy node to copy to a symbol on the device and adds it to a graph. + + Creates a new memcpy node to copy to `symbol` and adds it to `graph` + with `numDependencies` dependencies specified via `pDependencies`. It + is possible for `numDependencies` to be 0, in which case the node will + be placed at the root of the graph. 
`pDependencies` may not have any + duplicate entries. A handle to the new node will be returned in + `pGraphNode`. + + When the graph is launched, the node will copy `count` bytes from the + memory area pointed to by `src` to the memory area pointed to by + `offset` bytes from the start of symbol `symbol`. The memory areas may + not overlap. `symbol` is a variable that resides in global or constant + memory space. `kind` can be either :py:obj:`~.cudaMemcpyHostToDevice`, + :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`. + Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the + type of transfer is inferred from the pointer values. However, + :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support + unified virtual addressing. + + Memcpy nodes have some additional restrictions with regards to managed + memory, if the system contains at least one device which has a zero + value for the device attribute + :py:obj:`~.cudaDevAttrConcurrentManagedAccess`. + + Creates a memcpy node to copy from a symbol on the device and adds it + to a graph + + Creates a new memcpy node to copy from `symbol` and adds it to `graph` + with `numDependencies` dependencies specified via `pDependencies`. It + is possible for `numDependencies` to be 0, in which case the node will + be placed at the root of the graph. `pDependencies` may not have any + duplicate entries. A handle to the new node will be returned in + `pGraphNode`. + + When the graph is launched, the node will copy `count` bytes from the + memory area pointed to by `offset` bytes from the start of symbol + `symbol` to the memory area pointed to by `dst`. The memory areas may + not overlap. `symbol` is a variable that resides in global or constant + memory space. `kind` can be either :py:obj:`~.cudaMemcpyDeviceToHost`, + :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`. 
+ Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the + type of transfer is inferred from the pointer values. However, + :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support + unified virtual addressing. + + Memcpy nodes have some additional restrictions with regards to managed + memory, if the system contains at least one device which has a zero + value for the device attribute + :py:obj:`~.cudaDevAttrConcurrentManagedAccess`. + + Creates a 1D memcpy node and adds it to a graph + + Creates a new 1D memcpy node and adds it to `graph` with + `numDependencies` dependencies specified via `pDependencies`. It is + possible for `numDependencies` to be 0, in which case the node will be + placed at the root of the graph. `pDependencies` may not have any + duplicate entries. A handle to the new node will be returned in + `pGraphNode`. + + When the graph is launched, the node will copy `count` bytes from the + memory area pointed to by `src` to the memory area pointed to by `dst`, + where `kind` specifies the direction of the copy, and must be one of + :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`, + :py:obj:`~.cudaMemcpyDeviceToHost`, + :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`. + Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the + type of transfer is inferred from the pointer values. However, + :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support + unified virtual addressing. Launching a memcpy node with dst and src + pointers that do not match the direction of the copy results in an + undefined behavior. + + Memcpy nodes have some additional restrictions with regards to managed + memory, if the system contains at least one device which has a zero + value for the device attribute + :py:obj:`~.cudaDevAttrConcurrentManagedAccess`. + + Returns a memcpy node's parameters Returns the parameters of memcpy node `node` in `pNodeParams`. 
Parameters ---------- - node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to get the parameters for + pGraphNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` + Returns newly created node Returns ------- cudaError_t :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - pNodeParams : :py:obj:`~.cudaMemcpy3DParms` - Pointer to return the parameters + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + graph : :py:obj:`~.cudaMemcpy3DParms` + Graph to which to add the node See Also -------- + :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNodeFromSymbol`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsToSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsFromSymbol`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemsetNode` + + :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNodeToSymbol`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsFromSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsToSymbol`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemsetNode` + + :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams1D`, :py:obj:`~.cudaGraphCreate`, 
:py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemsetNode` + :py:obj:`~.cudaGraphNodeGetParams`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeSetParams` """ cdef cyruntime.cudaGraphNode_t cynode @@ -33796,46 +33764,7 @@ def cudaGraphMemcpyNodeSetParams(node, pNodeParams : Optional[cudaMemcpy3DParms] @cython.embedsignature(True) def cudaGraphMemcpyNodeSetParams1D(node, dst, src, size_t count, kind not None : cudaMemcpyKind): - """ Sets a memcpy node's parameters to perform a 1-dimensional copy. - - Sets the parameters of memcpy node `node` to the copy described by the - provided parameters. - - When the graph is launched, the node will copy `count` bytes from the - memory area pointed to by `src` to the memory area pointed to by `dst`, - where `kind` specifies the direction of the copy, and must be one of - :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`, - :py:obj:`~.cudaMemcpyDeviceToHost`, - :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`. - Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the - type of transfer is inferred from the pointer values. However, - :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support - unified virtual addressing. Launching a memcpy node with dst and src - pointers that do not match the direction of the copy results in an - undefined behavior. 
- - Parameters - ---------- - node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to set the parameters for - dst : Any - Destination memory address - src : Any - Source memory address - count : size_t - Size in bytes to copy - kind : :py:obj:`~.cudaMemcpyKind` - Type of transfer - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - - See Also - -------- - :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams` - """ + """""" cdef cyruntime.cudaGraphNode_t cynode if node is None: pnode = 0 @@ -33860,7 +33789,57 @@ def cudaGraphMemcpyNodeSetParams1D(node, dst, src, size_t count, kind not None : @cython.embedsignature(True) def cudaGraphAddMemsetNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, pMemsetParams : Optional[cudaMemsetParams]): - """ Creates a memset node and adds it to a graph. + """ Sets a memcpy node's parameters to copy to a symbol on the device. + + Sets the parameters of memcpy node `node` to the copy described by the + provided parameters. + + When the graph is launched, the node will copy `count` bytes from the + memory area pointed to by `src` to the memory area pointed to by + `offset` bytes from the start of symbol `symbol`. The memory areas may + not overlap. `symbol` is a variable that resides in global or constant + memory space. `kind` can be either :py:obj:`~.cudaMemcpyHostToDevice`, + :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`. + Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the + type of transfer is inferred from the pointer values. However, + :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support + unified virtual addressing. 
+ + Sets a memcpy node's parameters to copy from a symbol on the device + + Sets the parameters of memcpy node `node` to the copy described by the + provided parameters. + + When the graph is launched, the node will copy `count` bytes from the + memory area pointed to by `offset` bytes from the start of symbol + `symbol` to the memory area pointed to by `dst`. The memory areas may + not overlap. `symbol` is a variable that resides in global or constant + memory space. `kind` can be either :py:obj:`~.cudaMemcpyDeviceToHost`, + :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`. + Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the + type of transfer is inferred from the pointer values. However, + :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support + unified virtual addressing. + + Sets a memcpy node's parameters to perform a 1-dimensional copy + + Sets the parameters of memcpy node `node` to the copy described by the + provided parameters. + + When the graph is launched, the node will copy `count` bytes from the + memory area pointed to by `src` to the memory area pointed to by `dst`, + where `kind` specifies the direction of the copy, and must be one of + :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`, + :py:obj:`~.cudaMemcpyDeviceToHost`, + :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`. + Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the + type of transfer is inferred from the pointer values. However, + :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support + unified virtual addressing. Launching a memcpy node with dst and src + pointers that do not match the direction of the copy results in an + undefined behavior. + + Creates a memset node and adds it to a graph Creates a new memset node and adds it to `graph` with `numDependencies` dependencies specified via `pDependencies`. 
It is possible for @@ -33873,24 +33852,33 @@ def cudaGraphAddMemsetNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t Parameters ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which to add the node - pDependencies : list[:py:obj:`~.cudaGraphNode_t`] - Dependencies of the node - numDependencies : size_t - Number of dependencies - pMemsetParams : :py:obj:`~.cudaMemsetParams` - Parameters for the memory set + symbol : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` + Device symbol address + src : list[:py:obj:`~.cudaGraphNode_t`] + Source memory address + count : size_t + Size in bytes to copy + offset : :py:obj:`~.cudaMemsetParams` + Offset from start of symbol in bytes Returns ------- cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice` - pGraphNode : :py:obj:`~.cudaGraphNode_t` - Returns newly created node + node : :py:obj:`~.cudaGraphNode_t` + Node to set the parameters for See Also -------- + :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsFromSymbol`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams` + + :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsToSymbol`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams` + + :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams` + :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaGraphMemsetNodeGetParams`, :py:obj:`~.cudaGraphMemsetNodeSetParams`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, 
:py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemcpyNode` """ pDependencies = [] if pDependencies is None else pDependencies @@ -34354,42 +34342,7 @@ def cudaGraphAddEmptyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] @cython.embedsignature(True) def cudaGraphAddEventRecordNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, event): - """ Creates an event record node and adds it to a graph. - - Creates a new event record node and adds it to `hGraph` with - `numDependencies` dependencies specified via `dependencies` and event - specified in `event`. It is possible for `numDependencies` to be 0, in - which case the node will be placed at the root of the graph. - `dependencies` may not have any duplicate entries. A handle to the new - node will be returned in `phGraphNode`. - - Each launch of the graph will record `event` to capture execution of - the node's dependencies. - - These nodes may not be used in loops or conditionals. 
- - Parameters - ---------- - hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which to add the node - dependencies : list[:py:obj:`~.cudaGraphNode_t`] - Dependencies of the node - numDependencies : size_t - Number of dependencies - event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Event for the node - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - phGraphNode : :py:obj:`~.cudaGraphNode_t` - Returns newly created node - - See Also - -------- - :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` - """ + """""" cdef cyruntime.cudaEvent_t cyevent if event is None: pevent = 0 @@ -34434,26 +34387,7 @@ def cudaGraphAddEventRecordNode(graph, pDependencies : Optional[tuple[cudaGraphN @cython.embedsignature(True) def cudaGraphEventRecordNodeGetEvent(node): - """ Returns the event associated with an event record node. - - Returns the event of event record node `hNode` in `event_out`. 
- - Parameters - ---------- - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to get the event for - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - event_out : :py:obj:`~.cudaEvent_t` - Pointer to return the event - - See Also - -------- - :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent` - """ + """""" cdef cyruntime.cudaGraphNode_t cynode if node is None: pnode = 0 @@ -34474,26 +34408,7 @@ def cudaGraphEventRecordNodeGetEvent(node): @cython.embedsignature(True) def cudaGraphEventRecordNodeSetEvent(node, event): - """ Sets an event record node's event. - - Sets the event of event record node `hNode` to `event`. - - Parameters - ---------- - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to set the event for - event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Event to use - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - - See Also - -------- - :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent` - """ + """""" cdef cyruntime.cudaEvent_t cyevent if event is None: pevent = 0 @@ -34519,45 +34434,7 @@ def cudaGraphEventRecordNodeSetEvent(node, event): @cython.embedsignature(True) def cudaGraphAddEventWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, event): - """ Creates an event wait node and adds it to a graph. - - Creates a new event wait node and adds it to `hGraph` with - `numDependencies` dependencies specified via `dependencies` and event - specified in `event`. 
It is possible for `numDependencies` to be 0, in - which case the node will be placed at the root of the graph. - `dependencies` may not have any duplicate entries. A handle to the new - node will be returned in `phGraphNode`. - - The graph node will wait for all work captured in `event`. See - :py:obj:`~.cuEventRecord()` for details on what is captured by an - event. The synchronization will be performed efficiently on the device - when applicable. `event` may be from a different context or device than - the launch stream. - - These nodes may not be used in loops or conditionals. - - Parameters - ---------- - hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which to add the node - dependencies : list[:py:obj:`~.cudaGraphNode_t`] - Dependencies of the node - numDependencies : size_t - Number of dependencies - event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Event for the node - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - phGraphNode : :py:obj:`~.cudaGraphNode_t` - Returns newly created node - - See Also - -------- - :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` - """ + """""" cdef cyruntime.cudaEvent_t cyevent if event is None: pevent = 0 @@ -34602,26 +34479,7 @@ def cudaGraphAddEventWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNod @cython.embedsignature(True) def cudaGraphEventWaitNodeGetEvent(node): - """ Returns the event associated with an event wait node. - - Returns the event of event wait node `hNode` in `event_out`. 
- - Parameters - ---------- - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to get the event for - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - event_out : :py:obj:`~.cudaEvent_t` - Pointer to return the event - - See Also - -------- - :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent` - """ + """""" cdef cyruntime.cudaGraphNode_t cynode if node is None: pnode = 0 @@ -34642,26 +34500,7 @@ def cudaGraphEventWaitNodeGetEvent(node): @cython.embedsignature(True) def cudaGraphEventWaitNodeSetEvent(node, event): - """ Sets an event wait node's event. - - Sets the event of event wait node `hNode` to `event`. - - Parameters - ---------- - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to set the event for - event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Event to use - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - - See Also - -------- - :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent` - """ + """""" cdef cyruntime.cudaEvent_t cyevent if event is None: pevent = 0 @@ -34687,41 +34526,7 @@ def cudaGraphEventWaitNodeSetEvent(node, event): @cython.embedsignature(True) def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, nodeParams : Optional[cudaExternalSemaphoreSignalNodeParams]): - """ Creates an external semaphore signal node and adds it to a graph. 
- - Creates a new external semaphore signal node and adds it to `graph` - with `numDependencies` dependencies specified via `dependencies` and - arguments specified in `nodeParams`. It is possible for - `numDependencies` to be 0, in which case the node will be placed at the - root of the graph. `dependencies` may not have any duplicate entries. A - handle to the new node will be returned in `pGraphNode`. - - Performs a signal operation on a set of externally allocated semaphore - objects when the node is launched. The operation(s) will occur after - all of the node's dependencies have completed. - - Parameters - ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which to add the node - pDependencies : list[:py:obj:`~.cudaGraphNode_t`] - Dependencies of the node - numDependencies : size_t - Number of dependencies - nodeParams : :py:obj:`~.cudaExternalSemaphoreSignalNodeParams` - Parameters for the node - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - pGraphNode : :py:obj:`~.cudaGraphNode_t` - Returns newly created node - - See Also - -------- - :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeGetParams`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` - """ + """""" pDependencies = [] if pDependencies is None else pDependencies if not all(isinstance(_x, 
(cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies): raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") @@ -34759,32 +34564,7 @@ def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[tup @cython.embedsignature(True) def cudaGraphExternalSemaphoresSignalNodeGetParams(hNode): - """ Returns an external semaphore signal node's parameters. - - Returns the parameters of an external semaphore signal node `hNode` in - `params_out`. The `extSemArray` and `paramsArray` returned in - `params_out`, are owned by the node. This memory remains valid until - the node is destroyed or its parameters are modified, and should not be - modified directly. Use - :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams` to update - the parameters of this node. - - Parameters - ---------- - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to get the parameters for - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - params_out : :py:obj:`~.cudaExternalSemaphoreSignalNodeParams` - Pointer to return the parameters - - See Also - -------- - :py:obj:`~.cudaGraphNodeGetParams`, :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` - """ + """""" cdef cyruntime.cudaGraphNode_t cyhNode if hNode is None: phNode = 0 @@ -34805,27 +34585,7 @@ def cudaGraphExternalSemaphoresSignalNodeGetParams(hNode): @cython.embedsignature(True) def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[cudaExternalSemaphoreSignalNodeParams]): - """ Sets an external semaphore signal node's parameters. 
- - Sets the parameters of an external semaphore signal node `hNode` to - `nodeParams`. - - Parameters - ---------- - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to set the parameters for - nodeParams : :py:obj:`~.cudaExternalSemaphoreSignalNodeParams` - Parameters to copy - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - - See Also - -------- - :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` - """ + """""" cdef cyruntime.cudaGraphNode_t cyhNode if hNode is None: phNode = 0 @@ -34844,41 +34604,7 @@ def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[ @cython.embedsignature(True) def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, nodeParams : Optional[cudaExternalSemaphoreWaitNodeParams]): - """ Creates an external semaphore wait node and adds it to a graph. - - Creates a new external semaphore wait node and adds it to `graph` with - `numDependencies` dependencies specified via `dependencies` and - arguments specified in `nodeParams`. It is possible for - `numDependencies` to be 0, in which case the node will be placed at the - root of the graph. `dependencies` may not have any duplicate entries. A - handle to the new node will be returned in `pGraphNode`. - - Performs a wait operation on a set of externally allocated semaphore - objects when the node is launched. The node's dependencies will not be - launched until the wait operation has completed. 
- - Parameters - ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which to add the node - pDependencies : list[:py:obj:`~.cudaGraphNode_t`] - Dependencies of the node - numDependencies : size_t - Number of dependencies - nodeParams : :py:obj:`~.cudaExternalSemaphoreWaitNodeParams` - Parameters for the node - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - pGraphNode : :py:obj:`~.cudaGraphNode_t` - Returns newly created node - - See Also - -------- - :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeGetParams`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` - """ + """""" pDependencies = [] if pDependencies is None else pDependencies if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies): raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") @@ -34916,32 +34642,7 @@ def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[tuple @cython.embedsignature(True) def cudaGraphExternalSemaphoresWaitNodeGetParams(hNode): - """ Returns an external semaphore wait node's parameters. - - Returns the parameters of an external semaphore wait node `hNode` in - `params_out`. 
The `extSemArray` and `paramsArray` returned in - `params_out`, are owned by the node. This memory remains valid until - the node is destroyed or its parameters are modified, and should not be - modified directly. Use - :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams` to update - the parameters of this node. - - Parameters - ---------- - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to get the parameters for - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - params_out : :py:obj:`~.cudaExternalSemaphoreWaitNodeParams` - Pointer to return the parameters - - See Also - -------- - :py:obj:`~.cudaGraphNodeGetParams`, :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` - """ + """""" cdef cyruntime.cudaGraphNode_t cyhNode if hNode is None: phNode = 0 @@ -34962,27 +34663,7 @@ def cudaGraphExternalSemaphoresWaitNodeGetParams(hNode): @cython.embedsignature(True) def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cudaExternalSemaphoreWaitNodeParams]): - """ Sets an external semaphore wait node's parameters. - - Sets the parameters of an external semaphore wait node `hNode` to - `nodeParams`. 
- - Parameters - ---------- - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to set the parameters for - nodeParams : :py:obj:`~.cudaExternalSemaphoreWaitNodeParams` - Parameters to copy - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - - See Also - -------- - :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` - """ + """""" cdef cyruntime.cudaGraphNode_t cyhNode if hNode is None: phNode = 0 @@ -35001,80 +34682,7 @@ def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cu @cython.embedsignature(True) def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, nodeParams : Optional[cudaMemAllocNodeParams]): - """ Creates an allocation node and adds it to a graph. - - Creates a new allocation node and adds it to `graph` with - `numDependencies` dependencies specified via `pDependencies` and - arguments specified in `nodeParams`. It is possible for - `numDependencies` to be 0, in which case the node will be placed at the - root of the graph. `pDependencies` may not have any duplicate entries. - A handle to the new node will be returned in `pGraphNode`. - - When :py:obj:`~.cudaGraphAddMemAllocNode` creates an allocation node, - it returns the address of the allocation in `nodeParams.dptr`. The - allocation's address remains fixed across instantiations and launches. - - If the allocation is freed in the same graph, by creating a free node - using :py:obj:`~.cudaGraphAddMemFreeNode`, the allocation can be - accessed by nodes ordered after the allocation node but before the free - node. 
These allocations cannot be freed outside the owning graph, and - they can only be freed once in the owning graph. - - If the allocation is not freed in the same graph, then it can be - accessed not only by nodes in the graph which are ordered after the - allocation node, but also by stream operations ordered after the - graph's execution but before the allocation is freed. - - Allocations which are not freed in the same graph can be freed by: - - - passing the allocation to :py:obj:`~.cudaMemFreeAsync` or - :py:obj:`~.cudaMemFree`; - - - launching a graph with a free node for that allocation; or - - - specifying :py:obj:`~.cudaGraphInstantiateFlagAutoFreeOnLaunch` - during instantiation, which makes each launch behave as though it - called :py:obj:`~.cudaMemFreeAsync` for every unfreed allocation. - - It is not possible to free an allocation in both the owning graph and - another graph. If the allocation is freed in the same graph, a free - node cannot be added to another graph. If the allocation is freed in - another graph, a free node can no longer be added to the owning graph. - - The following restrictions apply to graphs which contain allocation - and/or memory free nodes: - - - Nodes and edges of the graph cannot be deleted. - - - The graph can only be used in a child node if the ownership is moved - to the parent. - - - Only one instantiation of the graph may exist at any point in time. - - - The graph cannot be cloned. 
- - Parameters - ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which to add the node - pDependencies : list[:py:obj:`~.cudaGraphNode_t`] - Dependencies of the node - numDependencies : size_t - Number of dependencies - nodeParams : :py:obj:`~.cudaMemAllocNodeParams` - Parameters for the node - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory` - pGraphNode : :py:obj:`~.cudaGraphNode_t` - Returns newly created node - - See Also - -------- - :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaGraphMemAllocNodeGetParams`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` - """ + """""" pDependencies = [] if pDependencies is None else pDependencies if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies): raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") @@ -35112,29 +34720,7 @@ def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[tuple[cudaGraphNode @cython.embedsignature(True) def cudaGraphMemAllocNodeGetParams(node): - """ Returns a memory alloc node's parameters. 
- - Returns the parameters of a memory alloc node `hNode` in `params_out`. - The `poolProps` and `accessDescs` returned in `params_out`, are owned - by the node. This memory remains valid until the node is destroyed. The - returned parameters must not be modified. - - Parameters - ---------- - node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to get the parameters for - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - params_out : :py:obj:`~.cudaMemAllocNodeParams` - Pointer to return the parameters - - See Also - -------- - :py:obj:`~.cudaGraphNodeGetParams`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams` - """ + """""" cdef cyruntime.cudaGraphNode_t cynode if node is None: pnode = 0 @@ -35155,7 +34741,269 @@ def cudaGraphMemAllocNodeGetParams(node): @cython.embedsignature(True) def cudaGraphAddMemFreeNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, dptr): - """ Creates a memory free node and adds it to a graph. 
+ """""" + pDependencies = [] if pDependencies is None else pDependencies + if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies): + raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") + cdef cyruntime.cudaGraph_t cygraph + if graph is None: + pgraph = 0 + elif isinstance(graph, (cudaGraph_t,driver.CUgraph)): + pgraph = int(graph) + else: + pgraph = int(cudaGraph_t(graph)) + cygraph = pgraph + cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() + cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL + if len(pDependencies) > 1: + cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) + if cypDependencies is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) + else: + for idx in range(len(pDependencies)): + cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr + if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) + cdef _HelperInputVoidPtrStruct cydptrHelper + cdef void* cydptr = _helper_input_void_ptr(dptr, &cydptrHelper) + with nogil: + err = cyruntime.cudaGraphAddMemFreeNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cydptr) + if len(pDependencies) > 1 and cypDependencies is not NULL: + free(cypDependencies) + _helper_input_void_ptr_free(&cydptrHelper) + if err != cyruntime.cudaSuccess: + return (_cudaError_t(err), None) + return (_cudaError_t_SUCCESS, pGraphNode) +{{endif}} + +{{if 'cudaGraphMemFreeNodeGetParams' in found_functions}} + +@cython.embedsignature(True) +def cudaGraphMemFreeNodeGetParams(node): + """""" + cdef cyruntime.cudaGraphNode_t cynode + if node is None: + pnode = 0 + elif 
isinstance(node, (cudaGraphNode_t,driver.CUgraphNode)): + pnode = int(node) + else: + pnode = int(cudaGraphNode_t(node)) + cynode = pnode + cdef void_ptr dptr_out = 0 + cdef void* cydptr_out_ptr = &dptr_out + with nogil: + err = cyruntime.cudaGraphMemFreeNodeGetParams(cynode, cydptr_out_ptr) + if err != cyruntime.cudaSuccess: + return (_cudaError_t(err), None) + return (_cudaError_t_SUCCESS, dptr_out) +{{endif}} + +{{if 'cudaDeviceGraphMemTrim' in found_functions}} + +@cython.embedsignature(True) +def cudaDeviceGraphMemTrim(int device): + """""" + with nogil: + err = cyruntime.cudaDeviceGraphMemTrim(device) + return (_cudaError_t(err),) +{{endif}} + +{{if 'cudaDeviceGetGraphMemAttribute' in found_functions}} + +@cython.embedsignature(True) +def cudaDeviceGetGraphMemAttribute(int device, attr not None : cudaGraphMemAttributeType): + """""" + cdef cyruntime.cudaGraphMemAttributeType cyattr = int(attr) + cdef _HelperCUgraphMem_attribute cyvalue = _HelperCUgraphMem_attribute(attr, 0, is_getter=True) + cdef void* cyvalue_ptr = cyvalue.cptr + with nogil: + err = cyruntime.cudaDeviceGetGraphMemAttribute(device, cyattr, cyvalue_ptr) + if err != cyruntime.cudaSuccess: + return (_cudaError_t(err), None) + return (_cudaError_t_SUCCESS, cyvalue.pyObj()) +{{endif}} + +{{if 'cudaDeviceSetGraphMemAttribute' in found_functions}} + +@cython.embedsignature(True) +def cudaDeviceSetGraphMemAttribute(int device, attr not None : cudaGraphMemAttributeType, value): + """""" + cdef cyruntime.cudaGraphMemAttributeType cyattr = int(attr) + cdef _HelperCUgraphMem_attribute cyvalue = _HelperCUgraphMem_attribute(attr, value, is_getter=False) + cdef void* cyvalue_ptr = cyvalue.cptr + with nogil: + err = cyruntime.cudaDeviceSetGraphMemAttribute(device, cyattr, cyvalue_ptr) + return (_cudaError_t(err),) +{{endif}} + +{{if 'cudaGraphClone' in found_functions}} + +@cython.embedsignature(True) +def cudaGraphClone(originalGraph): + """ Creates an event record node and adds it to a graph. 
+ + Creates a new event record node and adds it to `hGraph` with + `numDependencies` dependencies specified via `dependencies` and event + specified in `event`. It is possible for `numDependencies` to be 0, in + which case the node will be placed at the root of the graph. + `dependencies` may not have any duplicate entries. A handle to the new + node will be returned in `phGraphNode`. + + Each launch of the graph will record `event` to capture execution of + the node's dependencies. + + These nodes may not be used in loops or conditionals. + + Returns the event associated with an event record node + + Returns the event of event record node `hNode` in `event_out`. + + Sets an event record node's event + + Sets the event of event record node `hNode` to `event`. + + Creates an event wait node and adds it to a graph + + Creates a new event wait node and adds it to `hGraph` with + `numDependencies` dependencies specified via `dependencies` and event + specified in `event`. It is possible for `numDependencies` to be 0, in + which case the node will be placed at the root of the graph. + `dependencies` may not have any duplicate entries. A handle to the new + node will be returned in `phGraphNode`. + + The graph node will wait for all work captured in `event`. See + :py:obj:`~.cuEventRecord()` for details on what is captured by an + event. The synchronization will be performed efficiently on the device + when applicable. `event` may be from a different context or device than + the launch stream. + + These nodes may not be used in loops or conditionals. + + Returns the event associated with an event wait node + + Returns the event of event wait node `hNode` in `event_out`. + + Sets an event wait node's event + + Sets the event of event wait node `hNode` to `event`. 
+ + Creates an external semaphore signal node and adds it to a graph + + Creates a new external semaphore signal node and adds it to `graph` + with `numDependencies` dependencies specified via `dependencies` and + arguments specified in `nodeParams`. It is possible for + `numDependencies` to be 0, in which case the node will be placed at the + root of the graph. `dependencies` may not have any duplicate entries. A + handle to the new node will be returned in `pGraphNode`. + + Performs a signal operation on a set of externally allocated semaphore + objects when the node is launched. The operation(s) will occur after + all of the node's dependencies have completed. + + Returns an external semaphore signal node's parameters + + Returns the parameters of an external semaphore signal node `hNode` in + `params_out`. The `extSemArray` and `paramsArray` returned in + `params_out`, are owned by the node. This memory remains valid until + the node is destroyed or its parameters are modified, and should not be + modified directly. Use + :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams` to update + the parameters of this node. + + Sets an external semaphore signal node's parameters + + Sets the parameters of an external semaphore signal node `hNode` to + `nodeParams`. + + Creates an external semaphore wait node and adds it to a graph + + Creates a new external semaphore wait node and adds it to `graph` with + `numDependencies` dependencies specified via `dependencies` and + arguments specified in `nodeParams`. It is possible for + `numDependencies` to be 0, in which case the node will be placed at the + root of the graph. `dependencies` may not have any duplicate entries. A + handle to the new node will be returned in `pGraphNode`. + + Performs a wait operation on a set of externally allocated semaphore + objects when the node is launched. The node's dependencies will not be + launched until the wait operation has completed. 
+ + Returns an external semaphore wait node's parameters + + Returns the parameters of an external semaphore wait node `hNode` in + `params_out`. The `extSemArray` and `paramsArray` returned in + `params_out`, are owned by the node. This memory remains valid until + the node is destroyed or its parameters are modified, and should not be + modified directly. Use + :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams` to update + the parameters of this node. + + Sets an external semaphore wait node's parameters + + Sets the parameters of an external semaphore wait node `hNode` to + `nodeParams`. + + Creates an allocation node and adds it to a graph + + Creates a new allocation node and adds it to `graph` with + `numDependencies` dependencies specified via `pDependencies` and + arguments specified in `nodeParams`. It is possible for + `numDependencies` to be 0, in which case the node will be placed at the + root of the graph. `pDependencies` may not have any duplicate entries. + A handle to the new node will be returned in `pGraphNode`. + + When :py:obj:`~.cudaGraphAddMemAllocNode` creates an allocation node, + it returns the address of the allocation in `nodeParams.dptr`. The + allocation's address remains fixed across instantiations and launches. + + If the allocation is freed in the same graph, by creating a free node + using :py:obj:`~.cudaGraphAddMemFreeNode`, the allocation can be + accessed by nodes ordered after the allocation node but before the free + node. These allocations cannot be freed outside the owning graph, and + they can only be freed once in the owning graph. + + If the allocation is not freed in the same graph, then it can be + accessed not only by nodes in the graph which are ordered after the + allocation node, but also by stream operations ordered after the + graph's execution but before the allocation is freed. 
+ + Allocations which are not freed in the same graph can be freed by: + + - passing the allocation to :py:obj:`~.cudaMemFreeAsync` or + :py:obj:`~.cudaMemFree`; + + - launching a graph with a free node for that allocation; or + + - specifying :py:obj:`~.cudaGraphInstantiateFlagAutoFreeOnLaunch` + during instantiation, which makes each launch behave as though it + called :py:obj:`~.cudaMemFreeAsync` for every unfreed allocation. + + It is not possible to free an allocation in both the owning graph and + another graph. If the allocation is freed in the same graph, a free + node cannot be added to another graph. If the allocation is freed in + another graph, a free node can no longer be added to the owning graph. + + The following restrictions apply to graphs which contain allocation + and/or memory free nodes: + + - Nodes and edges of the graph cannot be deleted. + + - The graph can only be used in a child node if the ownership is moved + to the parent. + + - Only one instantiation of the graph may exist at any point in time. + + - The graph cannot be cloned. + + Returns a memory alloc node's parameters + + Returns the parameters of a memory alloc node `hNode` in `params_out`. + The `poolProps` and `accessDescs` returned in `params_out`, are owned + by the node. This memory remains valid until the node is destroyed. The + returned parameters must not be modified. + + Creates a memory free node and adds it to a graph Creates a new memory free node and adds it to `graph` with `numDependencies` dependencies specified via `pDependencies` and @@ -35185,138 +35033,18 @@ def cudaGraphAddMemFreeNode(graph, pDependencies : Optional[tuple[cudaGraphNode_ - The graph cannot be cloned. 
- Parameters - ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which to add the node - pDependencies : list[:py:obj:`~.cudaGraphNode_t`] - Dependencies of the node - numDependencies : size_t - Number of dependencies - dptr : Any - Address of memory to free - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory` - pGraphNode : :py:obj:`~.cudaGraphNode_t` - Returns newly created node - - See Also - -------- - :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` - """ - pDependencies = [] if pDependencies is None else pDependencies - if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies): - raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") - cdef cyruntime.cudaGraph_t cygraph - if graph is None: - pgraph = 0 - elif isinstance(graph, (cudaGraph_t,driver.CUgraph)): - pgraph = int(graph) - else: - pgraph = int(cudaGraph_t(graph)) - cygraph = pgraph - cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() - cdef 
cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 1: - cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) - if cypDependencies is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - else: - for idx in range(len(pDependencies)): - cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] - elif len(pDependencies) == 1: - cypDependencies = (pDependencies[0])._pvt_ptr - if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef _HelperInputVoidPtrStruct cydptrHelper - cdef void* cydptr = _helper_input_void_ptr(dptr, &cydptrHelper) - with nogil: - err = cyruntime.cudaGraphAddMemFreeNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cydptr) - if len(pDependencies) > 1 and cypDependencies is not NULL: - free(cypDependencies) - _helper_input_void_ptr_free(&cydptrHelper) - if err != cyruntime.cudaSuccess: - return (_cudaError_t(err), None) - return (_cudaError_t_SUCCESS, pGraphNode) -{{endif}} - -{{if 'cudaGraphMemFreeNodeGetParams' in found_functions}} - -@cython.embedsignature(True) -def cudaGraphMemFreeNodeGetParams(node): - """ Returns a memory free node's parameters. + Returns a memory free node's parameters Returns the address of a memory free node `hNode` in `dptr_out`. 
- Parameters - ---------- - node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to get the parameters for - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - dptr_out : Any - Pointer to return the device address - - See Also - -------- - :py:obj:`~.cudaGraphNodeGetParams`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams` - """ - cdef cyruntime.cudaGraphNode_t cynode - if node is None: - pnode = 0 - elif isinstance(node, (cudaGraphNode_t,driver.CUgraphNode)): - pnode = int(node) - else: - pnode = int(cudaGraphNode_t(node)) - cynode = pnode - cdef void_ptr dptr_out = 0 - cdef void* cydptr_out_ptr = &dptr_out - with nogil: - err = cyruntime.cudaGraphMemFreeNodeGetParams(cynode, cydptr_out_ptr) - if err != cyruntime.cudaSuccess: - return (_cudaError_t(err), None) - return (_cudaError_t_SUCCESS, dptr_out) -{{endif}} - -{{if 'cudaDeviceGraphMemTrim' in found_functions}} - -@cython.embedsignature(True) -def cudaDeviceGraphMemTrim(int device): - """ Free unused memory that was cached on the specified device for use with graphs back to the OS. + Free unused memory that was cached on the specified device for use with + graphs back to the OS. Blocks which are not in use by a graph that is either currently executing or scheduled to execute are freed back to the operating system. - Parameters - ---------- - device : int - The device for which cached memory should be freed. 
- - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - - See Also - -------- - :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync` - """ - with nogil: - err = cyruntime.cudaDeviceGraphMemTrim(device) - return (_cudaError_t(err),) -{{endif}} - -{{if 'cudaDeviceGetGraphMemAttribute' in found_functions}} - -@cython.embedsignature(True) -def cudaDeviceGetGraphMemAttribute(int device, attr not None : cudaGraphMemAttributeType): - """ Query asynchronous allocation attributes related to graphs. + Query asynchronous allocation attributes related to graphs Valid attributes are: @@ -35335,39 +35063,7 @@ def cudaDeviceGetGraphMemAttribute(int device, attr not None : cudaGraphMemAttri memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. - Parameters - ---------- - device : int - Specifies the scope of the query - attr : :py:obj:`~.cudaGraphMemAttributeType` - attribute to get - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice` - value : Any - retrieved value - - See Also - -------- - :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync` - """ - cdef cyruntime.cudaGraphMemAttributeType cyattr = int(attr) - cdef _HelperCUgraphMem_attribute cyvalue = _HelperCUgraphMem_attribute(attr, 0, is_getter=True) - cdef void* cyvalue_ptr = cyvalue.cptr - with nogil: - err = cyruntime.cudaDeviceGetGraphMemAttribute(device, cyattr, cyvalue_ptr) - if err != cyruntime.cudaSuccess: - return (_cudaError_t(err), None) - return (_cudaError_t_SUCCESS, cyvalue.pyObj()) -{{endif}} - -{{if 'cudaDeviceSetGraphMemAttribute' in found_functions}} - 
-@cython.embedsignature(True) -def cudaDeviceSetGraphMemAttribute(int device, attr not None : cudaGraphMemAttributeType, value): - """ Set asynchronous allocation attributes related to graphs. + Set asynchronous allocation attributes related to graphs Valid attributes are: @@ -35379,37 +35075,7 @@ def cudaDeviceSetGraphMemAttribute(int device, attr not None : cudaGraphMemAttri memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. - Parameters - ---------- - device : int - Specifies the scope of the query - attr : :py:obj:`~.cudaGraphMemAttributeType` - attribute to get - value : Any - pointer to value to set - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice` - - See Also - -------- - :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync` - """ - cdef cyruntime.cudaGraphMemAttributeType cyattr = int(attr) - cdef _HelperCUgraphMem_attribute cyvalue = _HelperCUgraphMem_attribute(attr, value, is_getter=False) - cdef void* cyvalue_ptr = cyvalue.cptr - with nogil: - err = cyruntime.cudaDeviceSetGraphMemAttribute(device, cyattr, cyvalue_ptr) - return (_cudaError_t(err),) -{{endif}} - -{{if 'cudaGraphClone' in found_functions}} - -@cython.embedsignature(True) -def cudaGraphClone(originalGraph): - """ Clones a graph. + Clones a graph This function creates a copy of `originalGraph` and returns it in `pGraphClone`. All parameters are copied into the cloned graph. 
The @@ -35421,18 +35087,75 @@ def cudaGraphClone(originalGraph): Parameters ---------- - originalGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to clone + hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` + Graph to which to add the node Returns ------- cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice` :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation` - pGraphClone : :py:obj:`~.cudaGraph_t` - Returns newly created cloned graph + phGraphNode : :py:obj:`~.cudaGraph_t` + Returns newly created node See Also 
-------- + :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` + + :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent` + + :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent` + + :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` + + :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent` + + :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent` + + :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeGetParams`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaImportExternalSemaphore`, 
:py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` + + :py:obj:`~.cudaGraphNodeGetParams`, :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` + + :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` + + :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeGetParams`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` + + :py:obj:`~.cudaGraphNodeGetParams`, :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, 
:py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` + + :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` + + :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaGraphMemAllocNodeGetParams`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` + + :py:obj:`~.cudaGraphNodeGetParams`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams` + + :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode` + + :py:obj:`~.cudaGraphNodeGetParams`, 
:py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams` + + :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync` + + :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync` + + :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync` + :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphNodeFindInClone` Notes @@ -36453,6 +36176,27 @@ def cudaGraphInstantiate(graph, unsigned long long flags): @cython.embedsignature(True) def cudaGraphInstantiateWithFlags(graph, unsigned long long flags): + """""" + cdef cyruntime.cudaGraph_t cygraph + if graph is None: + pgraph = 0 + elif isinstance(graph, (cudaGraph_t,driver.CUgraph)): + pgraph = int(graph) + else: + pgraph = int(cudaGraph_t(graph)) + cygraph = pgraph + cdef cudaGraphExec_t pGraphExec = cudaGraphExec_t() + with nogil: + err = cyruntime.cudaGraphInstantiateWithFlags(pGraphExec._pvt_ptr, cygraph, flags) + if err != cyruntime.cudaSuccess: + return (_cudaError_t(err), None) + return (_cudaError_t_SUCCESS, pGraphExec) +{{endif}} + +{{if 'cudaGraphInstantiateWithParams' in found_functions}} + +@cython.embedsignature(True) +def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraphInstantiateParams]): """ Creates an executable graph from a graph. Instantiates `graph` as an executable graph. 
The graph is validated for @@ -36520,46 +36264,7 @@ def cudaGraphInstantiateWithFlags(graph, unsigned long long flags): - Both operands must be accessible from the current device, and the current device must match the device of other nodes in the graph. - Parameters - ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to instantiate - flags : unsigned long long - Flags to control instantiation. See - :py:obj:`~.CUgraphInstantiate_flags`. - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - pGraphExec : :py:obj:`~.cudaGraphExec_t` - Returns instantiated graph - - See Also - -------- - :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphUpload`, :py:obj:`~.cudaGraphLaunch`, :py:obj:`~.cudaGraphExecDestroy` - """ - cdef cyruntime.cudaGraph_t cygraph - if graph is None: - pgraph = 0 - elif isinstance(graph, (cudaGraph_t,driver.CUgraph)): - pgraph = int(graph) - else: - pgraph = int(cudaGraph_t(graph)) - cygraph = pgraph - cdef cudaGraphExec_t pGraphExec = cudaGraphExec_t() - with nogil: - err = cyruntime.cudaGraphInstantiateWithFlags(pGraphExec._pvt_ptr, cygraph, flags) - if err != cyruntime.cudaSuccess: - return (_cudaError_t(err), None) - return (_cudaError_t_SUCCESS, pGraphExec) -{{endif}} - -{{if 'cudaGraphInstantiateWithParams' in found_functions}} - -@cython.embedsignature(True) -def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraphInstantiateParams]): - """ Creates an executable graph from a graph. + Creates an executable graph from a graph Instantiates `graph` as an executable graph according to the `instantiateParams` structure. 
The graph is validated for any @@ -36671,18 +36376,22 @@ def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraph ---------- graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` Graph to instantiate - instantiateParams : :py:obj:`~.cudaGraphInstantiateParams` - Instantiation parameters + flags : :py:obj:`~.cudaGraphInstantiateParams` + Flags to control instantiation. See + :py:obj:`~.CUgraphInstantiate_flags`. Returns ------- cudaError_t :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` pGraphExec : :py:obj:`~.cudaGraphExec_t` Returns instantiated graph See Also -------- + :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphUpload`, :py:obj:`~.cudaGraphLaunch`, :py:obj:`~.cudaGraphExecDestroy` + :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphInstantiateWithFlags`, :py:obj:`~.cudaGraphExecDestroy` """ cdef cyruntime.cudaGraph_t cygraph @@ -36896,49 +36605,7 @@ def cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu @cython.embedsignature(True) def cudaGraphExecMemcpyNodeSetParams1D(hGraphExec, node, dst, src, size_t count, kind not None : cudaMemcpyKind): - """ Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional copy. - - Updates the work represented by `node` in `hGraphExec` as though `node` - had contained the given params at instantiation. `node` must remain in - the graph which was used to instantiate `hGraphExec`. Changed edges to - and from `node` are ignored. - - `src` and `dst` must be allocated from the same contexts as the - original source and destination memory. The instantiation-time memory - operands must be 1-dimensional. Zero-length operations are not - supported. - - The modifications only affect future launches of `hGraphExec`. Already - enqueued or running launches of `hGraphExec` are not affected by this - call. 
`node` is also not modified by this call. - - Returns :py:obj:`~.cudaErrorInvalidValue` if the memory operands' - mappings changed or the original memory operands are multidimensional. - - Parameters - ---------- - hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - The executable graph in which to set the specified node - node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Memcpy node from the graph which was used to instantiate graphExec - dst : Any - Destination memory address - src : Any - Source memory address - count : size_t - Size in bytes to copy - kind : :py:obj:`~.cudaMemcpyKind` - Type of transfer - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - - See Also - -------- - :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNode1D`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams1D`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` - """ + """""" cdef cyruntime.cudaGraphNode_t cynode if node is None: pnode = 0 @@ -36971,7 +36638,66 @@ def cudaGraphExecMemcpyNodeSetParams1D(hGraphExec, node, dst, src, size_t count, @cython.embedsignature(True) def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cudaMemsetParams]): - """ Sets the parameters for a memset node in the given graphExec. + """ Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device. 
+ + Updates the work represented by `node` in `hGraphExec` as though `node` + had contained the given params at instantiation. `node` must remain in + the graph which was used to instantiate `hGraphExec`. Changed edges to + and from `node` are ignored. + + `src` and `symbol` must be allocated from the same contexts as the + original source and destination memory. The instantiation-time memory + operands must be 1-dimensional. Zero-length operations are not + supported. + + The modifications only affect future launches of `hGraphExec`. Already + enqueued or running launches of `hGraphExec` are not affected by this + call. `node` is also not modified by this call. + + Returns :py:obj:`~.cudaErrorInvalidValue` if the memory operands' + mappings changed or the original memory operands are multidimensional. + + Sets the parameters for a memcpy node in the given graphExec to copy + from a symbol on the device + + Updates the work represented by `node` in `hGraphExec` as though `node` + had contained the given params at instantiation. `node` must remain in + the graph which was used to instantiate `hGraphExec`. Changed edges to + and from `node` are ignored. + + `symbol` and `dst` must be allocated from the same contexts as the + original source and destination memory. The instantiation-time memory + operands must be 1-dimensional. Zero-length operations are not + supported. + + The modifications only affect future launches of `hGraphExec`. Already + enqueued or running launches of `hGraphExec` are not affected by this + call. `node` is also not modified by this call. + + Returns :py:obj:`~.cudaErrorInvalidValue` if the memory operands' + mappings changed or the original memory operands are multidimensional. + + Sets the parameters for a memcpy node in the given graphExec to perform + a 1-dimensional copy + + Updates the work represented by `node` in `hGraphExec` as though `node` + had contained the given params at instantiation. 
`node` must remain in + the graph which was used to instantiate `hGraphExec`. Changed edges to + and from `node` are ignored. + + `src` and `dst` must be allocated from the same contexts as the + original source and destination memory. The instantiation-time memory + operands must be 1-dimensional. Zero-length operations are not + supported. + + The modifications only affect future launches of `hGraphExec`. Already + enqueued or running launches of `hGraphExec` are not affected by this + call. `node` is also not modified by this call. + + Returns :py:obj:`~.cudaErrorInvalidValue` if the memory operands' + mappings changed or the original memory operands are multidimensional. + + Sets the parameters for a memset node in the given graphExec. Updates the work represented by `node` in `hGraphExec` as though `node` had contained `pNodeParams` at instantiation. `node` must remain in the @@ -37002,17 +36728,26 @@ def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` The executable graph in which to set the specified node node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Memset node from the graph which was used to instantiate graphExec - pNodeParams : :py:obj:`~.cudaMemsetParams` - Updated Parameters to set + Memcpy node from the graph which was used to instantiate graphExec + symbol : :py:obj:`~.cudaMemsetParams` + Device symbol address Returns ------- cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, See Also -------- + :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNodeToSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsToSymbol`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, 
:py:obj:`~.cudaGraphExecMemcpyNodeSetParamsFromSymbol`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` + + :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNodeFromSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsFromSymbol`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParamsToSymbol`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` + + :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNode1D`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams1D`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` + 
:py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddMemsetNode`, :py:obj:`~.cudaGraphMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` """ cdef cyruntime.cudaGraphNode_t cynode @@ -37096,43 +36831,7 @@ def cudaGraphExecHostNodeSetParams(hGraphExec, node, pNodeParams : Optional[cuda @cython.embedsignature(True) def cudaGraphExecChildGraphNodeSetParams(hGraphExec, node, childGraph): - """ Updates node parameters in the child graph node in the given graphExec. - - Updates the work represented by `node` in `hGraphExec` as though the - nodes contained in `node's` graph had the parameters contained in - `childGraph's` nodes at instantiation. `node` must remain in the graph - which was used to instantiate `hGraphExec`. Changed edges to and from - `node` are ignored. - - The modifications only affect future launches of `hGraphExec`. Already - enqueued or running launches of `hGraphExec` are not affected by this - call. `node` is also not modified by this call. - - The topology of `childGraph`, as well as the node insertion order, must - match that of the graph contained in `node`. See - :py:obj:`~.cudaGraphExecUpdate()` for a list of restrictions on what - can be updated in an instantiated graph. The update is recursive, so - child graph nodes contained within the top level child graph will also - be updated. 
- - Parameters - ---------- - hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - The executable graph in which to set the specified node - node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Host node from the graph which was used to instantiate graphExec - childGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - The graph supplying the updated parameters - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, - - See Also - -------- - :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphChildGraphNodeGetGraph`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` - """ + """""" cdef cyruntime.cudaGraph_t cychildGraph if childGraph is None: pchildGraph = 0 @@ -37166,36 +36865,7 @@ def cudaGraphExecChildGraphNodeSetParams(hGraphExec, node, childGraph): @cython.embedsignature(True) def cudaGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event): - """ Sets the event for an event record node in the given graphExec. - - Sets the event of an event record node in an executable graph - `hGraphExec`. The node is identified by the corresponding node `hNode` - in the non-executable graph, from which the executable graph was - instantiated. - - The modifications only affect future launches of `hGraphExec`. Already - enqueued or running launches of `hGraphExec` are not affected by this - call. `hNode` is also not modified by this call. 
- - Parameters - ---------- - hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - The executable graph in which to set the specified node - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Event record node from the graph from which graphExec was - instantiated - event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Updated event to use - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, - - See Also - -------- - :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` - """ + """""" cdef cyruntime.cudaEvent_t cyevent if event is None: pevent = 0 @@ -37229,36 +36899,7 @@ def cudaGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event): @cython.embedsignature(True) def cudaGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event): - """ Sets the event for an event wait node in the given graphExec. - - Sets the event of an event wait node in an executable graph - `hGraphExec`. The node is identified by the corresponding node `hNode` - in the non-executable graph, from which the executable graph was - instantiated. - - The modifications only affect future launches of `hGraphExec`. Already - enqueued or running launches of `hGraphExec` are not affected by this - call. `hNode` is also not modified by this call. 
- - Parameters - ---------- - hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - The executable graph in which to set the specified node - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Event wait node from the graph from which graphExec was - instantiated - event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Updated event to use - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, - - See Also - -------- - :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` - """ + """""" cdef cyruntime.cudaEvent_t cyevent if event is None: pevent = 0 @@ -37292,40 +36933,7 @@ def cudaGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event): @cython.embedsignature(True) def cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodeParams : Optional[cudaExternalSemaphoreSignalNodeParams]): - """ Sets the parameters for an external semaphore signal node in the given graphExec. - - Sets the parameters of an external semaphore signal node in an - executable graph `hGraphExec`. The node is identified by the - corresponding node `hNode` in the non-executable graph, from which the - executable graph was instantiated. - - `hNode` must not have been removed from the original graph. 
- - The modifications only affect future launches of `hGraphExec`. Already - enqueued or running launches of `hGraphExec` are not affected by this - call. `hNode` is also not modified by this call. - - Changing `nodeParams->numExtSems` is not supported. - - Parameters - ---------- - hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - The executable graph in which to set the specified node - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - semaphore signal node from the graph from which graphExec was - instantiated - nodeParams : :py:obj:`~.cudaExternalSemaphoreSignalNodeParams` - Updated Parameters to set - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, - - See Also - -------- - :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` - """ + """""" cdef cyruntime.cudaGraphNode_t cyhNode if hNode is None: phNode = 0 @@ -37352,40 +36960,7 @@ def cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePa @cython.embedsignature(True) def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodeParams : Optional[cudaExternalSemaphoreWaitNodeParams]): - """ Sets the parameters for an external semaphore wait node in the given graphExec. - - Sets the parameters of an external semaphore wait node in an executable - graph `hGraphExec`. 
The node is identified by the corresponding node - `hNode` in the non-executable graph, from which the executable graph - was instantiated. - - `hNode` must not have been removed from the original graph. - - The modifications only affect future launches of `hGraphExec`. Already - enqueued or running launches of `hGraphExec` are not affected by this - call. `hNode` is also not modified by this call. - - Changing `nodeParams->numExtSems` is not supported. - - Parameters - ---------- - hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - The executable graph in which to set the specified node - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - semaphore wait node from the graph from which graphExec was - instantiated - nodeParams : :py:obj:`~.cudaExternalSemaphoreWaitNodeParams` - Updated Parameters to set - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, - - See Also - -------- - :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` - """ + """""" cdef cyruntime.cudaGraphNode_t cyhNode if hNode is None: phNode = 0 @@ -37412,44 +36987,7 @@ def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodePara @cython.embedsignature(True) def cudaGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled): - """ Enables or disables the specified node in the 
given graphExec. - - Sets `hNode` to be either enabled or disabled. Disabled nodes are - functionally equivalent to empty nodes until they are reenabled. - Existing node parameters are not affected by disabling/enabling the - node. - - The node is identified by the corresponding node `hNode` in the non- - executable graph, from which the executable graph was instantiated. - - `hNode` must not have been removed from the original graph. - - The modifications only affect future launches of `hGraphExec`. Already - enqueued or running launches of `hGraphExec` are not affected by this - call. `hNode` is also not modified by this call. - - Parameters - ---------- - hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - The executable graph in which to set the specified node - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node from the graph from which graphExec was instantiated - isEnabled : unsigned int - Node is enabled if != 0, otherwise the node is disabled - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, - - See Also - -------- - :py:obj:`~.cudaGraphNodeGetEnabled`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` :py:obj:`~.cudaGraphLaunch` - - Notes - ----- - Currently only kernel, memset and memcpy nodes are supported. - """ + """""" cdef cyruntime.cudaGraphNode_t cyhNode if hNode is None: phNode = 0 @@ -37475,37 +37013,7 @@ def cudaGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled): @cython.embedsignature(True) def cudaGraphNodeGetEnabled(hGraphExec, hNode): - """ Query whether a node in the given graphExec is enabled. - - Sets isEnabled to 1 if `hNode` is enabled, or 0 if `hNode` is disabled. - - The node is identified by the corresponding node `hNode` in the non- - executable graph, from which the executable graph was instantiated. - - `hNode` must not have been removed from the original graph. 
- - Parameters - ---------- - hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - The executable graph in which to set the specified node - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node from the graph from which graphExec was instantiated - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, - isEnabled : unsigned int - Location to return the enabled status of the node - - See Also - -------- - :py:obj:`~.cudaGraphNodeSetEnabled`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` :py:obj:`~.cudaGraphLaunch` - - Notes - ----- - Currently only kernel, memset and memcpy nodes are supported. - """ + """""" cdef cyruntime.cudaGraphNode_t cyhNode if hNode is None: phNode = 0 @@ -37534,7 +37042,106 @@ def cudaGraphNodeGetEnabled(hGraphExec, hNode): @cython.embedsignature(True) def cudaGraphExecUpdate(hGraphExec, hGraph): - """ Check whether an executable graph can be updated with a graph and perform the update if possible. + """ Updates node parameters in the child graph node in the given graphExec. + + Updates the work represented by `node` in `hGraphExec` as though the + nodes contained in `node's` graph had the parameters contained in + `childGraph's` nodes at instantiation. `node` must remain in the graph + which was used to instantiate `hGraphExec`. Changed edges to and from + `node` are ignored. + + The modifications only affect future launches of `hGraphExec`. Already + enqueued or running launches of `hGraphExec` are not affected by this + call. `node` is also not modified by this call. + + The topology of `childGraph`, as well as the node insertion order, must + match that of the graph contained in `node`. See + :py:obj:`~.cudaGraphExecUpdate()` for a list of restrictions on what + can be updated in an instantiated graph. The update is recursive, so + child graph nodes contained within the top level child graph will also + be updated. 
+ + Sets the event for an event record node in the given graphExec + + Sets the event of an event record node in an executable graph + `hGraphExec`. The node is identified by the corresponding node `hNode` + in the non-executable graph, from which the executable graph was + instantiated. + + The modifications only affect future launches of `hGraphExec`. Already + enqueued or running launches of `hGraphExec` are not affected by this + call. `hNode` is also not modified by this call. + + Sets the event for an event wait node in the given graphExec + + Sets the event of an event wait node in an executable graph + `hGraphExec`. The node is identified by the corresponding node `hNode` + in the non-executable graph, from which the executable graph was + instantiated. + + The modifications only affect future launches of `hGraphExec`. Already + enqueued or running launches of `hGraphExec` are not affected by this + call. `hNode` is also not modified by this call. + + Sets the parameters for an external semaphore signal node in the given + graphExec + + Sets the parameters of an external semaphore signal node in an + executable graph `hGraphExec`. The node is identified by the + corresponding node `hNode` in the non-executable graph, from which the + executable graph was instantiated. + + `hNode` must not have been removed from the original graph. + + The modifications only affect future launches of `hGraphExec`. Already + enqueued or running launches of `hGraphExec` are not affected by this + call. `hNode` is also not modified by this call. + + Changing `nodeParams->numExtSems` is not supported. + + Sets the parameters for an external semaphore wait node in the given + graphExec + + Sets the parameters of an external semaphore wait node in an executable + graph `hGraphExec`. The node is identified by the corresponding node + `hNode` in the non-executable graph, from which the executable graph + was instantiated. 
+ + `hNode` must not have been removed from the original graph. + + The modifications only affect future launches of `hGraphExec`. Already + enqueued or running launches of `hGraphExec` are not affected by this + call. `hNode` is also not modified by this call. + + Changing `nodeParams->numExtSems` is not supported. + + Enables or disables the specified node in the given graphExec + + Sets `hNode` to be either enabled or disabled. Disabled nodes are + functionally equivalent to empty nodes until they are reenabled. + Existing node parameters are not affected by disabling/enabling the + node. + + The node is identified by the corresponding node `hNode` in the non- + executable graph, from which the executable graph was instantiated. + + `hNode` must not have been removed from the original graph. + + The modifications only affect future launches of `hGraphExec`. Already + enqueued or running launches of `hGraphExec` are not affected by this + call. `hNode` is also not modified by this call. + + Query whether a node in the given graphExec is enabled + + Sets isEnabled to 1 if `hNode` is enabled, or 0 if `hNode` is disabled. + + The node is identified by the corresponding node `hNode` in the non- + executable graph, from which the executable graph was instantiated. + + `hNode` must not have been removed from the original graph. 
+ + Check whether an executable graph can be updated with a graph and + perform the update if possible Updates the node parameters in the instantiated graph specified by `hGraphExec` with the node parameters in a topologically identical @@ -37667,20 +37274,47 @@ def cudaGraphExecUpdate(hGraphExec, hGraph): Parameters ---------- hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - The instantiated graph to be updated - hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - The graph containing the updated parameters + The executable graph in which to set the specified node + node : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` + Host node from the graph which was used to instantiate graphExec Returns ------- cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorGraphExecUpdateFailure`, - resultInfo : :py:obj:`~.cudaGraphExecUpdateResultInfo` - the error info structure + childGraph : :py:obj:`~.cudaGraphExecUpdateResultInfo` + The graph supplying the updated parameters See Also -------- + :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphChildGraphNodeGetGraph`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, 
:py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` + + :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` + + :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` + + :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, 
:py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` + + :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` + + :py:obj:`~.cudaGraphNodeGetEnabled`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` :py:obj:`~.cudaGraphLaunch` + + :py:obj:`~.cudaGraphNodeSetEnabled`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` :py:obj:`~.cudaGraphLaunch` + :py:obj:`~.cudaGraphInstantiate` + + Notes + ----- + Currently only kernel, memset and memcpy nodes are supported. + + Currently only kernel, memset and memcpy nodes are supported. """ cdef cyruntime.cudaGraph_t cyhGraph if hGraph is None: @@ -37710,30 +37344,7 @@ def cudaGraphExecUpdate(hGraphExec, hGraph): @cython.embedsignature(True) def cudaGraphUpload(graphExec, stream): - """ Uploads an executable graph in a stream. - - Uploads `hGraphExec` to the device in `hStream` without executing it. - Uploads of the same `hGraphExec` will be serialized. Each upload is - ordered behind both any previous work in `hStream` and any previous - launches of `hGraphExec`. Uses memory cached by `stream` to back the - allocations owned by `graphExec`. 
- - Parameters - ---------- - hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - Executable graph to upload - hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` - Stream in which to upload the graph - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, - - See Also - -------- - :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphLaunch`, :py:obj:`~.cudaGraphExecDestroy` - """ + """""" cdef cyruntime.cudaStream_t cystream if stream is None: pstream = 0 @@ -37759,7 +37370,15 @@ def cudaGraphUpload(graphExec, stream): @cython.embedsignature(True) def cudaGraphLaunch(graphExec, stream): - """ Launches an executable graph in a stream. + """ Uploads an executable graph in a stream. + + Uploads `hGraphExec` to the device in `hStream` without executing it. + Uploads of the same `hGraphExec` will be serialized. Each upload is + ordered behind both any previous work in `hStream` and any previous + launches of `hGraphExec`. Uses memory cached by `stream` to back the + allocations owned by `graphExec`. + + Launches an executable graph in a stream Executes `graphExec` in `stream`. Only one instance of `graphExec` may be executing at a time. 
Each launch is ordered behind both any previous @@ -37774,18 +37393,21 @@ def cudaGraphLaunch(graphExec, stream): Parameters ---------- - graphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` - Executable graph to launch - stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` - Stream in which to launch the graph + hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t` + Executable graph to upload + hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` + Stream in which to upload the graph Returns ------- cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` See Also -------- + :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphLaunch`, :py:obj:`~.cudaGraphExecDestroy` + :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphUpload`, :py:obj:`~.cudaGraphExecDestroy` """ cdef cyruntime.cudaStream_t cystream @@ -39495,7 +39117,8 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD This is a pointer, specifying the number of groups that would be or should be created as described below. input : :py:obj:`~.cudaDevResource` - Input SM resource to be split. Must be a valid `None` resource. + Input SM resource to be split. Must be a valid `cudaDevSmResource` + resource. flags : unsigned int Flags specifying how these partitions are used or which constraints to abide by when splitting the input. Zero is valid for default diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst index 6262059dfa..dd305dfce0 100644 --- a/cuda_bindings/docs/source/module/driver.rst +++ b/cuda_bindings/docs/source/module/driver.rst @@ -5,24 +5,6 @@ driver ------ -Profiler Control ----------------- - -This section describes the profiler control functions of the low-level CUDA driver application programming interface. - -.. autofunction:: cuda.bindings.driver.cuProfilerStart -.. 
autofunction:: cuda.bindings.driver.cuProfilerStop - -VDPAU Interoperability ----------------------- - -This section describes the VDPAU interoperability functions of the low-level CUDA driver application programming interface. - -.. autofunction:: cuda.bindings.driver.cuVDPAUGetDevice -.. autofunction:: cuda.bindings.driver.cuVDPAUCtxCreate -.. autofunction:: cuda.bindings.driver.cuGraphicsVDPAURegisterVideoSurface -.. autofunction:: cuda.bindings.driver.cuGraphicsVDPAURegisterOutputSurface - Data types used by CUDA driver ------------------------------ @@ -498,7 +480,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUstreamBatchMemOpType.CU_STREAM_MEM_OP_ATOMIC_REDUCTION - Perform a atomic reduction. See :py:obj:`~.CUstreamBatchMemOpParams`::atomicReduction + Perform a atomic reduction. See :py:obj:`~.CUstreamBatchMemOpParams.atomicReduction` .. autoattribute:: cuda.bindings.driver.CUstreamBatchMemOpType.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES @@ -3573,7 +3555,7 @@ Data types used by CUDA driver Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. + :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable` can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. 
On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. @@ -3597,7 +3579,7 @@ Data types used by CUDA driver This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. - Valid values for :py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are 0 (disabled) and 1 (enabled). + Valid values for :py:obj:`~.CUlaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled). .. 
autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE @@ -7770,6 +7752,32 @@ Checkpoint and restore capabilities are currently restricted to Linux. .. autofunction:: cuda.bindings.driver.cuCheckpointProcessRestore .. autofunction:: cuda.bindings.driver.cuCheckpointProcessUnlock +Profiler Control +---------------- + +This section describes the profiler control functions of the low-level CUDA driver application programming interface. + +.. autofunction:: cuda.bindings.driver.cuProfilerStart +.. autofunction:: cuda.bindings.driver.cuProfilerStop + +EGL Interoperability +-------------------- + +This section describes the EGL interoperability functions of the low-level CUDA driver application programming interface. + +.. autofunction:: cuda.bindings.driver.cuGraphicsEGLRegisterImage +.. autofunction:: cuda.bindings.driver.cuEGLStreamConsumerConnect +.. autofunction:: cuda.bindings.driver.cuEGLStreamConsumerConnectWithFlags +.. autofunction:: cuda.bindings.driver.cuEGLStreamConsumerDisconnect +.. autofunction:: cuda.bindings.driver.cuEGLStreamConsumerAcquireFrame +.. autofunction:: cuda.bindings.driver.cuEGLStreamConsumerReleaseFrame +.. autofunction:: cuda.bindings.driver.cuEGLStreamProducerConnect +.. autofunction:: cuda.bindings.driver.cuEGLStreamProducerDisconnect +.. autofunction:: cuda.bindings.driver.cuEGLStreamProducerPresentFrame +.. autofunction:: cuda.bindings.driver.cuEGLStreamProducerReturnFrame +.. autofunction:: cuda.bindings.driver.cuGraphicsResourceGetMappedEglFrame +.. autofunction:: cuda.bindings.driver.cuEventCreateFromEGLSync + OpenGL Interoperability ----------------------- @@ -7798,20 +7806,12 @@ This section describes the OpenGL interoperability functions of the low-level CU .. autofunction:: cuda.bindings.driver.cuGraphicsGLRegisterImage .. 
autofunction:: cuda.bindings.driver.cuGLGetDevices -EGL Interoperability --------------------- +VDPAU Interoperability +---------------------- -This section describes the EGL interoperability functions of the low-level CUDA driver application programming interface. +This section describes the VDPAU interoperability functions of the low-level CUDA driver application programming interface. -.. autofunction:: cuda.bindings.driver.cuGraphicsEGLRegisterImage -.. autofunction:: cuda.bindings.driver.cuEGLStreamConsumerConnect -.. autofunction:: cuda.bindings.driver.cuEGLStreamConsumerConnectWithFlags -.. autofunction:: cuda.bindings.driver.cuEGLStreamConsumerDisconnect -.. autofunction:: cuda.bindings.driver.cuEGLStreamConsumerAcquireFrame -.. autofunction:: cuda.bindings.driver.cuEGLStreamConsumerReleaseFrame -.. autofunction:: cuda.bindings.driver.cuEGLStreamProducerConnect -.. autofunction:: cuda.bindings.driver.cuEGLStreamProducerDisconnect -.. autofunction:: cuda.bindings.driver.cuEGLStreamProducerPresentFrame -.. autofunction:: cuda.bindings.driver.cuEGLStreamProducerReturnFrame -.. autofunction:: cuda.bindings.driver.cuGraphicsResourceGetMappedEglFrame -.. autofunction:: cuda.bindings.driver.cuEventCreateFromEGLSync +.. autofunction:: cuda.bindings.driver.cuVDPAUGetDevice +.. autofunction:: cuda.bindings.driver.cuVDPAUCtxCreate +.. autofunction:: cuda.bindings.driver.cuGraphicsVDPAURegisterVideoSurface +.. autofunction:: cuda.bindings.driver.cuGraphicsVDPAURegisterOutputSurface diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst index 959c37a7f4..d747ae0deb 100644 --- a/cuda_bindings/docs/source/module/nvrtc.rst +++ b/cuda_bindings/docs/source/module/nvrtc.rst @@ -1,4 +1,4 @@ -.. SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. 
SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE ----- @@ -654,7 +654,7 @@ Programmer assertion that all kernel pointer parameters are restrict pointers. - ``--device-as-default-execution-space``\ (``-default-device``\ ) -Treat entities with no execution space annotation as ``device``\ entities. +Treat entities with no execution space annotation as ``__device__``\ entities. @@ -664,7 +664,7 @@ Treat entities with no execution space annotation as ``device``\ entities. - ``--device-int128``\ (``-device-int128``\ ) -Allow the ``__int128``\ type in device code. Also causes the macro ``CUDACC_RTC_INT128``\ to be defined. +Allow the ``__int128``\ type in device code. Also causes the macro ``__CUDACC_RTC_INT128__``\ to be defined. diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst index d7924c232a..0da84a3922 100644 --- a/cuda_bindings/docs/source/module/runtime.rst +++ b/cuda_bindings/docs/source/module/runtime.rst @@ -5,6332 +5,6296 @@ runtime ------- -Profiler Control ----------------- - -This section describes the profiler control functions of the CUDA runtime application programming interface. - -.. autofunction:: cuda.bindings.runtime.cudaProfilerStart -.. autofunction:: cuda.bindings.runtime.cudaProfilerStop - -Device Management ------------------ - -impl_private - - - - - - - -This section describes the device management functions of the CUDA runtime application programming interface. - -.. autofunction:: cuda.bindings.runtime.cudaDeviceReset -.. autofunction:: cuda.bindings.runtime.cudaDeviceSynchronize -.. autofunction:: cuda.bindings.runtime.cudaDeviceSetLimit -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetLimit -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetTexture1DLinearMaxWidth -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetCacheConfig -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetStreamPriorityRange -.. 
autofunction:: cuda.bindings.runtime.cudaDeviceSetCacheConfig -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetByPCIBusId -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetPCIBusId -.. autofunction:: cuda.bindings.runtime.cudaIpcGetEventHandle -.. autofunction:: cuda.bindings.runtime.cudaIpcOpenEventHandle -.. autofunction:: cuda.bindings.runtime.cudaIpcGetMemHandle -.. autofunction:: cuda.bindings.runtime.cudaIpcOpenMemHandle -.. autofunction:: cuda.bindings.runtime.cudaIpcCloseMemHandle -.. autofunction:: cuda.bindings.runtime.cudaDeviceFlushGPUDirectRDMAWrites -.. autofunction:: cuda.bindings.runtime.cudaDeviceRegisterAsyncNotification -.. autofunction:: cuda.bindings.runtime.cudaDeviceUnregisterAsyncNotification -.. autofunction:: cuda.bindings.runtime.cudaGetDeviceCount -.. autofunction:: cuda.bindings.runtime.cudaGetDeviceProperties -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetAttribute -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetHostAtomicCapabilities -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetDefaultMemPool -.. autofunction:: cuda.bindings.runtime.cudaDeviceSetMemPool -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetMemPool -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetNvSciSyncAttributes -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetP2PAttribute -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetP2PAtomicCapabilities -.. autofunction:: cuda.bindings.runtime.cudaChooseDevice -.. autofunction:: cuda.bindings.runtime.cudaInitDevice -.. autofunction:: cuda.bindings.runtime.cudaSetDevice -.. autofunction:: cuda.bindings.runtime.cudaGetDevice -.. autofunction:: cuda.bindings.runtime.cudaSetDeviceFlags -.. autofunction:: cuda.bindings.runtime.cudaGetDeviceFlags - -Error Handling --------------- - -This section describes the error handling functions of the CUDA runtime application programming interface. - -.. autofunction:: cuda.bindings.runtime.cudaGetLastError -.. 
autofunction:: cuda.bindings.runtime.cudaPeekAtLastError -.. autofunction:: cuda.bindings.runtime.cudaGetErrorName -.. autofunction:: cuda.bindings.runtime.cudaGetErrorString - -Stream Management ------------------ - -This section describes the stream management functions of the CUDA runtime application programming interface. - -.. autoclass:: cuda.bindings.runtime.cudaStreamCallback_t -.. autofunction:: cuda.bindings.runtime.cudaStreamCreate -.. autofunction:: cuda.bindings.runtime.cudaStreamCreateWithFlags -.. autofunction:: cuda.bindings.runtime.cudaStreamCreateWithPriority -.. autofunction:: cuda.bindings.runtime.cudaStreamGetPriority -.. autofunction:: cuda.bindings.runtime.cudaStreamGetFlags -.. autofunction:: cuda.bindings.runtime.cudaStreamGetId -.. autofunction:: cuda.bindings.runtime.cudaStreamGetDevice -.. autofunction:: cuda.bindings.runtime.cudaCtxResetPersistingL2Cache -.. autofunction:: cuda.bindings.runtime.cudaStreamCopyAttributes -.. autofunction:: cuda.bindings.runtime.cudaStreamGetAttribute -.. autofunction:: cuda.bindings.runtime.cudaStreamSetAttribute -.. autofunction:: cuda.bindings.runtime.cudaStreamDestroy -.. autofunction:: cuda.bindings.runtime.cudaStreamWaitEvent -.. autofunction:: cuda.bindings.runtime.cudaStreamAddCallback -.. autofunction:: cuda.bindings.runtime.cudaStreamSynchronize -.. autofunction:: cuda.bindings.runtime.cudaStreamQuery -.. autofunction:: cuda.bindings.runtime.cudaStreamAttachMemAsync -.. autofunction:: cuda.bindings.runtime.cudaStreamBeginCapture -.. autofunction:: cuda.bindings.runtime.cudaStreamBeginCaptureToGraph -.. autofunction:: cuda.bindings.runtime.cudaThreadExchangeStreamCaptureMode -.. autofunction:: cuda.bindings.runtime.cudaStreamEndCapture -.. autofunction:: cuda.bindings.runtime.cudaStreamIsCapturing -.. autofunction:: cuda.bindings.runtime.cudaStreamGetCaptureInfo -.. 
autofunction:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependencies - -Event Management ----------------- - -This section describes the event management functions of the CUDA runtime application programming interface. - -.. autofunction:: cuda.bindings.runtime.cudaEventCreate -.. autofunction:: cuda.bindings.runtime.cudaEventCreateWithFlags -.. autofunction:: cuda.bindings.runtime.cudaEventRecord -.. autofunction:: cuda.bindings.runtime.cudaEventRecordWithFlags -.. autofunction:: cuda.bindings.runtime.cudaEventQuery -.. autofunction:: cuda.bindings.runtime.cudaEventSynchronize -.. autofunction:: cuda.bindings.runtime.cudaEventDestroy -.. autofunction:: cuda.bindings.runtime.cudaEventElapsedTime - -External Resource Interoperability ----------------------------------- - -This section describes the external resource interoperability functions of the CUDA runtime application programming interface. - -.. autofunction:: cuda.bindings.runtime.cudaImportExternalMemory -.. autofunction:: cuda.bindings.runtime.cudaExternalMemoryGetMappedBuffer -.. autofunction:: cuda.bindings.runtime.cudaExternalMemoryGetMappedMipmappedArray -.. autofunction:: cuda.bindings.runtime.cudaDestroyExternalMemory -.. autofunction:: cuda.bindings.runtime.cudaImportExternalSemaphore -.. autofunction:: cuda.bindings.runtime.cudaSignalExternalSemaphoresAsync -.. autofunction:: cuda.bindings.runtime.cudaWaitExternalSemaphoresAsync -.. autofunction:: cuda.bindings.runtime.cudaDestroyExternalSemaphore - -Execution Control ------------------ - -This section describes the execution control functions of the CUDA runtime application programming interface. +Data types used by CUDA Runtime +------------------------------- -Some functions have overloaded C++ API template versions documented separately in the C++ API Routines module. +.. autoclass:: cuda.bindings.runtime.cudaChannelFormatDesc +.. autoclass:: cuda.bindings.runtime.cudaArraySparseProperties +.. 
autoclass:: cuda.bindings.runtime.cudaArrayMemoryRequirements +.. autoclass:: cuda.bindings.runtime.cudaPitchedPtr +.. autoclass:: cuda.bindings.runtime.cudaExtent +.. autoclass:: cuda.bindings.runtime.cudaPos +.. autoclass:: cuda.bindings.runtime.cudaMemcpy3DParms +.. autoclass:: cuda.bindings.runtime.cudaMemcpyNodeParams +.. autoclass:: cuda.bindings.runtime.cudaMemcpy3DPeerParms +.. autoclass:: cuda.bindings.runtime.cudaMemsetParams +.. autoclass:: cuda.bindings.runtime.cudaMemsetParamsV2 +.. autoclass:: cuda.bindings.runtime.cudaAccessPolicyWindow +.. autoclass:: cuda.bindings.runtime.cudaHostNodeParams +.. autoclass:: cuda.bindings.runtime.cudaHostNodeParamsV2 +.. autoclass:: cuda.bindings.runtime.cudaResourceDesc +.. autoclass:: cuda.bindings.runtime.cudaResourceViewDesc +.. autoclass:: cuda.bindings.runtime.cudaPointerAttributes +.. autoclass:: cuda.bindings.runtime.cudaFuncAttributes +.. autoclass:: cuda.bindings.runtime.cudaMemLocation +.. autoclass:: cuda.bindings.runtime.cudaMemAccessDesc +.. autoclass:: cuda.bindings.runtime.cudaMemPoolProps +.. autoclass:: cuda.bindings.runtime.cudaMemPoolPtrExportData +.. autoclass:: cuda.bindings.runtime.cudaMemAllocNodeParams +.. autoclass:: cuda.bindings.runtime.cudaMemAllocNodeParamsV2 +.. autoclass:: cuda.bindings.runtime.cudaMemFreeNodeParams +.. autoclass:: cuda.bindings.runtime.cudaMemcpyAttributes +.. autoclass:: cuda.bindings.runtime.cudaOffset3D +.. autoclass:: cuda.bindings.runtime.cudaMemcpy3DOperand +.. autoclass:: cuda.bindings.runtime.cudaMemcpy3DBatchOp +.. autoclass:: cuda.bindings.runtime.CUuuid_st +.. autoclass:: cuda.bindings.runtime.cudaDeviceProp +.. autoclass:: cuda.bindings.runtime.cudaIpcEventHandle_st +.. autoclass:: cuda.bindings.runtime.cudaIpcMemHandle_st +.. autoclass:: cuda.bindings.runtime.cudaMemFabricHandle_st +.. autoclass:: cuda.bindings.runtime.cudaExternalMemoryHandleDesc +.. autoclass:: cuda.bindings.runtime.cudaExternalMemoryBufferDesc +.. 
autoclass:: cuda.bindings.runtime.cudaExternalMemoryMipmappedArrayDesc +.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreHandleDesc +.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreSignalParams +.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreWaitParams +.. autoclass:: cuda.bindings.runtime.cudaDevSmResource +.. autoclass:: cuda.bindings.runtime.cudaDevWorkqueueConfigResource +.. autoclass:: cuda.bindings.runtime.cudaDevWorkqueueResource +.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceGroupParams_st +.. autoclass:: cuda.bindings.runtime.cudaDevResource_st +.. autoclass:: cuda.bindings.runtime.cudalibraryHostUniversalFunctionAndDataTable +.. autoclass:: cuda.bindings.runtime.cudaKernelNodeParams +.. autoclass:: cuda.bindings.runtime.cudaKernelNodeParamsV2 +.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreSignalNodeParams +.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreSignalNodeParamsV2 +.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreWaitNodeParams +.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreWaitNodeParamsV2 +.. autoclass:: cuda.bindings.runtime.cudaConditionalNodeParams +.. autoclass:: cuda.bindings.runtime.cudaChildGraphNodeParams +.. autoclass:: cuda.bindings.runtime.cudaEventRecordNodeParams +.. autoclass:: cuda.bindings.runtime.cudaEventWaitNodeParams +.. autoclass:: cuda.bindings.runtime.cudaGraphNodeParams +.. autoclass:: cuda.bindings.runtime.cudaGraphEdgeData_st +.. autoclass:: cuda.bindings.runtime.cudaGraphInstantiateParams_st +.. autoclass:: cuda.bindings.runtime.cudaGraphExecUpdateResultInfo_st +.. autoclass:: cuda.bindings.runtime.cudaGraphKernelNodeUpdate +.. autoclass:: cuda.bindings.runtime.cudaLaunchMemSyncDomainMap_st +.. autoclass:: cuda.bindings.runtime.cudaLaunchAttributeValue +.. autoclass:: cuda.bindings.runtime.cudaLaunchAttribute_st +.. autoclass:: cuda.bindings.runtime.cudaAsyncNotificationInfo +.. autoclass:: cuda.bindings.runtime.cudaTextureDesc +.. 
autoclass:: cuda.bindings.runtime.cudaEglPlaneDesc_st +.. autoclass:: cuda.bindings.runtime.cudaEglFrame_st +.. autoclass:: cuda.bindings.runtime.cudaError_t -.. autofunction:: cuda.bindings.runtime.cudaFuncSetCacheConfig -.. autofunction:: cuda.bindings.runtime.cudaFuncGetAttributes -.. autofunction:: cuda.bindings.runtime.cudaFuncSetAttribute -.. autofunction:: cuda.bindings.runtime.cudaFuncGetParamCount -.. autofunction:: cuda.bindings.runtime.cudaLaunchHostFunc -.. autofunction:: cuda.bindings.runtime.cudaLaunchHostFunc_v2 + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaSuccess -Occupancy ---------- -This section describes the occupancy calculation functions of the CUDA runtime application programming interface. + The API call returned with no errors. In the case of query calls, this also means that the operation being queried is complete (see :py:obj:`~.cudaEventQuery()` and :py:obj:`~.cudaStreamQuery()`). + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidValue -Besides the occupancy calculator functions (cudaOccupancyMaxActiveBlocksPerMultiprocessor and cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags), there are also C++ only occupancy-based launch configuration functions documented in C++ API Routines module. + This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values. -See cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API) cudaOccupancyAvailableDynamicSMemPerBlock (C++ API), + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMemoryAllocation -.. autofunction:: cuda.bindings.runtime.cudaOccupancyMaxActiveBlocksPerMultiprocessor -.. autofunction:: cuda.bindings.runtime.cudaOccupancyAvailableDynamicSMemPerBlock -.. 
autofunction:: cuda.bindings.runtime.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags -Memory Management ------------------ + The API call failed because it was unable to allocate enough memory or other resources to perform the requested operation. -This section describes the memory management functions of the CUDA runtime application programming interface. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInitializationError -Some functions have overloaded C++ API template versions documented separately in the C++ API Routines module. + The API call failed because the CUDA driver and runtime could not be initialized. -.. autofunction:: cuda.bindings.runtime.make_cudaPitchedPtr -.. autofunction:: cuda.bindings.runtime.make_cudaPos -.. autofunction:: cuda.bindings.runtime.make_cudaExtent -.. autofunction:: cuda.bindings.runtime.cudaMallocManaged -.. autofunction:: cuda.bindings.runtime.cudaMalloc -.. autofunction:: cuda.bindings.runtime.cudaMallocHost -.. autofunction:: cuda.bindings.runtime.cudaMallocPitch -.. autofunction:: cuda.bindings.runtime.cudaMallocArray -.. autofunction:: cuda.bindings.runtime.cudaFree -.. autofunction:: cuda.bindings.runtime.cudaFreeHost -.. autofunction:: cuda.bindings.runtime.cudaFreeArray -.. autofunction:: cuda.bindings.runtime.cudaFreeMipmappedArray -.. autofunction:: cuda.bindings.runtime.cudaHostAlloc -.. autofunction:: cuda.bindings.runtime.cudaHostRegister -.. autofunction:: cuda.bindings.runtime.cudaHostUnregister -.. autofunction:: cuda.bindings.runtime.cudaHostGetDevicePointer -.. autofunction:: cuda.bindings.runtime.cudaHostGetFlags -.. autofunction:: cuda.bindings.runtime.cudaMalloc3D -.. autofunction:: cuda.bindings.runtime.cudaMalloc3DArray -.. autofunction:: cuda.bindings.runtime.cudaMallocMipmappedArray -.. autofunction:: cuda.bindings.runtime.cudaGetMipmappedArrayLevel -.. autofunction:: cuda.bindings.runtime.cudaMemcpy3D -.. autofunction:: cuda.bindings.runtime.cudaMemcpy3DPeer -.. 
autofunction:: cuda.bindings.runtime.cudaMemcpy3DAsync -.. autofunction:: cuda.bindings.runtime.cudaMemcpy3DPeerAsync -.. autofunction:: cuda.bindings.runtime.cudaMemGetInfo -.. autofunction:: cuda.bindings.runtime.cudaArrayGetInfo -.. autofunction:: cuda.bindings.runtime.cudaArrayGetPlane -.. autofunction:: cuda.bindings.runtime.cudaArrayGetMemoryRequirements -.. autofunction:: cuda.bindings.runtime.cudaMipmappedArrayGetMemoryRequirements -.. autofunction:: cuda.bindings.runtime.cudaArrayGetSparseProperties -.. autofunction:: cuda.bindings.runtime.cudaMipmappedArrayGetSparseProperties -.. autofunction:: cuda.bindings.runtime.cudaMemcpy -.. autofunction:: cuda.bindings.runtime.cudaMemcpyPeer -.. autofunction:: cuda.bindings.runtime.cudaMemcpy2D -.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DToArray -.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DFromArray -.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DArrayToArray -.. autofunction:: cuda.bindings.runtime.cudaMemcpyAsync -.. autofunction:: cuda.bindings.runtime.cudaMemcpyPeerAsync -.. autofunction:: cuda.bindings.runtime.cudaMemcpyBatchAsync -.. autofunction:: cuda.bindings.runtime.cudaMemcpy3DBatchAsync -.. autofunction:: cuda.bindings.runtime.cudaMemcpyWithAttributesAsync -.. autofunction:: cuda.bindings.runtime.cudaMemcpy3DWithAttributesAsync -.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DAsync -.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DToArrayAsync -.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DFromArrayAsync -.. autofunction:: cuda.bindings.runtime.cudaMemset -.. autofunction:: cuda.bindings.runtime.cudaMemset2D -.. autofunction:: cuda.bindings.runtime.cudaMemset3D -.. autofunction:: cuda.bindings.runtime.cudaMemsetAsync -.. autofunction:: cuda.bindings.runtime.cudaMemset2DAsync -.. autofunction:: cuda.bindings.runtime.cudaMemset3DAsync -.. autofunction:: cuda.bindings.runtime.cudaMemPrefetchAsync -.. autofunction:: cuda.bindings.runtime.cudaMemPrefetchBatchAsync -.. 
autofunction:: cuda.bindings.runtime.cudaMemDiscardBatchAsync -.. autofunction:: cuda.bindings.runtime.cudaMemDiscardAndPrefetchBatchAsync -.. autofunction:: cuda.bindings.runtime.cudaMemAdvise -.. autofunction:: cuda.bindings.runtime.cudaMemRangeGetAttribute -.. autofunction:: cuda.bindings.runtime.cudaMemRangeGetAttributes -Stream Ordered Memory Allocator -------------------------------- + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCudartUnloading -**overview** + This indicates that a CUDA Runtime API call cannot be executed because it is being called during process shut down, at a point in time after CUDA driver has been unloaded. -The asynchronous allocator allows the user to allocate and free in stream order. All asynchronous accesses of the allocation must happen between the stream executions of the allocation and the free. If the memory is accessed outside of the promised stream order, a use before allocation / use after free error will cause undefined behavior. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorProfilerDisabled -The allocator is free to reallocate the memory as long as it can guarantee that compliant memory accesses will not overlap temporally. The allocator may refer to internal stream ordering as well as inter-stream dependencies (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. The allocator may also insert inter-stream dependencies to establish the temporal guarantee. + This indicates profiler is not initialized for this run. This can happen when the application is running with external profiling tools like visual profiler. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorProfilerNotInitialized -**Supported Platforms** + [Deprecated] + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorProfilerAlreadyStarted -Whether or not a device supports the integrated stream ordered memory allocator may be queried by calling cudaDeviceGetAttribute() with the device attribute cudaDevAttrMemoryPoolsSupported. -.. autofunction:: cuda.bindings.runtime.cudaMallocAsync -.. autofunction:: cuda.bindings.runtime.cudaFreeAsync -.. autofunction:: cuda.bindings.runtime.cudaMemPoolTrimTo -.. autofunction:: cuda.bindings.runtime.cudaMemPoolSetAttribute -.. autofunction:: cuda.bindings.runtime.cudaMemPoolGetAttribute -.. autofunction:: cuda.bindings.runtime.cudaMemPoolSetAccess -.. autofunction:: cuda.bindings.runtime.cudaMemPoolGetAccess -.. autofunction:: cuda.bindings.runtime.cudaMemPoolCreate -.. autofunction:: cuda.bindings.runtime.cudaMemPoolDestroy -.. autofunction:: cuda.bindings.runtime.cudaMemGetDefaultMemPool -.. autofunction:: cuda.bindings.runtime.cudaMemGetMemPool -.. autofunction:: cuda.bindings.runtime.cudaMemSetMemPool -.. autofunction:: cuda.bindings.runtime.cudaMallocFromPoolAsync -.. autofunction:: cuda.bindings.runtime.cudaMemPoolExportToShareableHandle -.. autofunction:: cuda.bindings.runtime.cudaMemPoolImportFromShareableHandle -.. autofunction:: cuda.bindings.runtime.cudaMemPoolExportPointer -.. autofunction:: cuda.bindings.runtime.cudaMemPoolImportPointer + [Deprecated] -Unified Addressing ------------------- -This section describes the unified addressing functions of the CUDA runtime application programming interface. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorProfilerAlreadyStopped + [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidConfiguration -**Overview** + This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. 
See :py:obj:`~.cudaDeviceProp` for more device limitations. -CUDA devices can share a unified address space with the host. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorVersionTranslation - For these devices there is no distinction between a device pointer and a host pointer -- the same pointer value may be used to access memory from the host program and from a kernel running on the device (with exceptions enumerated below). + This indicates that the driver is newer than the runtime version and returned graph node parameter information that the runtime does not understand and is unable to translate. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidPitchValue -**Supported Platforms** + This indicates that one or more of the pitch-related parameters passed to the API call is not within the acceptable range for pitch. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidSymbol -Whether or not a device supports unified addressing may be queried by calling cudaGetDeviceProperties() with the device property cudaDeviceProp::unifiedAddressing. -Unified addressing is automatically enabled in 64-bit processes . + This indicates that the symbol name/identifier passed to the API call is not a valid name or identifier. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidHostPointer + This indicates that at least one host pointer passed to the API call is not a valid host pointer. [Deprecated] -**Looking Up Information from Pointer Values** + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidDevicePointer -It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. 
These properties may be queried using the function cudaPointerGetAttributes() + This indicates that at least one device pointer passed to the API call is not a valid device pointer. [Deprecated] -Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. - The copy direction cudaMemcpyDefault may be used to specify that the CUDA runtime should infer the location of the pointer from its value. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidTexture + This indicates that the texture passed to the API call is not a valid texture. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidTextureBinding -**Automatic Mapping of Host Allocated Host Memory** + This indicates that the texture binding is not valid. This occurs if you call :py:obj:`~.cudaGetTextureAlignmentOffset()` with an unbound texture. -All host memory allocated through all devices using cudaMallocHost() and cudaHostAlloc() is always directly accessible from all devices that support unified addressing. This is the case regardless of whether or not the flags cudaHostAllocPortable and cudaHostAllocMapped are specified. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidChannelDescriptor -The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. + This indicates that the channel descriptor passed to the API call is not valid. This occurs if the format is not one of the formats specified by :py:obj:`~.cudaChannelFormatKind`, or if one of the dimensions is invalid. -Note that this is not the case for memory allocated using the flag cudaHostAllocWriteCombined, as discussed below. + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidMemcpyDirection + This indicates that the direction of the memcpy passed to the API call is not one of the types specified by :py:obj:`~.cudaMemcpyKind`. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorAddressOfConstant -**Direct Access of Peer Memory** + This indicated that the user has taken the address of a constant variable, which was forbidden up until the CUDA 3.1 release. [Deprecated] -Upon enabling direct access from a device that supports unified addressing to another peer device that supports unified addressing using cudaDeviceEnablePeerAccess() all memory allocated in the peer device using cudaMalloc() and cudaMallocPitch() will immediately be accessible by the current device. The device pointer value through which any peer's memory may be accessed in the current device is the same pointer value through which that memory may be accessed from the peer device. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTextureFetchFailed + This indicated that a texture fetch was not able to be performed. This was previously used for device emulation of texture operations. [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTextureNotBound -**Exceptions, Disjoint Addressing** + This indicated that a texture was not bound for access. This was previously used for device emulation of texture operations. [Deprecated] -Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSynchronizationError + This indicated that a synchronization operation had failed. This was previously used for some device emulation functions. [Deprecated] -This device address may be queried using cudaHostGetDevicePointer() when a device using unified addressing is current. Either the host or the unified device pointer value may be used to refer to this memory in cudaMemcpy() and similar functions using the cudaMemcpyDefault memory direction. -.. autofunction:: cuda.bindings.runtime.cudaPointerGetAttributes + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidFilterSetting -Peer Device Memory Access -------------------------- -This section describes the peer device memory access functions of the CUDA runtime application programming interface. + This indicates that a non-float texture was being accessed with linear filtering. This is not supported by CUDA. -.. autofunction:: cuda.bindings.runtime.cudaDeviceCanAccessPeer -.. autofunction:: cuda.bindings.runtime.cudaDeviceEnablePeerAccess -.. autofunction:: cuda.bindings.runtime.cudaDeviceDisablePeerAccess -OpenGL Interoperability ------------------------ + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidNormSetting -impl_private + This indicates that an attempt was made to read an unsupported data type as a normalized float. This is not supported by CUDA. -This section describes the OpenGL interoperability functions of the CUDA runtime application programming interface. Note that mapping of OpenGL resources is performed with the graphics API agnostic, resource mapping interface described in Graphics Interopability. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMixedDeviceExecution -.. autoclass:: cuda.bindings.runtime.cudaGLDeviceList - .. autoattribute:: cuda.bindings.runtime.cudaGLDeviceList.cudaGLDeviceListAll + Mixing of device and device emulation code was not allowed. 
[Deprecated] - The CUDA devices for all GPUs used by the current OpenGL context + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotYetImplemented - .. autoattribute:: cuda.bindings.runtime.cudaGLDeviceList.cudaGLDeviceListCurrentFrame + This indicates that the API call is not yet implemented. Production releases of CUDA will never return this error. [Deprecated] - The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMemoryValueTooLarge - .. autoattribute:: cuda.bindings.runtime.cudaGLDeviceList.cudaGLDeviceListNextFrame + This indicated that an emulated device pointer exceeded the 32-bit address range. [Deprecated] - The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStubLibrary -.. autofunction:: cuda.bindings.runtime.cudaGLGetDevices -.. autofunction:: cuda.bindings.runtime.cudaGraphicsGLRegisterImage -.. autofunction:: cuda.bindings.runtime.cudaGraphicsGLRegisterBuffer -Direct3D 9 Interoperability ---------------------------- + This indicates that the CUDA driver that the application has loaded is a stub library. Applications that run with the stub rather than a real driver loaded will result in CUDA API returning this error. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInsufficientDriver -Direct3D 10 Interoperability ----------------------------- + This indicates that the installed NVIDIA CUDA driver is older than the CUDA runtime library. This is not a supported configuration. Users should install an updated NVIDIA display driver to allow the application to run. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCallRequiresNewerDriver -Direct3D 11 Interoperability ----------------------------- + This indicates that the API call requires a newer CUDA driver than the one currently installed. 
Users should install an updated NVIDIA CUDA driver to allow the API call to succeed. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidSurface -VDPAU Interoperability ----------------------- + This indicates that the surface passed to the API call is not a valid surface. -This section describes the VDPAU interoperability functions of the CUDA runtime application programming interface. -.. autofunction:: cuda.bindings.runtime.cudaVDPAUGetDevice -.. autofunction:: cuda.bindings.runtime.cudaVDPAUSetVDPAUDevice -.. autofunction:: cuda.bindings.runtime.cudaGraphicsVDPAURegisterVideoSurface -.. autofunction:: cuda.bindings.runtime.cudaGraphicsVDPAURegisterOutputSurface + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDuplicateVariableName -EGL Interoperability --------------------- -This section describes the EGL interoperability functions of the CUDA runtime application programming interface. + This indicates that multiple global or constant variables (across separate CUDA source files in the application) share the same string name. -.. autofunction:: cuda.bindings.runtime.cudaGraphicsEGLRegisterImage -.. autofunction:: cuda.bindings.runtime.cudaEGLStreamConsumerConnect -.. autofunction:: cuda.bindings.runtime.cudaEGLStreamConsumerConnectWithFlags -.. autofunction:: cuda.bindings.runtime.cudaEGLStreamConsumerDisconnect -.. autofunction:: cuda.bindings.runtime.cudaEGLStreamConsumerAcquireFrame -.. autofunction:: cuda.bindings.runtime.cudaEGLStreamConsumerReleaseFrame -.. autofunction:: cuda.bindings.runtime.cudaEGLStreamProducerConnect -.. autofunction:: cuda.bindings.runtime.cudaEGLStreamProducerDisconnect -.. autofunction:: cuda.bindings.runtime.cudaEGLStreamProducerPresentFrame -.. autofunction:: cuda.bindings.runtime.cudaEGLStreamProducerReturnFrame -.. autofunction:: cuda.bindings.runtime.cudaGraphicsResourceGetMappedEglFrame -.. 
autofunction:: cuda.bindings.runtime.cudaEventCreateFromEGLSync -Graphics Interoperability -------------------------- + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDuplicateTextureName -This section describes the graphics interoperability functions of the CUDA runtime application programming interface. -.. autofunction:: cuda.bindings.runtime.cudaGraphicsUnregisterResource -.. autofunction:: cuda.bindings.runtime.cudaGraphicsResourceSetMapFlags -.. autofunction:: cuda.bindings.runtime.cudaGraphicsMapResources -.. autofunction:: cuda.bindings.runtime.cudaGraphicsUnmapResources -.. autofunction:: cuda.bindings.runtime.cudaGraphicsResourceGetMappedPointer -.. autofunction:: cuda.bindings.runtime.cudaGraphicsSubResourceGetMappedArray -.. autofunction:: cuda.bindings.runtime.cudaGraphicsResourceGetMappedMipmappedArray + This indicates that multiple textures (across separate CUDA source files in the application) share the same string name. -Texture Object Management -------------------------- -This section describes the low level texture object management functions of the CUDA runtime application programming interface. The texture object API is only supported on devices of compute capability 3.0 or higher. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDuplicateSurfaceName -.. autofunction:: cuda.bindings.runtime.cudaGetChannelDesc -.. autofunction:: cuda.bindings.runtime.cudaCreateChannelDesc -.. autofunction:: cuda.bindings.runtime.cudaCreateTextureObject -.. autofunction:: cuda.bindings.runtime.cudaDestroyTextureObject -.. autofunction:: cuda.bindings.runtime.cudaGetTextureObjectResourceDesc -.. autofunction:: cuda.bindings.runtime.cudaGetTextureObjectTextureDesc -.. autofunction:: cuda.bindings.runtime.cudaGetTextureObjectResourceViewDesc -Surface Object Management -------------------------- + This indicates that multiple surfaces (across separate CUDA source files in the application) share the same string name. 
-This section describes the low level texture object management functions of the CUDA runtime application programming interface. The surface object API is only supported on devices of compute capability 3.0 or higher. -.. autofunction:: cuda.bindings.runtime.cudaCreateSurfaceObject -.. autofunction:: cuda.bindings.runtime.cudaDestroySurfaceObject -.. autofunction:: cuda.bindings.runtime.cudaGetSurfaceObjectResourceDesc + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDevicesUnavailable -Version Management ------------------- + This indicates that all CUDA devices are busy or unavailable at the current time. Devices are often busy/unavailable due to use of :py:obj:`~.cudaComputeModeProhibited`, :py:obj:`~.cudaComputeModeExclusiveProcess`, or when long running CUDA kernels have filled up the GPU and are blocking new work from starting. They can also be unavailable due to memory constraints on a device that already has active CUDA work being performed. -.. autofunction:: cuda.bindings.runtime.cudaDriverGetVersion -.. autofunction:: cuda.bindings.runtime.cudaRuntimeGetVersion -.. autofunction:: cuda.bindings.runtime.getLocalRuntimeVersion + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIncompatibleDriverContext -Error Log Management Functions ------------------------------- -This section describes the error log management functions of the CUDA runtime application programming interface. The Error Log Management interface will operate on both the CUDA Driver and CUDA Runtime. + This indicates that the current context is not compatible with the CUDA Runtime. This can only occur if you are using CUDA Runtime/Driver interoperability and have created an existing Driver context using the driver API.
The Driver context may be incompatible either because the Driver context was created using an older version of the API, because the Runtime API call expects a primary driver context and the Driver context is not primary, or because the Driver context has been destroyed. Please see :py:obj:`~.Interactions with the CUDA Driver API` for more information. -.. autoclass:: cuda.bindings.runtime.cudaLogsCallback_t -.. autofunction:: cuda.bindings.runtime.cudaLogsRegisterCallback -.. autofunction:: cuda.bindings.runtime.cudaLogsUnregisterCallback -.. autofunction:: cuda.bindings.runtime.cudaLogsCurrent -.. autofunction:: cuda.bindings.runtime.cudaLogsDumpToFile -.. autofunction:: cuda.bindings.runtime.cudaLogsDumpToMemory -Graph Management ----------------- + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMissingConfiguration -This section describes the graph management functions of CUDA runtime application programming interface. -.. autofunction:: cuda.bindings.runtime.cudaGraphCreate -.. autofunction:: cuda.bindings.runtime.cudaGraphAddKernelNode -.. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeGetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeCopyAttributes -.. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeGetAttribute -.. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeSetAttribute -.. autofunction:: cuda.bindings.runtime.cudaGraphAddMemcpyNode -.. autofunction:: cuda.bindings.runtime.cudaGraphAddMemcpyNode1D -.. autofunction:: cuda.bindings.runtime.cudaGraphMemcpyNodeGetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphMemcpyNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphMemcpyNodeSetParams1D -.. autofunction:: cuda.bindings.runtime.cudaGraphAddMemsetNode -.. autofunction:: cuda.bindings.runtime.cudaGraphMemsetNodeGetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphMemsetNodeSetParams -..
autofunction:: cuda.bindings.runtime.cudaGraphAddHostNode -.. autofunction:: cuda.bindings.runtime.cudaGraphHostNodeGetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphHostNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphAddChildGraphNode -.. autofunction:: cuda.bindings.runtime.cudaGraphChildGraphNodeGetGraph -.. autofunction:: cuda.bindings.runtime.cudaGraphAddEmptyNode -.. autofunction:: cuda.bindings.runtime.cudaGraphAddEventRecordNode -.. autofunction:: cuda.bindings.runtime.cudaGraphEventRecordNodeGetEvent -.. autofunction:: cuda.bindings.runtime.cudaGraphEventRecordNodeSetEvent -.. autofunction:: cuda.bindings.runtime.cudaGraphAddEventWaitNode -.. autofunction:: cuda.bindings.runtime.cudaGraphEventWaitNodeGetEvent -.. autofunction:: cuda.bindings.runtime.cudaGraphEventWaitNodeSetEvent -.. autofunction:: cuda.bindings.runtime.cudaGraphAddExternalSemaphoresSignalNode -.. autofunction:: cuda.bindings.runtime.cudaGraphExternalSemaphoresSignalNodeGetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphExternalSemaphoresSignalNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphAddExternalSemaphoresWaitNode -.. autofunction:: cuda.bindings.runtime.cudaGraphExternalSemaphoresWaitNodeGetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphExternalSemaphoresWaitNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphAddMemAllocNode -.. autofunction:: cuda.bindings.runtime.cudaGraphMemAllocNodeGetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphAddMemFreeNode -.. autofunction:: cuda.bindings.runtime.cudaGraphMemFreeNodeGetParams -.. autofunction:: cuda.bindings.runtime.cudaDeviceGraphMemTrim -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetGraphMemAttribute -.. autofunction:: cuda.bindings.runtime.cudaDeviceSetGraphMemAttribute -.. autofunction:: cuda.bindings.runtime.cudaGraphClone -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeFindInClone -.. 
autofunction:: cuda.bindings.runtime.cudaGraphNodeGetType -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetContainingGraph -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetLocalId -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetToolsId -.. autofunction:: cuda.bindings.runtime.cudaGraphGetId -.. autofunction:: cuda.bindings.runtime.cudaGraphExecGetId -.. autofunction:: cuda.bindings.runtime.cudaGraphGetNodes -.. autofunction:: cuda.bindings.runtime.cudaGraphGetRootNodes -.. autofunction:: cuda.bindings.runtime.cudaGraphGetEdges -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetDependencies -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetDependentNodes -.. autofunction:: cuda.bindings.runtime.cudaGraphAddDependencies -.. autofunction:: cuda.bindings.runtime.cudaGraphRemoveDependencies -.. autofunction:: cuda.bindings.runtime.cudaGraphDestroyNode -.. autofunction:: cuda.bindings.runtime.cudaGraphInstantiate -.. autofunction:: cuda.bindings.runtime.cudaGraphInstantiateWithFlags -.. autofunction:: cuda.bindings.runtime.cudaGraphInstantiateWithParams -.. autofunction:: cuda.bindings.runtime.cudaGraphExecGetFlags -.. autofunction:: cuda.bindings.runtime.cudaGraphExecKernelNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphExecMemcpyNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphExecMemcpyNodeSetParams1D -.. autofunction:: cuda.bindings.runtime.cudaGraphExecMemsetNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphExecHostNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphExecChildGraphNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphExecEventRecordNodeSetEvent -.. autofunction:: cuda.bindings.runtime.cudaGraphExecEventWaitNodeSetEvent -.. autofunction:: cuda.bindings.runtime.cudaGraphExecExternalSemaphoresSignalNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphExecExternalSemaphoresWaitNodeSetParams -.. 
autofunction:: cuda.bindings.runtime.cudaGraphNodeSetEnabled -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetEnabled -.. autofunction:: cuda.bindings.runtime.cudaGraphExecUpdate -.. autofunction:: cuda.bindings.runtime.cudaGraphUpload -.. autofunction:: cuda.bindings.runtime.cudaGraphLaunch -.. autofunction:: cuda.bindings.runtime.cudaGraphExecDestroy -.. autofunction:: cuda.bindings.runtime.cudaGraphDestroy -.. autofunction:: cuda.bindings.runtime.cudaGraphDebugDotPrint -.. autofunction:: cuda.bindings.runtime.cudaUserObjectCreate -.. autofunction:: cuda.bindings.runtime.cudaUserObjectRetain -.. autofunction:: cuda.bindings.runtime.cudaUserObjectRelease -.. autofunction:: cuda.bindings.runtime.cudaGraphRetainUserObject -.. autofunction:: cuda.bindings.runtime.cudaGraphReleaseUserObject -.. autofunction:: cuda.bindings.runtime.cudaGraphAddNode -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphExecNodeSetParams -.. autofunction:: cuda.bindings.runtime.cudaGraphConditionalHandleCreate -.. autofunction:: cuda.bindings.runtime.cudaGraphConditionalHandleCreate_v2 + The device function being invoked (usually via :py:obj:`~.cudaLaunchKernel()`) was not previously configured via the :py:obj:`~.cudaConfigureCall()` function. -Driver Entry Point Access -------------------------- -This section describes the driver entry point access functions of CUDA runtime application programming interface. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorPriorLaunchFailure -.. autofunction:: cuda.bindings.runtime.cudaGetDriverEntryPoint -.. autofunction:: cuda.bindings.runtime.cudaGetDriverEntryPointByVersion -Library Management ------------------- + This indicated that a previous kernel launch failed. This was previously used for device emulation of kernel launches. 
[Deprecated] -This section describes the library management functions of the CUDA runtime application programming interface. -.. autofunction:: cuda.bindings.runtime.cudaLibraryLoadData -.. autofunction:: cuda.bindings.runtime.cudaLibraryLoadFromFile -.. autofunction:: cuda.bindings.runtime.cudaLibraryUnload -.. autofunction:: cuda.bindings.runtime.cudaLibraryGetKernel -.. autofunction:: cuda.bindings.runtime.cudaLibraryGetGlobal -.. autofunction:: cuda.bindings.runtime.cudaLibraryGetManaged -.. autofunction:: cuda.bindings.runtime.cudaLibraryGetUnifiedFunction -.. autofunction:: cuda.bindings.runtime.cudaLibraryGetKernelCount -.. autofunction:: cuda.bindings.runtime.cudaLibraryEnumerateKernels -.. autofunction:: cuda.bindings.runtime.cudaKernelSetAttributeForDevice + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchMaxDepthExceeded -Execution Context Management ----------------------------- -This section describes the execution context management functions of the CUDA runtime application programming interface. + This error indicates that a device runtime grid launch did not occur because the depth of the child grid would exceed the maximum supported number of nested grid launches. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchFileScopedTex + This error indicates that a grid launch did not occur because the kernel uses file-scoped textures which are unsupported by the device runtime. Kernels launched via the device runtime only support textures created with the Texture Object API's. -**Overview** + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchFileScopedSurf -A CUDA execution context cudaExecutionContext_t serves as an abstraction for the contexts exposed by the CUDA Runtime, specifically green contexts and the primary context, and provides a unified programming model and API interface for contexts in the Runtime. 
+ This error indicates that a grid launch did not occur because the kernel uses file-scoped surfaces which are unsupported by the device runtime. Kernels launched via the device runtime only support surfaces created with the Surface Object API's. -There are two primary ways today to obtain an execution context: -- cudaDeviceGetExecutionCtx: Returns the execution context that corresponds to the primary context of the specified device. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSyncDepthExceeded + This error indicates that a call to :py:obj:`~.cudaDeviceSynchronize` made from the device runtime failed because the call was made at grid depth greater than either the default (2 levels of grids) or user specified device limit :py:obj:`~.cudaLimitDevRuntimeSyncDepth`. To be able to synchronize on launched grids at a greater depth successfully, the maximum nested depth at which :py:obj:`~.cudaDeviceSynchronize` will be called must be specified with the :py:obj:`~.cudaLimitDevRuntimeSyncDepth` limit to the :py:obj:`~.cudaDeviceSetLimit` api before the host-side launch of a kernel using the device runtime. Keep in mind that additional levels of sync depth require the runtime to reserve large amounts of device memory that cannot be used for user allocations. Note that :py:obj:`~.cudaDeviceSynchronize` made from device runtime is only supported on devices of compute capability < 9.0. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchPendingCountExceeded + This error indicates that a device runtime grid launch failed because the launch would exceed the limit :py:obj:`~.cudaLimitDevRuntimePendingLaunchCount`. For this launch to proceed successfully, :py:obj:`~.cudaDeviceSetLimit` must be called to set the :py:obj:`~.cudaLimitDevRuntimePendingLaunchCount` to be higher than the upper bound of outstanding launches that can be issued to the device runtime.
Keep in mind that raising the limit of pending device runtime launches will require the runtime to reserve device memory that cannot be used for user allocations. -- cudaGreenCtxCreate: Creates a green context with the specified resources and returns an execution context. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidDeviceFunction + The requested device function does not exist or is not compiled for the proper device architecture. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNoDevice + This indicates that no CUDA-capable devices were detected by the installed CUDA driver. -Once you have an execution context at hand, you can perform context-level operations via the CUDA Runtime APIs. This includes: + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidDevice -- Submitting work via streams created with cudaExecutionCtxStreamCreate. + This indicates that the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDeviceNotLicensed + This indicates that the device doesn't have a valid Grid License. -- Querying context via cudaExecutionCtxGetDevResource, cudaExecutionCtxGetDevice, etc. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSoftwareValidityNotEstablished + By default, the CUDA runtime may perform a minimal set of self-tests, as well as CUDA driver tests, to establish the validity of both. Introduced in CUDA 11.2, this error return indicates that at least one of these tests has failed and the validity of either the runtime or the driver could not be established. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStartupFailure + This indicates an internal startup failure in the CUDA runtime. 
-- Synchronizing and tracking context-level operations via cudaExecutionCtxSynchronize, cudaExecutionCtxRecordEvent, cudaExecutionCtxWaitEvent. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidKernelImage + This indicates that the device kernel image is invalid. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDeviceUninitialized -- Performing context-level graph node operations via cudaGraphAddNode by specifying the context in ``nodeParams``\ . Note that individual node creation APIs, such as cudaGraphAddKernelNode, do not support specifying an execution context. + This most frequently indicates that there is no context bound to the current thread. This can also be returned if the context passed to an API call is not a valid handle (such as a context that has had :py:obj:`~.cuCtxDestroy()` invoked on it). This can also be returned if a user mixes different API versions (i.e. 3010 context with 3020 API calls). See :py:obj:`~.cuCtxGetApiVersion()` for more details. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMapBufferObjectFailed + This indicates that the buffer object could not be mapped. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnmapBufferObjectFailed + This indicates that the buffer object could not be unmapped. -Note: The above APIs take in an explicit cudaExecutionContext_t handle and ignores the context that is current to the calling thread. This enables explicit context-based programming without relying on thread-local state. If no context is specified, the APIs return cudaErrorInvalidValue. -Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into driver-level contexts, such as ::CUcontext or ::CUgreenCtx. + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorArrayIsMapped + This indicates that the specified array is currently mapped and thus cannot be destroyed. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorAlreadyMapped -**Lifetime of CUDA Resources** + This indicates that the resource is already mapped. -The lifetime of CUDA resources (memory, streams, events, modules, etc) is not tied to the lifetime of the execution context. Their lifetime is tied to the device against which they were created. As such, usage of cudaDeviceReset() should be avoided to persist the lifetime of these resources. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNoKernelImageForDevice + This indicates that there is no kernel image available that is suitable for the device. This can occur when a user specifies code generation options for a particular CUDA source file that do not include the corresponding device configuration. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorAlreadyAcquired -**APIs Operating on Current Context** + This indicates that a resource has already been acquired. -The CUDA runtime does not provide a way to set an execution context as current. Since, the majority of the runtime APIs operate on the current context, we document below how the developer can work with these APIs. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotMapped + This indicates that a resource is not mapped. -**APIs Operating on Device Resources** + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotMappedAsArray -To work with these APIs (for example, cudaMalloc, cudaEventCreate, etc), developers are expected to call cudaSetDevice() prior to invoking them. Doing so does not impact functional correctness as these APIs operate on resources that are device-wide. If users have a context handle at hand, they can get the device handle from the context handle using cudaExecutionCtxGetDevice(). 
+ This indicates that a mapped resource is not available for access as an array. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotMappedAsPointer + This indicates that a mapped resource is not available for access as a pointer. -**APIs Operating on Context Resources** + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorECCUncorrectable -These APIs (for example, cudaLaunchKernel, cudaMemcpyAsync, cudaMemsetAsync, etc) take in a stream and resources are inferred from the context bound to the stream at creation. See cudaExecutionCtxStreamCreate for more details. Developers are expected to use the stream-based APIs for context awareness and always pass an explicit stream handle to ensure context-awareness, and avoid reliance on the default NULL stream, which implicitly binds to the current context. + This indicates that an uncorrectable ECC error was detected during execution. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnsupportedLimit + This indicates that the :py:obj:`~.cudaLimit` passed to the API call is not supported by the active device. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDeviceAlreadyInUse -**Green Contexts** + This indicates that a call tried to access an exclusive-thread device that is already in use by a different thread. -Green contexts are a lightweight alternative to traditional contexts, that can be used to select a subset of device resources. This allows the developer to, for example, select SMs from distinct spatial partitions of the GPU and target them via CUDA stream operations, kernel launches, etc. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorPeerAccessUnsupported -Here are the broad initial steps to follow to get started: -- (1) Start with an initial set of resources. For SM resources, they can be fetched via cudaDeviceGetDevResource. 
In case of workqueues, a new configuration can be used or an existing one queried via the cudaDeviceGetDevResource API. + This error indicates that P2P access is not supported across the given devices. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidPtx + A PTX compilation failed. The runtime may fall back to compiling PTX if an application does not contain a suitable binary for the current device. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidGraphicsContext -- (2) Modify these resources by either partitioning them (in case of SMs) or changing the configuration (in case of workqueues). To partition SMs, we recommend cudaDevSmResourceSplit. Changing the workqueue configuration can be done directly in place. + This indicates an error with the OpenGL or DirectX context. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNvlinkUncorrectable + This indicates that an uncorrectable NVLink error was detected during the execution. -- (3) Finalize the specification of resources by creating a descriptor via cudaDevResourceGenerateDesc. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorJitCompilerNotFound + This indicates that the PTX JIT compiler library was not found. The JIT Compiler library is used for PTX compilation. The runtime may fall back to compiling PTX if an application does not contain a suitable binary for the current device. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnsupportedPtxVersion + This indicates that the provided PTX was compiled with an unsupported toolchain. The most common reason for this, is the PTX was generated by a compiler newer than what is supported by the CUDA driver and PTX JIT compiler. -- (4) Create a green context via cudaGreenCtxCreate. This provisions the resource, such as workqueues (until this step it was only a configuration specification). + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorJitCompilationDisabled + This indicates that the JIT compilation was disabled. The JIT compilation compiles PTX. The runtime may fall back to compiling PTX if an application does not contain a suitable binary for the current device. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnsupportedExecAffinity -- (5) Create a stream via cudaExecutionCtxStreamCreate, and use it throughout your application. + This indicates that the provided execution affinity is not supported by the device. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnsupportedDevSideSync + This indicates that the code to be compiled by the PTX JIT contains an unsupported call to cudaDeviceSynchronize. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorContained + This indicates that an exception occurred on the device that is now contained by the GPU's error containment capability. Common causes are: a. Certain types of invalid accesses of peer GPU memory over nvlink; b. Certain classes of hardware errors. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. -SMs -There are two possible partition operations - with cudaDevSmResourceSplitByCount the partitions created have to follow default SM count granularity requirements, so it will often be rounded up and aligned to a default value. On the other hand, cudaDevSmResourceSplit is explicit and allows for creation of non-equal groups. It will not round up automatically - instead it is the developer’s responsibility to query and set the correct values. These requirements can be queried with cudaDeviceGetDevResource to determine the alignment granularity (sm.smCoscheduledAlignment). A general guideline on the default values for each compute architecture: + ..
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidSource -- On Compute Architecture 7.X, 8.X, and all Tegra SoC: + This indicates that the device kernel source is invalid. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorFileNotFound - - The smCount must be a multiple of 2. + This indicates that the file specified was not found. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSharedObjectSymbolNotFound + This indicates that a link to a shared object failed to resolve. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSharedObjectInitFailed - - The alignment (and default value of coscheduledSmCount) is 2. + This indicates that initialization of a shared object failed. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorOperatingSystem + This error indicates that an OS call failed. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidResourceHandle -- On Compute Architecture 9.0+: + This indicates that a resource handle passed to the API call was not valid. Resource handles are opaque types like :py:obj:`~.cudaStream_t` and :py:obj:`~.cudaEvent_t`. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIllegalState + This indicates that a resource required by the API call is not in a valid state to perform the requested operation. - - The smCount must be a multiple of 8, or coscheduledSmCount if provided. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLossyQuery + This indicates an attempt was made to introspect an object in a way that would discard semantically important information. This is either due to the object using functionality newer than the API version used to introspect it or omission of optional return arguments. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSymbolNotFound
While the maximum value for coscheduled SM count is 32 on all Compute Architecture 9.0+, it's recommended to follow cluster size requirements. The portable cluster size and the max cluster size should be used in order to benefit from this co-scheduling. + This indicates that a named symbol was not found. Examples of symbols are global/constant variable names, driver function names, texture names, and surface names. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotReady + This indicates that asynchronous operations issued previously have not completed yet. This result is not actually an error, but must be indicated differently than :py:obj:`~.cudaSuccess` (which indicates completion). Calls that may return this value include :py:obj:`~.cudaEventQuery()` and :py:obj:`~.cudaStreamQuery()`. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIllegalAddress + The device encountered a load or store instruction on an invalid memory address. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchOutOfResources -Workqueues -For ``cudaDevResourceTypeWorkqueueConfig``\ , the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit``\ field. The ``sharingScope``\ field determines how workqueue resources are shared: + This indicates that a launch did not occur because it did not have appropriate resources. Although this error is similar to :py:obj:`~.cudaErrorInvalidConfiguration`, this error usually indicates that the user has attempted to pass too many arguments to the device kernel, or the kernel launch specifies too many threads for the kernel's register count. -- ``cudaDevWorkqueueConfigScopeDeviceCtx:``\ Use all shared workqueue resources across all contexts (default driver behavior). + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchTimeout + This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device attribute :py:obj:`~.cudaDevAttrKernelExecTimeout` for more information. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchIncompatibleTexturing -- ``cudaDevWorkqueueConfigScopeGreenCtxBalanced:``\ When possible, use non-overlapping workqueue resources with other balanced green contexts. + This error indicates a kernel launch that uses an incompatible texturing mode. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorPeerAccessAlreadyEnabled + This error indicates that a call to :py:obj:`~.cudaDeviceEnablePeerAccess()` is trying to re-enable peer addressing on from a context which has already had peer addressing enabled. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorPeerAccessNotEnabled + This error indicates that :py:obj:`~.cudaDeviceDisablePeerAccess()` is trying to disable peer addressing which has not been enabled yet via :py:obj:`~.cudaDeviceEnablePeerAccess()`. -The maximum concurrency limit depends on ::CUDA_DEVICE_MAX_CONNECTIONS and can be queried from the device via cudaDeviceGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping. -For ``cudaDevResourceTypeWorkqueue``\ , the resource represents a pre-existing workqueue that can be retrieved from existing execution contexts. This allows reusing workqueue resources across different execution contexts. + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSetOnActiveProcess -On Concurrency -Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources that could cause a dependency. Using a combination of disjoint SMs and ``cudaDevWorkqueueConfigScopeGreenCtxBalanced``\ workqueue configurations can provide the best chance of avoiding interference. More resources will be added in the future to provide stronger guarantees. + This indicates that the user has called :py:obj:`~.cudaSetValidDevices()`, :py:obj:`~.cudaSetDeviceFlags()`, :py:obj:`~.cudaD3D9SetDirect3DDevice()`, :py:obj:`~.cudaD3D10SetDirect3DDevice`, :py:obj:`~.cudaD3D11SetDirect3DDevice()`, or :py:obj:`~.cudaVDPAUSetVDPAUDevice()` after initializing the CUDA runtime by calling non-device management operations (allocating memory and launching kernels are examples of non-device management operations). This error can also be returned if using runtime/driver interoperability and there is an existing :py:obj:`~.CUcontext` active on the host thread. -Additionally, there are two known scenarios, where its possible for the workload to run on more SMs than was provisioned (but never less). + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorContextIsDestroyed -- On Volta+ MPS: When ``CUDA_MPS_ACTIVE_THREAD_PERCENTAGE``\ is used, the set of SMs that are used for running kernels can be scaled up to the value of SMs used for the MPS client. + This error indicates that the context current to the calling thread has been destroyed using :py:obj:`~.cuCtxDestroy`, or is a primary context which has not yet been initialized. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorAssert + An assert triggered in device code during kernel execution. The device cannot be used again. All existing allocations are invalid. 
To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTooManyPeers -- On Compute Architecture 9.x: When a module with dynamic parallelism (CDP) is loaded, all future kernels running under green contexts may use and share an additional set of 2 SMs. -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetDevResource -.. autofunction:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount -.. autofunction:: cuda.bindings.runtime.cudaDevSmResourceSplit -.. autofunction:: cuda.bindings.runtime.cudaDevResourceGenerateDesc -.. autofunction:: cuda.bindings.runtime.cudaGreenCtxCreate -.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxDestroy -.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxGetDevResource -.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxGetDevice -.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxGetId -.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxStreamCreate -.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxSynchronize -.. autofunction:: cuda.bindings.runtime.cudaStreamGetDevResource -.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxRecordEvent -.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxWaitEvent -.. autofunction:: cuda.bindings.runtime.cudaDeviceGetExecutionCtx + This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to :py:obj:`~.cudaEnablePeerAccess()`. -C++ API Routines ----------------- -C++-style interface built on top of CUDA runtime API. -impl_private + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorHostMemoryAlreadyRegistered + This error indicates that the memory range passed to :py:obj:`~.cudaHostRegister()` has already been registered. + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorHostMemoryNotRegistered -This section describes the C++ high level API functions of the CUDA runtime application programming interface. To use these functions, your application needs to be compiled with the ``nvcc``\ compiler. + This error indicates that the pointer passed to :py:obj:`~.cudaHostUnregister()` does not correspond to any currently registered memory region. -Interactions with the CUDA Driver API -------------------------------------- + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorHardwareStackError -This section describes the interactions between the CUDA Driver API and the CUDA Runtime API + Device encountered an error in the call stack during kernel execution, possibly due to stack corruption or exceeding the stack size limit. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIllegalInstruction -**Execution Contexts** + The device encountered an illegal instruction during kernel execution. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMisalignedAddress -The CUDA Runtime provides cudaExecutionContext_t as an abstraction over driver-level contexts—specifically, green contexts and the primary context. -There are two primary ways to obtain an execution context: + The device encountered a load or store instruction on a memory address which is not aligned. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.
-- cudaDeviceGetExecutionCtx: Returns the execution context that corresponds to the primary context of the specified device. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidAddressSpace + While executing a kernel, the device encountered an instruction which can only operate on memory locations in certain address spaces (global, shared, or local), but was supplied a memory address not belonging to an allowed address space. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidPc -- cudaGreenCtxCreate: Creates a green context with the specified resources and returns an execution context. + The device encountered an invalid program counter. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchFailure + An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. Less common cases can be system specific - more information about these cases can be found in the system specific user guide. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCooperativeLaunchTooLarge + This error indicates that the number of blocks launched per grid for a kernel that was launched via either :py:obj:`~.cudaLaunchCooperativeKernel` exceeds the maximum number of blocks as allowed by :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor` or :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` times the number of multiprocessors as specified by the device attribute :py:obj:`~.cudaDevAttrMultiProcessorCount`. -Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into a ::CUcontext or ::CUgreenCtx. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTensorMemoryLeak + An exception occurred on the device while exiting a kernel using tensor memory: the tensor memory was not completely deallocated. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. -**Primary Context (aka Device Execution Context)** + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotPermitted + This error indicates the attempted operation is not permitted. -The primary context is the default execution context associated with a device in the Runtime. It can be obtained via a call to cudaDeviceGetExecutionCtx(). There is a one-to-one mapping between CUDA devices in the runtime and their primary contexts within a process. -From the CUDA Runtime’s perspective, a device and its primary context are functionally synonymous. + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotSupported -Unless explicitly overridden, either by making a different context current via the Driver API (e.g., ::cuCtxSetCurrent()) or by using an explicit execution context handle, the Runtime will implicitly initialize and use the primary context for API calls as needed. + This error indicates the attempted operation is not supported on the current system or device. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSystemNotReady -**Initialization and Tear-Down** + This error indicates that the system is not yet ready to start any CUDA work. To continue using CUDA, verify the system configuration is in a valid state and all required driver daemons are actively running. More information about this error can be found in the system specific user guide. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSystemDriverMismatch -Unless an explicit execution context is specified (see “Execution Context Management” for APIs), CUDA Runtime API calls operate on the CUDA Driver ::CUcontext which is current to the calling host thread. If no ::CUcontext is current to the calling thread when a CUDA Runtime API call which requires an active context is made, then the primary context (device execution context) for a device will be selected, made current to the calling thread, and initialized. The context will be initialized using the parameters specified by the CUDA Runtime API functions cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), ::cudaD3D10SetDirect3DDevice(), ::cudaD3D11SetDirect3DDevice(), cudaGLSetGLDevice(), and cudaVDPAUSetVDPAUDevice(). Note that these functions will fail with cudaErrorSetOnActiveProcess if they are called when the primary context for the specified device has already been initialized, except for cudaSetDeviceFlags() which will simply overwrite the previous settings. 
-The function cudaInitDevice() ensures that the primary context is initialized for the requested device but does not make it current to the calling thread. + This error indicates that there is a mismatch between the versions of the display driver and the CUDA driver. Refer to the compatibility documentation for supported versions. -The function cudaSetDevice() initializes the primary context for the specified device and makes it current to the calling thread by calling ::cuCtxSetCurrent(). -Primary contexts will remain active until they are explicitly deinitialized using cudaDeviceReset(). The function cudaDeviceReset() will deinitialize the primary context for the calling thread's current device immediately. The context will remain current to all of the threads that it was current to. The next CUDA Runtime API call on any thread which requires an active context will trigger the reinitialization of that device's primary context. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCompatNotSupportedOnDevice -Note that primary contexts are shared resources. It is recommended that the primary context not be reset except just before exit or to recover from an unspecified launch failure. + This error indicates that the system was upgraded to run with forward compatibility but the visible hardware detected by CUDA does not support this configuration. Refer to the compatibility documentation for the supported hardware matrix or ensure that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES environment variable. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsConnectionFailed -**CUcontext Interoperability** + This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsRpcFailure -Note that the use of multiple ::CUcontext s per device within a single process will substantially degrade performance and is strongly discouraged. Instead, it is highly recommended to either use execution contexts cudaExecutionContext_t or the implicit one-to-one device-to-primary context mapping for the process provided by the CUDA Runtime API. -If a non-primary ::CUcontext created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ::CUcontext, with some exceptions listed below. Interoperability between data types is discussed in the following sections. + This error indicates that the remote procedural call between the MPS server and the MPS client failed. -The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary CUcontext is current. To use the peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features. -All CUDA Runtime API state (e.g, global variables' addresses and values) travels with its underlying ::CUcontext. In particular, if a ::CUcontext is moved from one thread to another then all CUDA Runtime API state will move to that thread as well. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsServerNotReady -Please note that attaching to legacy CUcontext (those with a version of 3010 as returned by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return cudaErrorIncompatibleDriverContext in such cases. + This error indicates that the MPS server is not ready to accept new MPS client requests. This error can be returned when the MPS server is in the process of recovering from a fatal failure. + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsMaxClientsReached -**Interactions between CUstream and cudaStream_t** + This error indicates that the hardware resources required to create MPS client have been exhausted. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsMaxConnectionsReached -The types ::CUstream and cudaStream_t are identical and may be used interchangeably. + This error indicates that the hardware resources required for device connections have been exhausted. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsClientTerminated -**Interactions between CUevent and cudaEvent_t** + This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCdpNotSupported -The types ::CUevent and cudaEvent_t are identical and may be used interchangeably. + This error indicates that the program is using CUDA Dynamic Parallelism, but the current configuration, like MPS, does not support it. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCdpVersionMismatch -**Interactions between CUarray and cudaArray_t** + This error indicates that the program contains an unsupported interaction between different versions of CUDA Dynamic Parallelism. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureUnsupported -The types ::CUarray and struct ::cudaArray * represent the same data type and may be used interchangeably by casting the two types between each other. -In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *, it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *. + The operation is not permitted when the stream is capturing.
-In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray, it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray . + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureInvalidated + The current capture sequence on the stream has been invalidated due to a previous error. -**Interactions between CUgraphicsResource and cudaGraphicsResource_t** + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureMerge + The operation would have resulted in a merge of two independent capture sequences. -The types ::CUgraphicsResource and cudaGraphicsResource_t represent the same data type and may be used interchangeably by casting the two types between each other. -In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource to a cudaGraphicsResource_t. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureUnmatched -In order to use a cudaGraphicsResource_t in a CUDA Driver API function which takes a ::CUgraphicsResource, it is necessary to explicitly cast the cudaGraphicsResource_t to a ::CUgraphicsResource. + The capture was not initiated in this stream. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureUnjoined -**Interactions between CUtexObject and cudaTextureObject_t** + The capture sequence contains a fork that was not joined to the primary stream. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureIsolation -The types ::CUtexObject and cudaTextureObject_t represent the same data type and may be used interchangeably by casting the two types between each other. -In order to use a ::CUtexObject in a CUDA Runtime API function which takes a cudaTextureObject_t, it is necessary to explicitly cast the ::CUtexObject to a cudaTextureObject_t. 
+ A dependency would have been created which crosses the capture sequence boundary. Only implicit in-stream ordering dependencies are allowed to cross the boundary. -In order to use a cudaTextureObject_t in a CUDA Driver API function which takes a ::CUtexObject, it is necessary to explicitly cast the cudaTextureObject_t to a ::CUtexObject. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureImplicit + The operation would have resulted in a disallowed implicit dependency on a current capture sequence from cudaStreamLegacy. -**Interactions between CUsurfObject and cudaSurfaceObject_t** + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCapturedEvent + The operation is not permitted on an event which was last recorded in a capturing stream. -The types ::CUsurfObject and cudaSurfaceObject_t represent the same data type and may be used interchangeably by casting the two types between each other. -In order to use a ::CUsurfObject in a CUDA Runtime API function which takes a cudaSurfaceObject_t, it is necessary to explicitly cast the ::CUsurfObject to a cudaSurfaceObject_t. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureWrongThread -In order to use a cudaSurfaceObject_t in a CUDA Driver API function which takes a ::CUsurfObject, it is necessary to explicitly cast the cudaSurfaceObject_t to a ::CUsurfObject. + A stream capture sequence not initiated with the :py:obj:`~.cudaStreamCaptureModeRelaxed` argument to :py:obj:`~.cudaStreamBeginCapture` was passed to :py:obj:`~.cudaStreamEndCapture` in a different thread. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTimeout -**Interactions between CUfunction and cudaFunction_t** + This indicates that the wait operation has timed out. + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorGraphExecUpdateFailure -The types ::CUfunction and cudaFunction_t represent the same data type and may be used interchangeably by casting the two types between each other. -In order to use a cudaFunction_t in a CUDA Driver API function which takes a ::CUfunction, it is necessary to explicitly cast the cudaFunction_t to a ::CUfunction. + This error indicates that the graph update was not performed because it included changes which violated constraints specific to instantiated graph update. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorExternalDevice + This indicates that an error has occurred in a device outside of GPU. It can be a synchronous error w.r.t. CUDA API or an asynchronous error from the external device. In case of asynchronous error, it means that if cuda was waiting for an external device's signal before consuming shared data, the external device signaled an error indicating that the data is not valid for consumption. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. In case of synchronous error, it means that one or more external devices have encountered an error and cannot complete the operation. -**Interactions between CUkernel and cudaKernel_t** + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidClusterSize -The types ::CUkernel and cudaKernel_t represent the same data type and may be used interchangeably by casting the two types between each other. + This indicates that a kernel launch error has occurred due to cluster misconfiguration. -In order to use a cudaKernel_t in a CUDA Driver API function which takes a ::CUkernel, it is necessary to explicitly cast the cudaKernel_t to a ::CUkernel. -.. autofunction:: cuda.bindings.runtime.cudaGetKernel + .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorFunctionNotLoaded -Data types used by CUDA Runtime -------------------------------- + Indicates that a function handle is not loaded when calling an API that requires a loaded function. -.. autoclass:: cuda.bindings.runtime.cudaTextureDesc -.. autoclass:: cuda.bindings.runtime.cudaEglPlaneDesc_st -.. autoclass:: cuda.bindings.runtime.cudaEglFrame_st -.. autoclass:: cuda.bindings.runtime.cudaChannelFormatDesc -.. autoclass:: cuda.bindings.runtime.cudaArraySparseProperties -.. autoclass:: cuda.bindings.runtime.cudaArrayMemoryRequirements -.. autoclass:: cuda.bindings.runtime.cudaPitchedPtr -.. autoclass:: cuda.bindings.runtime.cudaExtent -.. autoclass:: cuda.bindings.runtime.cudaPos -.. autoclass:: cuda.bindings.runtime.cudaMemcpy3DParms -.. autoclass:: cuda.bindings.runtime.cudaMemcpyNodeParams -.. autoclass:: cuda.bindings.runtime.cudaMemcpy3DPeerParms -.. autoclass:: cuda.bindings.runtime.cudaMemsetParams -.. autoclass:: cuda.bindings.runtime.cudaMemsetParamsV2 -.. autoclass:: cuda.bindings.runtime.cudaAccessPolicyWindow -.. autoclass:: cuda.bindings.runtime.cudaHostNodeParams -.. autoclass:: cuda.bindings.runtime.cudaHostNodeParamsV2 -.. autoclass:: cuda.bindings.runtime.cudaResourceDesc -.. autoclass:: cuda.bindings.runtime.cudaResourceViewDesc -.. autoclass:: cuda.bindings.runtime.cudaPointerAttributes -.. autoclass:: cuda.bindings.runtime.cudaFuncAttributes -.. autoclass:: cuda.bindings.runtime.cudaMemLocation -.. autoclass:: cuda.bindings.runtime.cudaMemAccessDesc -.. autoclass:: cuda.bindings.runtime.cudaMemPoolProps -.. autoclass:: cuda.bindings.runtime.cudaMemPoolPtrExportData -.. autoclass:: cuda.bindings.runtime.cudaMemAllocNodeParams -.. autoclass:: cuda.bindings.runtime.cudaMemAllocNodeParamsV2 -.. autoclass:: cuda.bindings.runtime.cudaMemFreeNodeParams -.. autoclass:: cuda.bindings.runtime.cudaMemcpyAttributes -.. autoclass:: cuda.bindings.runtime.cudaOffset3D -..
autoclass:: cuda.bindings.runtime.cudaMemcpy3DOperand -.. autoclass:: cuda.bindings.runtime.cudaMemcpy3DBatchOp -.. autoclass:: cuda.bindings.runtime.CUuuid_st -.. autoclass:: cuda.bindings.runtime.cudaDeviceProp -.. autoclass:: cuda.bindings.runtime.cudaIpcEventHandle_st -.. autoclass:: cuda.bindings.runtime.cudaIpcMemHandle_st -.. autoclass:: cuda.bindings.runtime.cudaMemFabricHandle_st -.. autoclass:: cuda.bindings.runtime.cudaExternalMemoryHandleDesc -.. autoclass:: cuda.bindings.runtime.cudaExternalMemoryBufferDesc -.. autoclass:: cuda.bindings.runtime.cudaExternalMemoryMipmappedArrayDesc -.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreHandleDesc -.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreSignalParams -.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreWaitParams -.. autoclass:: cuda.bindings.runtime.cudaDevSmResource -.. autoclass:: cuda.bindings.runtime.cudaDevWorkqueueConfigResource -.. autoclass:: cuda.bindings.runtime.cudaDevWorkqueueResource -.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceGroupParams_st -.. autoclass:: cuda.bindings.runtime.cudaDevResource_st -.. autoclass:: cuda.bindings.runtime.cudalibraryHostUniversalFunctionAndDataTable -.. autoclass:: cuda.bindings.runtime.cudaKernelNodeParams -.. autoclass:: cuda.bindings.runtime.cudaKernelNodeParamsV2 -.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreSignalNodeParams -.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreSignalNodeParamsV2 -.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreWaitNodeParams -.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreWaitNodeParamsV2 -.. autoclass:: cuda.bindings.runtime.cudaConditionalNodeParams -.. autoclass:: cuda.bindings.runtime.cudaChildGraphNodeParams -.. autoclass:: cuda.bindings.runtime.cudaEventRecordNodeParams -.. autoclass:: cuda.bindings.runtime.cudaEventWaitNodeParams -.. autoclass:: cuda.bindings.runtime.cudaGraphNodeParams -.. 
autoclass:: cuda.bindings.runtime.cudaGraphEdgeData_st -.. autoclass:: cuda.bindings.runtime.cudaGraphInstantiateParams_st -.. autoclass:: cuda.bindings.runtime.cudaGraphExecUpdateResultInfo_st -.. autoclass:: cuda.bindings.runtime.cudaGraphKernelNodeUpdate -.. autoclass:: cuda.bindings.runtime.cudaLaunchMemSyncDomainMap_st -.. autoclass:: cuda.bindings.runtime.cudaLaunchAttributeValue -.. autoclass:: cuda.bindings.runtime.cudaLaunchAttribute_st -.. autoclass:: cuda.bindings.runtime.cudaAsyncNotificationInfo -.. autoclass:: cuda.bindings.runtime.cudaTextureAddressMode + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidResourceType - .. autoattribute:: cuda.bindings.runtime.cudaTextureAddressMode.cudaAddressModeWrap + This error indicates one or more resources passed in are not valid resource types for the operation. - Wrapping address mode + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidResourceConfiguration - .. autoattribute:: cuda.bindings.runtime.cudaTextureAddressMode.cudaAddressModeClamp + This error indicates one or more resources are insufficient or non-applicable for the operation. - Clamp to edge address mode + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamDetached - .. autoattribute:: cuda.bindings.runtime.cudaTextureAddressMode.cudaAddressModeMirror + This error indicates that the requested operation is not permitted because the stream is in a detached state. This can occur if the green context associated with the stream has been destroyed, limiting the stream's operational capabilities. - Mirror address mode + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnknown - .. autoattribute:: cuda.bindings.runtime.cudaTextureAddressMode.cudaAddressModeBorder + This indicates that an unknown internal error has occurred. - Border address mode -.. autoclass:: cuda.bindings.runtime.cudaTextureFilterMode + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorApiFailureBase - .. 
autoattribute:: cuda.bindings.runtime.cudaTextureFilterMode.cudaFilterModePoint +.. autoclass:: cuda.bindings.runtime.cudaChannelFormatKind + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSigned - Point filter mode + Signed channel format - .. autoattribute:: cuda.bindings.runtime.cudaTextureFilterMode.cudaFilterModeLinear + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - Linear filter mode -.. autoclass:: cuda.bindings.runtime.cudaTextureReadMode + Unsigned channel format - .. autoattribute:: cuda.bindings.runtime.cudaTextureReadMode.cudaReadModeElementType + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindFloat - Read texture as specified element type + Float channel format - .. autoattribute:: cuda.bindings.runtime.cudaTextureReadMode.cudaReadModeNormalizedFloat + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindNone - Read texture as normalized float -.. autoclass:: cuda.bindings.runtime.cudaSurfaceBoundaryMode + No channel format - .. autoattribute:: cuda.bindings.runtime.cudaSurfaceBoundaryMode.cudaBoundaryModeZero + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindNV12 - Zero boundary mode + Unsigned 8-bit integers, planar 4:2:0 YUV format - .. autoattribute:: cuda.bindings.runtime.cudaSurfaceBoundaryMode.cudaBoundaryModeClamp + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized8X1 - Clamp boundary mode + 1 channel unsigned 8-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaSurfaceBoundaryMode.cudaBoundaryModeTrap + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized8X2 - Trap boundary mode -.. autoclass:: cuda.bindings.runtime.cudaSurfaceFormatMode + 2 channel unsigned 8-bit normalized integer - .. 
autoattribute:: cuda.bindings.runtime.cudaSurfaceFormatMode.cudaFormatModeForced + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized8X4 - Forced format mode + 4 channel unsigned 8-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaSurfaceFormatMode.cudaFormatModeAuto + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized16X1 - Auto format mode -.. autoclass:: cuda.bindings.runtime.cudaEglFrameType + 1 channel unsigned 16-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaEglFrameType.cudaEglFrameTypeArray + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized16X2 - Frame type CUDA array + 2 channel unsigned 16-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaEglFrameType.cudaEglFrameTypePitch + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized16X4 - Frame type CUDA pointer -.. autoclass:: cuda.bindings.runtime.cudaEglResourceLocationFlags + 4 channel unsigned 16-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaEglResourceLocationFlags.cudaEglResourceLocationSysmem + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized8X1 - Resource location sysmem + 1 channel signed 8-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaEglResourceLocationFlags.cudaEglResourceLocationVidmem + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized8X2 - Resource location vidmem -.. autoclass:: cuda.bindings.runtime.cudaEglColorFormat + 2 channel signed 8-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420Planar + .. 
autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized8X4 - Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + 4 channel signed 8-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420SemiPlanar + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized16X1 - Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. + 1 channel signed 16-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV422Planar + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized16X2 - Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. + 2 channel signed 16-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV422SemiPlanar + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized16X4 - Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. + 4 channel signed 16-bit normalized integer - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatARGB + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed1 - R/G/B/A four channels in one surface with BGRA byte ordering. + 4 channel unsigned normalized block-compressed (BC1 compression) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatRGBA + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed1SRGB - R/G/B/A four channels in one surface with ABGR byte ordering. + 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding - .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatL + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed2 - single luminance channel in one surface. + 4 channel unsigned normalized block-compressed (BC2 compression) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatR + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed2SRGB - single color channel in one surface. + 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV444Planar + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed3 - Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. + 4 channel unsigned normalized block-compressed (BC3 compression) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV444SemiPlanar + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed3SRGB - Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. + 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUYV422 + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed4 - Y, U, V in one surface, interleaved as UYVY in one channel. + 1 channel unsigned normalized block-compressed (BC4 compression) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatUYVY422 + .. 
autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedBlockCompressed4 - Y, U, V in one surface, interleaved as YUYV in one channel. + 1 channel signed normalized block-compressed (BC4 compression) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatABGR + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed5 - R/G/B/A four channels in one surface with RGBA byte ordering. + 2 channel unsigned normalized block-compressed (BC5 compression) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBGRA + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedBlockCompressed5 - R/G/B/A four channels in one surface with ARGB byte ordering. + 2 channel signed normalized block-compressed (BC5 compression) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatA + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed6H - Alpha color format - one channel in one surface. + 3 channel unsigned half-float block-compressed (BC6H compression) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatRG + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedBlockCompressed6H - R/G color format - two channels in one surface with GR byte ordering + 3 channel signed half-float block-compressed (BC6H compression) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatAYUV + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed7 - Y, U, V, A four channels in one surface, interleaved as VUYA. + 4 channel unsigned normalized block-compressed (BC7 compression) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU444SemiPlanar + .. 
autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed7SRGB - Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. + 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU422SemiPlanar + .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized1010102 - Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. + 4 channel unsigned normalized (10-bit, 10-bit, 10-bit, 2-bit) format - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420SemiPlanar +.. autoclass:: cuda.bindings.runtime.cudaMemoryType + .. autoattribute:: cuda.bindings.runtime.cudaMemoryType.cudaMemoryTypeUnregistered - Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + Unregistered memory - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_444SemiPlanar + .. autoattribute:: cuda.bindings.runtime.cudaMemoryType.cudaMemoryTypeHost - Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. + Host memory - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_420SemiPlanar + .. autoattribute:: cuda.bindings.runtime.cudaMemoryType.cudaMemoryTypeDevice - Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + Device memory - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_444SemiPlanar + .. 
autoattribute:: cuda.bindings.runtime.cudaMemoryType.cudaMemoryTypeManaged - Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. + Managed memory - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_420SemiPlanar +.. autoclass:: cuda.bindings.runtime.cudaMemcpyKind + .. autoattribute:: cuda.bindings.runtime.cudaMemcpyKind.cudaMemcpyHostToHost - Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + Host -> Host - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatVYUY_ER + .. autoattribute:: cuda.bindings.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice - Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. + Host -> Device - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatUYVY_ER + .. autoattribute:: cuda.bindings.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost - Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. + Device -> Host - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUYV_ER + .. autoattribute:: cuda.bindings.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice - Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. + Device -> Device - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVYU_ER + .. autoattribute:: cuda.bindings.runtime.cudaMemcpyKind.cudaMemcpyDefault - Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. + Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUVA_ER +.. autoclass:: cuda.bindings.runtime.cudaAccessProperty + .. 
autoattribute:: cuda.bindings.runtime.cudaAccessProperty.cudaAccessPropertyNormal - Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. + Normal cache persistence. - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatAYUV_ER + .. autoattribute:: cuda.bindings.runtime.cudaAccessProperty.cudaAccessPropertyStreaming - Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. + Streaming access is less likely to persist from cache. - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV444Planar_ER + .. autoattribute:: cuda.bindings.runtime.cudaAccessProperty.cudaAccessPropertyPersisting - Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. + Persisting access is more likely to persist in cache. - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV422Planar_ER +.. autoclass:: cuda.bindings.runtime.cudaStreamCaptureStatus + .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureStatus.cudaStreamCaptureStatusNone - Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. + Stream is not capturing - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420Planar_ER + .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureStatus.cudaStreamCaptureStatusActive - Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + Stream is actively capturing - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV444SemiPlanar_ER + .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureStatus.cudaStreamCaptureStatusInvalidated - Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. + Stream is part of a capture sequence that has been invalidated, but not terminated - ..
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV422SemiPlanar_ER +.. autoclass:: cuda.bindings.runtime.cudaStreamCaptureMode + .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal - Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. + .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420SemiPlanar_ER + .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureMode.cudaStreamCaptureModeRelaxed - Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. +.. autoclass:: cuda.bindings.runtime.cudaSynchronizationPolicy + .. autoattribute:: cuda.bindings.runtime.cudaSynchronizationPolicy.cudaSyncPolicyAuto - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU444Planar_ER + .. autoattribute:: cuda.bindings.runtime.cudaSynchronizationPolicy.cudaSyncPolicySpin - Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. + .. autoattribute:: cuda.bindings.runtime.cudaSynchronizationPolicy.cudaSyncPolicyYield - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU422Planar_ER + .. autoattribute:: cuda.bindings.runtime.cudaSynchronizationPolicy.cudaSyncPolicyBlockingSync - Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. +.. autoclass:: cuda.bindings.runtime.cudaClusterSchedulingPolicy + .. autoattribute:: cuda.bindings.runtime.cudaClusterSchedulingPolicy.cudaClusterSchedulingPolicyDefault - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420Planar_ER + the default policy - Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. 
autoattribute:: cuda.bindings.runtime.cudaClusterSchedulingPolicy.cudaClusterSchedulingPolicySpread - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU444SemiPlanar_ER + spread the blocks within a cluster to the SMs - Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. + .. autoattribute:: cuda.bindings.runtime.cudaClusterSchedulingPolicy.cudaClusterSchedulingPolicyLoadBalancing - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU422SemiPlanar_ER + allow the hardware to load-balance the blocks in a cluster to the SMs - Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. +.. autoclass:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependenciesFlags + .. autoattribute:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependenciesFlags.cudaStreamAddCaptureDependencies - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420SemiPlanar_ER + Add new nodes to the dependency set - Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependenciesFlags.cudaStreamSetCaptureDependencies - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerRGGB + Replace the dependency set with the new nodes - Bayer format - one channel in one surface with interleaved RGGB ordering. +.. autoclass:: cuda.bindings.runtime.cudaUserObjectFlags + .. autoattribute:: cuda.bindings.runtime.cudaUserObjectFlags.cudaUserObjectNoDestructorSync - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerBGGR + Indicates the destructor execution is not synchronized by any CUDA handle. - Bayer format - one channel in one surface with interleaved BGGR ordering. +.. 
autoclass:: cuda.bindings.runtime.cudaUserObjectRetainFlags + .. autoattribute:: cuda.bindings.runtime.cudaUserObjectRetainFlags.cudaGraphUserObjectMove - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerGRBG + Transfer references from the caller rather than creating new references. - Bayer format - one channel in one surface with interleaved GRBG ordering. +.. autoclass:: cuda.bindings.runtime.cudaHostTaskSyncMode + .. autoattribute:: cuda.bindings.runtime.cudaHostTaskSyncMode.cudaHostTaskBlocking - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerGBRG + .. autoattribute:: cuda.bindings.runtime.cudaHostTaskSyncMode.cudaHostTaskSpinWait - Bayer format - one channel in one surface with interleaved GBRG ordering. +.. autoclass:: cuda.bindings.runtime.cudaGraphicsRegisterFlags + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsNone - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer10RGGB + Default - Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsReadOnly - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer10BGGR + CUDA will not write to this resource - Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsWriteDiscard - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer10GRBG + CUDA will only write to and will not read from this resource - Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. + .. 
autoattribute:: cuda.bindings.runtime.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsSurfaceLoadStore - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer10GBRG + CUDA will bind this resource to a surface reference - Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsTextureGather - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12RGGB + CUDA will perform texture gather operations on this resource - Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. +.. autoclass:: cuda.bindings.runtime.cudaGraphicsMapFlags + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsMapFlags.cudaGraphicsMapFlagsNone - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12BGGR + Default; Assume resource can be read/written - Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsMapFlags.cudaGraphicsMapFlagsReadOnly - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12GRBG + CUDA will not write to this resource - Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsMapFlags.cudaGraphicsMapFlagsWriteDiscard - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12GBRG + CUDA will only write to and will not read from this resource - Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. +.. autoclass:: cuda.bindings.runtime.cudaGraphicsCubeFace + .. 
autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFacePositiveX - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer14RGGB + Positive X face of cubemap - Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFaceNegativeX - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer14BGGR + Negative X face of cubemap - Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFacePositiveY - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer14GRBG + Positive Y face of cubemap - Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFaceNegativeY - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer14GBRG + Negative Y face of cubemap - Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFacePositiveZ - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer20RGGB + Positive Z face of cubemap - Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFaceNegativeZ - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer20BGGR + Negative Z face of cubemap - Bayer20 format - one channel in one surface with interleaved BGGR ordering. 
Out of 32 bits, 20 bits used 12 bits No-op. +.. autoclass:: cuda.bindings.runtime.cudaResourceType + .. autoattribute:: cuda.bindings.runtime.cudaResourceType.cudaResourceTypeArray - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer20GRBG + Array resource - Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaResourceType.cudaResourceTypeMipmappedArray - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer20GBRG + Mipmapped array resource - Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaResourceType.cudaResourceTypeLinear - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU444Planar + Linear resource - Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceType.cudaResourceTypePitch2D - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU422Planar + Pitch 2D resource - Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. +.. autoclass:: cuda.bindings.runtime.cudaResourceViewFormat + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatNone - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420Planar + No resource view format (use underlying resource format) - Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedChar1 - .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerIspRGGB + 1 channel unsigned 8-bit integers - Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedChar2 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerIspBGGR + 2 channel unsigned 8-bit integers - Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedChar4 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerIspGRBG + 4 channel unsigned 8-bit integers - Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedChar1 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerIspGBRG + 1 channel signed 8-bit integers - Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedChar2 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerBCCR + 2 channel signed 8-bit integers - Bayer format - one channel in one surface with interleaved BCCR ordering. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedChar4 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerRCCB + 4 channel signed 8-bit integers - Bayer format - one channel in one surface with interleaved RCCB ordering. + .. 
autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedShort1 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerCRBC + 1 channel unsigned 16-bit integers - Bayer format - one channel in one surface with interleaved CRBC ordering. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedShort2 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerCBRC + 2 channel unsigned 16-bit integers - Bayer format - one channel in one surface with interleaved CBRC ordering. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedShort4 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer10CCCC + 4 channel unsigned 16-bit integers - Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedShort1 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12BCCR + 1 channel signed 16-bit integers - Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedShort2 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12RCCB + 2 channel signed 16-bit integers - Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedShort4 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12CRBC + 4 channel signed 16-bit integers - Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. + .. 
autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedInt1 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12CBRC + 1 channel unsigned 32-bit integers - Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedInt2 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12CCCC + 2 channel unsigned 32-bit integers - Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedInt4 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY + 4 channel unsigned 32-bit integers - Color format for single Y plane. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedInt1 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420SemiPlanar_2020 + 1 channel signed 32-bit integers - Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedInt2 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420SemiPlanar_2020 + 2 channel signed 32-bit integers - Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedInt4 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420Planar_2020 + 4 channel signed 32-bit integers - Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. 
autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatHalf1 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420Planar_2020 + 1 channel 16-bit floating point - Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatHalf2 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420SemiPlanar_709 + 2 channel 16-bit floating point - Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatHalf4 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420SemiPlanar_709 + 4 channel 16-bit floating point - Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatFloat1 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420Planar_709 + 1 channel 32-bit floating point - Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatFloat2 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420Planar_709 + 2 channel 32-bit floating point - Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatFloat4 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_420SemiPlanar_709 + 4 channel 32-bit floating point - Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. 
autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed1 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_420SemiPlanar_2020 + Block compressed 1 - Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed2 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_422SemiPlanar_2020 + Block compressed 2 - Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed3 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_422SemiPlanar + Block compressed 3 - Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed4 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_422SemiPlanar_709 + Block compressed 4 unsigned - Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedBlockCompressed4 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY_ER + Block compressed 4 signed - Extended Range Color format for single Y plane. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed5 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY_709_ER + Block compressed 5 unsigned - Extended Range Color format for single Y plane. + .. 
autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedBlockCompressed5 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10_ER + Block compressed 5 signed - Extended Range Color format for single Y10 plane. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed6H - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10_709_ER + Block compressed 6 unsigned half-float - Extended Range Color format for single Y10 plane. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedBlockCompressed6H - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12_ER + Block compressed 6 signed half-float - Extended Range Color format for single Y12 plane. + .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed7 - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12_709_ER + Block compressed 7 - Extended Range Color format for single Y12 plane. +.. autoclass:: cuda.bindings.runtime.cudaFuncAttribute + .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeMaxDynamicSharedMemorySize - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUVA + Maximum dynamic shared memory size - Y, U, V, A four channels in one surface, interleaved as AVUY. + .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributePreferredSharedMemoryCarveout - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVYU + Preferred shared memory-L1 cache split - Y, U, V in one surface, interleaved as YVYU in one channel. + .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeClusterDimMustBeSet - .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatVYUY + Indicator to enforce valid cluster dimension specification on kernel launch - Y, U, V in one surface, interleaved as VYUY in one channel. + .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeRequiredClusterWidth - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_420SemiPlanar_ER + Required cluster width - Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeRequiredClusterHeight - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER + Required cluster height - Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeRequiredClusterDepth - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_444SemiPlanar_ER + Required cluster depth - Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. + .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeNonPortableClusterSizeAllowed - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER + Whether non-portable cluster scheduling policy is supported - Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. + .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeClusterSchedulingPolicyPreference - .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_420SemiPlanar_ER + Required cluster scheduling policy preference - Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. + .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeMax - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER +.. autoclass:: cuda.bindings.runtime.cudaFuncCache + .. autoattribute:: cuda.bindings.runtime.cudaFuncCache.cudaFuncCachePreferNone - Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. + Default function cache configuration, no preference - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_444SemiPlanar_ER + .. autoattribute:: cuda.bindings.runtime.cudaFuncCache.cudaFuncCachePreferShared - Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. + Prefer larger shared memory and smaller L1 cache - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER + .. autoattribute:: cuda.bindings.runtime.cudaFuncCache.cudaFuncCachePreferL1 - Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. + Prefer larger L1 cache and smaller shared memory - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatUYVY709 + .. autoattribute:: cuda.bindings.runtime.cudaFuncCache.cudaFuncCachePreferEqual - Y, U, V in one surface, interleaved as UYVY in one channel. + Prefer equal size L1 cache and shared memory - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatUYVY709_ER +.. autoclass:: cuda.bindings.runtime.cudaSharedMemConfig + .. 
autoattribute:: cuda.bindings.runtime.cudaSharedMemConfig.cudaSharedMemBankSizeDefault - Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. + .. autoattribute:: cuda.bindings.runtime.cudaSharedMemConfig.cudaSharedMemBankSizeFourByte - .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatUYVY2020 + .. autoattribute:: cuda.bindings.runtime.cudaSharedMemConfig.cudaSharedMemBankSizeEightByte - Y, U, V in one surface, interleaved as UYVY in one channel. +.. autoclass:: cuda.bindings.runtime.cudaSharedCarveout -.. autoclass:: cuda.bindings.runtime.cudaError_t + .. autoattribute:: cuda.bindings.runtime.cudaSharedCarveout.cudaSharedmemCarveoutDefault - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaSuccess + No preference for shared memory or L1 (default) - The API call returned with no errors. In the case of query calls, this also means that the operation being queried is complete (see :py:obj:`~.cudaEventQuery()` and :py:obj:`~.cudaStreamQuery()`). + .. autoattribute:: cuda.bindings.runtime.cudaSharedCarveout.cudaSharedmemCarveoutMaxShared - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidValue + Prefer maximum available shared memory, minimum L1 cache - This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values. + .. autoattribute:: cuda.bindings.runtime.cudaSharedCarveout.cudaSharedmemCarveoutMaxL1 - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMemoryAllocation + Prefer maximum available L1 cache, minimum shared memory - The API call failed because it was unable to allocate enough memory or other resources to perform the requested operation. +.. autoclass:: cuda.bindings.runtime.cudaComputeMode + .. autoattribute:: cuda.bindings.runtime.cudaComputeMode.cudaComputeModeDefault - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInitializationError + Default compute mode (Multiple threads can use :py:obj:`~.cudaSetDevice()` with this device) - The API call failed because the CUDA driver and runtime could not be initialized. + .. autoattribute:: cuda.bindings.runtime.cudaComputeMode.cudaComputeModeExclusive - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCudartUnloading + Compute-exclusive-thread mode (Only one thread in one process will be able to use :py:obj:`~.cudaSetDevice()` with this device) - This indicates that a CUDA Runtime API call cannot be executed because it is being called during process shut down, at a point in time after CUDA driver has been unloaded. + .. autoattribute:: cuda.bindings.runtime.cudaComputeMode.cudaComputeModeProhibited - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorProfilerDisabled + Compute-prohibited mode (No threads can use :py:obj:`~.cudaSetDevice()` with this device) - This indicates profiler is not initialized for this run. This can happen when the application is running with external profiling tools like visual profiler. + .. autoattribute:: cuda.bindings.runtime.cudaComputeMode.cudaComputeModeExclusiveProcess - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorProfilerNotInitialized + Compute-exclusive-process mode (Many threads in one process will be able to use :py:obj:`~.cudaSetDevice()` with this device) - [Deprecated] +.. autoclass:: cuda.bindings.runtime.cudaLimit + .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitStackSize - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorProfilerAlreadyStarted + GPU thread stack size - [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitPrintfFifoSize - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorProfilerAlreadyStopped + GPU printf FIFO size - [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitMallocHeapSize - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidConfiguration + GPU malloc heap size - This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. See :py:obj:`~.cudaDeviceProp` for more device limitations. + .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitDevRuntimeSyncDepth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorVersionTranslation + GPU device runtime synchronize depth - This indicates that the driver is newer than the runtime version and returned graph node parameter information that the runtime does not understand and is unable to translate. + .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitDevRuntimePendingLaunchCount - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidPitchValue + GPU device runtime pending launch count - This indicates that one or more of the pitch-related parameters passed to the API call is not within the acceptable range for pitch. + .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitMaxL2FetchGranularity - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidSymbol + A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint - This indicates that the symbol name/identifier passed to the API call is not a valid name or identifier. + .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitPersistingL2CacheSize - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidHostPointer + A size in bytes for L2 persisting lines cache size - This indicates that at least one host pointer passed to the API call is not a valid host pointer. [Deprecated] +.. autoclass:: cuda.bindings.runtime.cudaMemoryAdvise + .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseSetReadMostly - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidDevicePointer + Data will mostly be read and only occassionally be written to - This indicates that at least one device pointer passed to the API call is not a valid device pointer. [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseUnsetReadMostly - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidTexture + Undo the effect of :py:obj:`~.cudaMemAdviseSetReadMostly` - This indicates that the texture passed to the API call is not a valid texture. + .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseSetPreferredLocation - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidTextureBinding + Set the preferred location for the data as the specified device - This indicates that the texture binding is not valid. This occurs if you call :py:obj:`~.cudaGetTextureAlignmentOffset()` with an unbound texture. + .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseUnsetPreferredLocation - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidChannelDescriptor + Clear the preferred location for the data - This indicates that the channel descriptor passed to the API call is not valid. This occurs if the format is not one of the formats specified by :py:obj:`~.cudaChannelFormatKind`, or if one of the dimensions is invalid. + .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidMemcpyDirection + Data will be accessed by the specified device, so prevent page faults as much as possible - This indicates that the direction of the memcpy passed to the API call is not one of the types specified by :py:obj:`~.cudaMemcpyKind`. + .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseUnsetAccessedBy - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorAddressOfConstant + Let the Unified Memory subsystem decide on the page faulting policy for the specified device - This indicated that the user has taken the address of a constant variable, which was forbidden up until the CUDA 3.1 release. [Deprecated] +.. autoclass:: cuda.bindings.runtime.cudaMemRangeAttribute + .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributeReadMostly - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTextureFetchFailed + Whether the range will mostly be read and only occassionally be written to - This indicated that a texture fetch was not able to be performed. This was previously used for device emulation of texture operations. [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributePreferredLocation - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTextureNotBound + The preferred location of the range - This indicated that a texture was not bound for access. This was previously used for device emulation of texture operations. [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributeAccessedBy - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSynchronizationError + Memory range has :py:obj:`~.cudaMemAdviseSetAccessedBy` set for specified device - This indicated that a synchronization operation had failed. This was previously used for some device emulation functions. [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributeLastPrefetchLocation - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidFilterSetting + The last location to which the range was prefetched - This indicates that a non-float texture was being accessed with linear filtering. This is not supported by CUDA. + .. 
autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributePreferredLocationType - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidNormSetting + The preferred location type of the range - This indicates that an attempt was made to read an unsupported data type as a normalized float. This is not supported by CUDA. + .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributePreferredLocationId - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMixedDeviceExecution + The preferred location id of the range - Mixing of device and device emulation code was not allowed. [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributeLastPrefetchLocationType - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotYetImplemented + The last location type to which the range was prefetched - This indicates that the API call is not yet implemented. Production releases of CUDA will never return this error. [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributeLastPrefetchLocationId - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMemoryValueTooLarge + The last location id to which the range was prefetched - This indicated that an emulated device pointer exceeded the 32-bit address range. [Deprecated] +.. autoclass:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesOptions + .. autoattribute:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesOptions.cudaFlushGPUDirectRDMAWritesOptionHost - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStubLibrary + :py:obj:`~.cudaDeviceFlushGPUDirectRDMAWrites()` and its CUDA Driver API counterpart are supported on the device. - This indicates that the CUDA driver that the application has loaded is a stub library. Applications that run with the stub rather than a real driver loaded will result in CUDA API returning this error. + .. 
autoattribute:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesOptions.cudaFlushGPUDirectRDMAWritesOptionMemOps - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInsufficientDriver + The :py:obj:`~.CU_STREAM_WAIT_VALUE_FLUSH` flag and the :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES` MemOp are supported on the CUDA device. - This indicates that the installed NVIDIA CUDA driver is older than the CUDA runtime library. This is not a supported configuration. Users should install an updated NVIDIA display driver to allow the application to run. +.. autoclass:: cuda.bindings.runtime.cudaGPUDirectRDMAWritesOrdering + .. autoattribute:: cuda.bindings.runtime.cudaGPUDirectRDMAWritesOrdering.cudaGPUDirectRDMAWritesOrderingNone - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCallRequiresNewerDriver + The device does not natively support ordering of GPUDirect RDMA writes. :py:obj:`~.cudaFlushGPUDirectRDMAWrites()` can be leveraged if supported. - This indicates that the API call requires a newer CUDA driver than the one currently installed. Users should install an updated NVIDIA CUDA driver to allow the API call to succeed. + .. autoattribute:: cuda.bindings.runtime.cudaGPUDirectRDMAWritesOrdering.cudaGPUDirectRDMAWritesOrderingOwner - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidSurface + Natively, the device can consistently consume GPUDirect RDMA writes, although other CUDA devices may not. - This indicates that the surface passed to the API call is not a valid surface. + .. autoattribute:: cuda.bindings.runtime.cudaGPUDirectRDMAWritesOrdering.cudaGPUDirectRDMAWritesOrderingAllDevices - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDuplicateVariableName + Any CUDA device in the system can consistently consume GPUDirect RDMA writes to this device. - This indicates that multiple global or constant variables (across separate CUDA source files in the application) share the same string name. +.. 
autoclass:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesScope + .. autoattribute:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesScope.cudaFlushGPUDirectRDMAWritesToOwner - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDuplicateTextureName + Blocks until remote writes are visible to the CUDA device context owning the data. - This indicates that multiple textures (across separate CUDA source files in the application) share the same string name. + .. autoattribute:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesScope.cudaFlushGPUDirectRDMAWritesToAllDevices - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDuplicateSurfaceName + Blocks until remote writes are visible to all CUDA device contexts. - This indicates that multiple surfaces (across separate CUDA source files in the application) share the same string name. +.. autoclass:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesTarget + .. autoattribute:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesTarget.cudaFlushGPUDirectRDMAWritesTargetCurrentDevice - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDevicesUnavailable + Sets the target for :py:obj:`~.cudaDeviceFlushGPUDirectRDMAWrites()` to the currently active CUDA device context. - This indicates that all CUDA devices are busy or unavailable at the current time. Devices are often busy/unavailable due to use of :py:obj:`~.cudaComputeModeProhibited`, :py:obj:`~.cudaComputeModeExclusiveProcess`, or when long running CUDA kernels have filled up the GPU and are blocking new work from starting. They can also be unavailable due to memory constraints on a device that already has active CUDA work being performed. +.. autoclass:: cuda.bindings.runtime.cudaDeviceAttr + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIncompatibleDriverContext + Maximum number of threads per block - This indicates that the current context is not compatible with this the CUDA Runtime. This can only occur if you are using CUDA Runtime/Driver interoperability and have created an existing Driver context using the driver API. The Driver context may be incompatible either because the Driver context was created using an older version of the API, because the Runtime API call expects a primary driver context and the Driver context is not primary, or because the Driver context has been destroyed. Please see :py:obj:`~.Interactions`with the CUDA Driver API" for more information. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxBlockDimX - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMissingConfiguration + Maximum block dimension X - The device function being invoked (usually via :py:obj:`~.cudaLaunchKernel()`) was not previously configured via the :py:obj:`~.cudaConfigureCall()` function. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxBlockDimY - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorPriorLaunchFailure + Maximum block dimension Y - This indicated that a previous kernel launch failed. This was previously used for device emulation of kernel launches. [Deprecated] + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxBlockDimZ - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchMaxDepthExceeded + Maximum block dimension Z - This error indicates that a device runtime grid launch did not occur because the depth of the child grid would exceed the maximum supported number of nested grid launches. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxGridDimX - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchFileScopedTex + Maximum grid dimension X - This error indicates that a grid launch did not occur because the kernel uses file-scoped textures which are unsupported by the device runtime. Kernels launched via the device runtime only support textures created with the Texture Object API's. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxGridDimY - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchFileScopedSurf + Maximum grid dimension Y - This error indicates that a grid launch did not occur because the kernel uses file-scoped surfaces which are unsupported by the device runtime. Kernels launched via the device runtime only support surfaces created with the Surface Object API's. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxGridDimZ - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSyncDepthExceeded + Maximum grid dimension Z - This error indicates that a call to :py:obj:`~.cudaDeviceSynchronize` made from the device runtime failed because the call was made at grid depth greater than than either the default (2 levels of grids) or user specified device limit :py:obj:`~.cudaLimitDevRuntimeSyncDepth`. To be able to synchronize on launched grids at a greater depth successfully, the maximum nested depth at which :py:obj:`~.cudaDeviceSynchronize` will be called must be specified with the :py:obj:`~.cudaLimitDevRuntimeSyncDepth` limit to the :py:obj:`~.cudaDeviceSetLimit` api before the host-side launch of a kernel using the device runtime. Keep in mind that additional levels of sync depth require the runtime to reserve large amounts of device memory that cannot be used for user allocations. Note that :py:obj:`~.cudaDeviceSynchronize` made from device runtime is only supported on devices of compute capability < 9.0. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSharedMemoryPerBlock - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchPendingCountExceeded + Maximum shared memory available per block in bytes - This error indicates that a device runtime grid launch failed because the launch would exceed the limit :py:obj:`~.cudaLimitDevRuntimePendingLaunchCount`. For this launch to proceed successfully, :py:obj:`~.cudaDeviceSetLimit` must be called to set the :py:obj:`~.cudaLimitDevRuntimePendingLaunchCount` to be higher than the upper bound of outstanding launches that can be issued to the device runtime. Keep in mind that raising the limit of pending device runtime launches will require the runtime to reserve device memory that cannot be used for user allocations. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTotalConstantMemory - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidDeviceFunction + Memory available on device for constant variables in a CUDA C kernel in bytes - The requested device function does not exist or is not compiled for the proper device architecture. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrWarpSize - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNoDevice + Warp size in threads - This indicates that no CUDA-capable devices were detected by the installed CUDA driver. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxPitch - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidDevice + Maximum pitch in bytes allowed by memory copies - This indicates that the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxRegistersPerBlock - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDeviceNotLicensed + Maximum number of 32-bit registers available per block - This indicates that the device doesn't have a valid Grid License. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrClockRate - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSoftwareValidityNotEstablished + Peak clock frequency in kilohertz - By default, the CUDA runtime may perform a minimal set of self-tests, as well as CUDA driver tests, to establish the validity of both. Introduced in CUDA 11.2, this error return indicates that at least one of these tests has failed and the validity of either the runtime or the driver could not be established. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTextureAlignment - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStartupFailure + Alignment requirement for textures - This indicates an internal startup failure in the CUDA runtime. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGpuOverlap - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidKernelImage + Device can possibly copy memory and execute a kernel concurrently - This indicates that the device kernel image is invalid. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMultiProcessorCount - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDeviceUninitialized + Number of multiprocessors on device - This most frequently indicates that there is no context bound to the current thread. This can also be returned if the context passed to an API call is not a valid handle (such as a context that has had :py:obj:`~.cuCtxDestroy()` invoked on it). This can also be returned if a user mixes different API versions (i.e. 3010 context with 3020 API calls). See :py:obj:`~.cuCtxGetApiVersion()` for more details. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrKernelExecTimeout - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMapBufferObjectFailed + Specifies whether there is a run time limit on kernels - This indicates that the buffer object could not be mapped. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrIntegrated - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnmapBufferObjectFailed + Device is integrated with host memory - This indicates that the buffer object could not be unmapped. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrCanMapHostMemory - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorArrayIsMapped + Device can map host memory into CUDA address space - This indicates that the specified array is currently mapped and thus cannot be destroyed. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrComputeMode - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorAlreadyMapped + Compute mode (See :py:obj:`~.cudaComputeMode` for details) - This indicates that the resource is already mapped. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture1DWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNoKernelImageForDevice + Maximum 1D texture width - This indicates that there is no kernel image available that is suitable for the device. This can occur when a user specifies code generation options for a particular CUDA source file that do not include the corresponding device configuration. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorAlreadyAcquired + Maximum 2D texture width - This indicates that a resource has already been acquired. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DHeight - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotMapped + Maximum 2D texture height - This indicates that a resource is not mapped. + .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotMappedAsArray + Maximum 3D texture width - This indicates that a mapped resource is not available for access as an array. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DHeight - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotMappedAsPointer + Maximum 3D texture height - This indicates that a mapped resource is not available for access as a pointer. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DDepth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorECCUncorrectable + Maximum 3D texture depth - This indicates that an uncorrectable ECC error was detected during execution. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLayeredWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnsupportedLimit + Maximum 2D layered texture width - This indicates that the :py:obj:`~.cudaLimit` passed to the API call is not supported by the active device. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLayeredHeight - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDeviceAlreadyInUse + Maximum 2D layered texture height - This indicates that a call tried to access an exclusive-thread device that is already in use by a different thread. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLayeredLayers - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorPeerAccessUnsupported + Maximum layers in a 2D layered texture - This error indicates that P2P access is not supported across the given devices. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrSurfaceAlignment - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidPtx + Alignment requirement for surfaces - A PTX compilation failed. The runtime may fall back to compiling PTX if an application does not contain a suitable binary for the current device. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrConcurrentKernels - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidGraphicsContext + Device can possibly execute multiple kernels concurrently - This indicates an error with the OpenGL or DirectX context. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrEccEnabled - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNvlinkUncorrectable + Device has ECC support enabled - This indicates that an uncorrectable NVLink error was detected during the execution. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPciBusId - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorJitCompilerNotFound + PCI bus ID of the device - This indicates that the PTX JIT compiler library was not found. The JIT Compiler library is used for PTX compilation. The runtime may fall back to compiling PTX if an application does not contain a suitable binary for the current device. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPciDeviceId - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnsupportedPtxVersion + PCI device ID of the device - This indicates that the provided PTX was compiled with an unsupported toolchain. The most common reason for this, is the PTX was generated by a compiler newer than what is supported by the CUDA driver and PTX JIT compiler. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTccDriver - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorJitCompilationDisabled + Device is using TCC driver model - This indicates that the JIT compilation was disabled. The JIT compilation compiles PTX. 
The runtime may fall back to compiling PTX if an application does not contain a suitable binary for the current device. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMemoryClockRate - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnsupportedExecAffinity + Peak memory clock frequency in kilohertz - This indicates that the provided execution affinity is not supported by the device. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGlobalMemoryBusWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnsupportedDevSideSync + Global memory bus width in bits - This indicates that the code to be compiled by the PTX JIT contains unsupported call to cudaDeviceSynchronize. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrL2CacheSize - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorContained + Size of L2 cache in bytes - This indicates that an exception occurred on the device that is now contained by the GPU's error containment capability. Common causes are - a. Certain types of invalid accesses of peer GPU memory over nvlink b. Certain classes of hardware errors This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxThreadsPerMultiProcessor - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidSource + Maximum resident threads per multiprocessor - This indicates that the device kernel source is invalid. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrAsyncEngineCount - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorFileNotFound + Number of asynchronous engines - This indicates that the file specified was not found. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrUnifiedAddressing - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSharedObjectSymbolNotFound + Device shares a unified address space with the host - This indicates that a link to a shared object failed to resolve. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture1DLayeredWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSharedObjectInitFailed + Maximum 1D layered texture width - This indicates that initialization of a shared object failed. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture1DLayeredLayers - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorOperatingSystem + Maximum layers in a 1D layered texture - This error indicates that an OS call failed. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DGatherWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidResourceHandle + Maximum 2D texture width if cudaArrayTextureGather is set - This indicates that a resource handle passed to the API call was not valid. Resource handles are opaque types like :py:obj:`~.cudaStream_t` and :py:obj:`~.cudaEvent_t`. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DGatherHeight - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIllegalState + Maximum 2D texture height if cudaArrayTextureGather is set - This indicates that a resource required by the API call is not in a valid state to perform the requested operation. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DWidthAlt - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLossyQuery + Alternate maximum 3D texture width - This indicates an attempt was made to introspect an object in a way that would discard semantically important information. This is either due to the object using funtionality newer than the API version used to introspect it or omission of optional return arguments. + .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DHeightAlt - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSymbolNotFound + Alternate maximum 3D texture height - This indicates that a named symbol was not found. Examples of symbols are global/constant variable names, driver function names, texture names, and surface names. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DDepthAlt - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotReady + Alternate maximum 3D texture depth - This indicates that asynchronous operations issued previously have not completed yet. This result is not actually an error, but must be indicated differently than :py:obj:`~.cudaSuccess` (which indicates completion). Calls that may return this value include :py:obj:`~.cudaEventQuery()` and :py:obj:`~.cudaStreamQuery()`. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPciDomainId - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIllegalAddress + PCI domain ID of the device - The device encountered a load or store instruction on an invalid memory address. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTexturePitchAlignment - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchOutOfResources + Pitch alignment requirement for textures - This indicates that a launch did not occur because it did not have appropriate resources. Although this error is similar to :py:obj:`~.cudaErrorInvalidConfiguration`, this error usually indicates that the user has attempted to pass too many arguments to the device kernel, or the kernel launch specifies too many threads for the kernel's register count. + .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTextureCubemapWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchTimeout + Maximum cubemap texture width/height - This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device attribute :py:obj:`~.cudaDevAttrKernelExecTimeout` for more information. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTextureCubemapLayeredWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchIncompatibleTexturing + Maximum cubemap layered texture width/height - This error indicates a kernel launch that uses an incompatible texturing mode. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTextureCubemapLayeredLayers - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorPeerAccessAlreadyEnabled + Maximum layers in a cubemap layered texture - This error indicates that a call to :py:obj:`~.cudaDeviceEnablePeerAccess()` is trying to re-enable peer addressing on from a context which has already had peer addressing enabled. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface1DWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorPeerAccessNotEnabled + Maximum 1D surface width - This error indicates that :py:obj:`~.cudaDeviceDisablePeerAccess()` is trying to disable peer addressing which has not been enabled yet via :py:obj:`~.cudaDeviceEnablePeerAccess()`. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface2DWidth - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSetOnActiveProcess + Maximum 2D surface width - This indicates that the user has called :py:obj:`~.cudaSetValidDevices()`, :py:obj:`~.cudaSetDeviceFlags()`, :py:obj:`~.cudaD3D9SetDirect3DDevice()`, :py:obj:`~.cudaD3D10SetDirect3DDevice`, :py:obj:`~.cudaD3D11SetDirect3DDevice()`, or :py:obj:`~.cudaVDPAUSetVDPAUDevice()` after initializing the CUDA runtime by calling non-device management operations (allocating memory and launching kernels are examples of non-device management operations). This error can also be returned if using runtime/driver interoperability and there is an existing :py:obj:`~.CUcontext` active on the host thread. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface2DHeight - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorContextIsDestroyed + Maximum 2D surface height - This error indicates that the context current to the calling thread has been destroyed using :py:obj:`~.cuCtxDestroy`, or is a primary context which has not yet been initialized. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface3DWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorAssert + Maximum 3D surface width - An assert triggered in device code during kernel execution. The device cannot be used again. All existing allocations are invalid. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface3DHeight - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTooManyPeers + Maximum 3D surface height - This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to :py:obj:`~.cudaEnablePeerAccess()`. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface3DDepth - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorHostMemoryAlreadyRegistered + Maximum 3D surface depth - This error indicates that the memory range passed to :py:obj:`~.cudaHostRegister()` has already been registered. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface1DLayeredWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorHostMemoryNotRegistered + Maximum 1D layered surface width - This error indicates that the pointer passed to :py:obj:`~.cudaHostUnregister()` does not correspond to any currently registered memory region. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface1DLayeredLayers - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorHardwareStackError + Maximum layers in a 1D layered surface - Device encountered an error in the call stack during kernel execution, possibly due to stack corruption or exceeding the stack size limit. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface2DLayeredWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIllegalInstruction + Maximum 2D layered surface width - The device encountered an illegal instruction during kernel execution This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface2DLayeredHeight - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMisalignedAddress + Maximum 2D layered surface height - The device encountered a load or store instruction on a memory address which is not aligned. This leaves the process in an inconsistent state and any further CUDA work will return the same error. 
To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface2DLayeredLayers - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidAddressSpace + Maximum layers in a 2D layered surface - While executing a kernel, the device encountered an instruction which can only operate on memory locations in certain address spaces (global, shared, or local), but was supplied a memory address not belonging to an allowed address space. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurfaceCubemapWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidPc + Maximum cubemap surface width - The device encountered an invalid program counter. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurfaceCubemapLayeredWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchFailure + Maximum cubemap layered surface width - An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. Less common cases can be system specific - more information about these cases can be found in the system specific user guide. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurfaceCubemapLayeredLayers - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCooperativeLaunchTooLarge + Maximum layers in a cubemap layered surface - This error indicates that the number of blocks launched per grid for a kernel that was launched via either :py:obj:`~.cudaLaunchCooperativeKernel` exceeds the maximum number of blocks as allowed by :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor` or :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` times the number of multiprocessors as specified by the device attribute :py:obj:`~.cudaDevAttrMultiProcessorCount`. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture1DLinearWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTensorMemoryLeak + Maximum 1D linear texture width - An exception occurred on the device while exiting a kernel using tensor memory: the tensor memory was not completely deallocated. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLinearWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotPermitted + Maximum 2D linear texture width - This error indicates the attempted operation is not permitted. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLinearHeight - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotSupported + Maximum 2D linear texture height - This error indicates the attempted operation is not supported on the current system or device. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLinearPitch - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSystemNotReady + Maximum 2D linear texture pitch in bytes - This error indicates that the system is not yet ready to start any CUDA work. 
To continue using CUDA, verify the system configuration is in a valid state and all required driver daemons are actively running. More information about this error can be found in the system specific user guide. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DMipmappedWidth - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSystemDriverMismatch + Maximum mipmapped 2D texture width - This error indicates that there is a mismatch between the versions of the display driver and the CUDA driver. Refer to the compatibility documentation for supported versions. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DMipmappedHeight - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCompatNotSupportedOnDevice + Maximum mipmapped 2D texture height - This error indicates that the system was upgraded to run with forward compatibility but the visible hardware detected by CUDA does not support this configuration. Refer to the compatibility documentation for the supported hardware matrix or ensure that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES environment variable. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsConnectionFailed + Major compute capability version number - This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsRpcFailure + Minor compute capability version number - This error indicates that the remote procedural call between the MPS server and the MPS client failed. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture1DMipmappedWidth - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsServerNotReady + Maximum mipmapped 1D texture width - This error indicates that the MPS server is not ready to accept new MPS client requests. This error can be returned when the MPS server is in the process of recovering from a fatal failure. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrStreamPrioritiesSupported - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsMaxClientsReached + Device supports stream priorities - This error indicates that the hardware resources required to create MPS client have been exhausted. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGlobalL1CacheSupported - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsMaxConnectionsReached + Device supports caching globals in L1 - This error indicates the the hardware resources required to device connections have been exhausted. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrLocalL1CacheSupported - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMpsClientTerminated + Device supports caching locals in L1 - This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSharedMemoryPerMultiprocessor - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCdpNotSupported + Maximum shared memory available per multiprocessor in bytes - This error indicates, that the program is using CUDA Dynamic Parallelism, but the current configuration, like MPS, does not support it. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxRegistersPerMultiprocessor - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCdpVersionMismatch + Maximum number of 32-bit registers available per multiprocessor - This error indicates, that the program contains an unsupported interaction between different versions of CUDA Dynamic Parallelism. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrManagedMemory - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureUnsupported + Device can allocate managed memory on this system - The operation is not permitted when the stream is capturing. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrIsMultiGpuBoard - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureInvalidated + Device is on a multi-GPU board - The current capture sequence on the stream has been invalidated due to a previous error. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMultiGpuBoardGroupID - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureMerge + Unique identifier for a group of devices on the same multi-GPU board - The operation would have resulted in a merge of two independent capture sequences. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostNativeAtomicSupported - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureUnmatched + Link between the device and the host supports native atomic operations - The capture was not initiated in this stream. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrSingleToDoublePrecisionPerfRatio - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureUnjoined + Ratio of single precision performance (in floating-point operations per second) to double precision performance - The capture sequence contains a fork that was not joined to the primary stream. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureIsolation + Device supports coherently accessing pageable memory without calling cudaHostRegister on it - A dependency would have been created which crosses the capture sequence boundary. Only implicit in-stream ordering dependencies are allowed to cross the boundary. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrConcurrentManagedAccess - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureImplicit + Device can coherently access managed memory concurrently with the CPU - The operation would have resulted in a disallowed implicit dependency on a current capture sequence from cudaStreamLegacy. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrComputePreemptionSupported - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCapturedEvent + Device supports Compute Preemption - The operation is not permitted on an event which was last recorded in a capturing stream. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrCanUseHostPointerForRegisteredMem - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureWrongThread + Device can access host registered memory at the same virtual address as the CPU - A stream capture sequence not initiated with the :py:obj:`~.cudaStreamCaptureModeRelaxed` argument to :py:obj:`~.cudaStreamBeginCapture` was passed to :py:obj:`~.cudaStreamEndCapture` in a different thread. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved92 - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTimeout + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved93 - This indicates that the wait operation has timed out. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved94 - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorGraphExecUpdateFailure + .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrCooperativeLaunch - This error indicates that the graph update was not performed because it included changes which violated constraints specific to instantiated graph update. + Device supports launching cooperative kernels via :py:obj:`~.cudaLaunchCooperativeKernel` - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorExternalDevice + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved96 - This indicates that an error has occurred in a device outside of GPU. It can be a synchronous error w.r.t. CUDA API or an asynchronous error from the external device. In case of asynchronous error, it means that if cuda was waiting for an external device's signal before consuming shared data, the external device signaled an error indicating that the data is not valid for consumption. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. In case of synchronous error, it means that one or more external devices have encountered an error and cannot complete the operation. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSharedMemoryPerBlockOptin - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidClusterSize + The maximum optin shared memory per block. This value may vary by chip. See :py:obj:`~.cudaFuncSetAttribute` - This indicates that a kernel launch error has occurred due to cluster misconfiguration. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrCanFlushRemoteWrites - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorFunctionNotLoaded + Device supports flushing of outstanding remote writes. - Indiciates a function handle is not loaded when calling an API that requires a loaded function. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostRegisterSupported - .. 
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidResourceType + Device supports host memory registration via :py:obj:`~.cudaHostRegister`. - This error indicates one or more resources passed in are not valid resource types for the operation. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccessUsesHostPageTables - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidResourceConfiguration + Device accesses pageable memory via the host's page tables. - This error indicates one or more resources are insufficient or non-applicable for the operation. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrDirectManagedMemAccessFromHost - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamDetached + Host can directly access managed memory on the device without migration. - This error indicates that the requested operation is not permitted because the stream is in a detached state. This can occur if the green context associated with the stream has been destroyed, limiting the stream's operational capabilities. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxBlocksPerMultiprocessor - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnknown + Maximum number of blocks per multiprocessor - This indicates that an unknown internal error has occurred. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxPersistingL2CacheSize - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorApiFailureBase -.. autoclass:: cuda.bindings.runtime.cudaChannelFormatKind + Maximum L2 persisting lines capacity setting in bytes. - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSigned + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxAccessPolicyWindowSize - Signed channel format + Maximum value of :py:obj:`~.cudaAccessPolicyWindow.num_bytes`. - .. 
autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReservedSharedMemoryPerBlock - Unsigned channel format + Shared memory reserved by CUDA driver per block in bytes - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindFloat + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrSparseCudaArraySupported - Float channel format + Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindNone + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostRegisterReadOnlySupported - No channel format + Device supports using the :py:obj:`~.cudaHostRegister` flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindNV12 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTimelineSemaphoreInteropSupported - Unsigned 8-bit integers, planar 4:2:0 YUV format + External timeline semaphore interop is supported on the device - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized8X1 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported - 1 channel unsigned 8-bit normalized integer + Device supports using the :py:obj:`~.cudaMallocAsync` and :py:obj:`~.cudaMemPool` family of APIs - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized8X2 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGPUDirectRDMASupported - 2 channel unsigned 8-bit normalized integer + Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) - .. 
autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized8X4 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGPUDirectRDMAFlushWritesOptions - 4 channel unsigned 8-bit normalized integer + The returned attribute shall be interpreted as a bitmask, where the individual bits are listed in the :py:obj:`~.cudaFlushGPUDirectRDMAWritesOptions` enum - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized16X1 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGPUDirectRDMAWritesOrdering - 1 channel unsigned 16-bit normalized integer + GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See :py:obj:`~.cudaGPUDirectRDMAWritesOrdering` for the numerical values returned here. - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized16X2 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolSupportedHandleTypes - 2 channel unsigned 16-bit normalized integer + Handle types supported with mempool based IPC - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized16X4 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrClusterLaunch - 4 channel unsigned 16-bit normalized integer + Indicates device supports cluster launch - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized8X1 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrDeferredMappingCudaArraySupported - 1 channel signed 8-bit normalized integer + Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized8X2 + .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved122 - 2 channel signed 8-bit normalized integer + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved123 - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized8X4 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved124 - 4 channel signed 8-bit normalized integer + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrIpcEventSupport - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized16X1 + Device supports IPC Events. - 1 channel signed 16-bit normalized integer + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMemSyncDomainCount - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized16X2 + Number of memory synchronization domains the device supports. - 2 channel signed 16-bit normalized integer + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved127 - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedNormalized16X4 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved128 - 4 channel signed 16-bit normalized integer + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved129 - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed1 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrNumaConfig - 4 channel unsigned normalized block-compressed (BC1 compression) format + NUMA configuration of a device: value is of type :py:obj:`~.cudaDeviceNumaConfig` enum - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed1SRGB + .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrNumaId - 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding + NUMA node ID of the GPU memory - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed2 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved132 - 4 channel unsigned normalized block-compressed (BC2 compression) format + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMpsEnabled - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed2SRGB + Contexts created on this device will be shared via MPS - 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostNumaId - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed3 + NUMA ID of the host node closest to the device or -1 when system does not support NUMA - 4 channel unsigned normalized block-compressed (BC3 compression) format + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrD3D12CigSupported - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed3SRGB + Device supports CIG with D3D12. - 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrVulkanCigSupported - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed4 + Device supports CIG with Vulkan. - 1 channel unsigned normalized block-compressed (BC4 compression) format + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGpuPciDeviceId - .. 
autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedBlockCompressed4 + The combined 16-bit PCI device ID and 16-bit PCI vendor ID. - 1 channel signed normalized block-compressed (BC4 compression) format + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGpuPciSubsystemId - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed5 + The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. - 2 channel unsigned normalized block-compressed (BC5 compression) format + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved141 - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedBlockCompressed5 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostNumaMemoryPoolsSupported - 2 channel signed normalized block-compressed (BC5 compression) format + Device supports HOST_NUMA location with the :py:obj:`~.cudaMallocAsync` and :py:obj:`~.cudaMemPool` family of APIs - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed6H + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostNumaMultinodeIpcSupported - 3 channel unsigned half-float block-compressed (BC6H compression) format + Device supports HostNuma location IPC between nodes in a multi-node system. - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindSignedBlockCompressed6H + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostMemoryPoolsSupported - 3 channel signed half-float block-compressed (BC6H compression) format + Device supports HOST location with the :py:obj:`~.cuMemAllocAsync` and :py:obj:`~.cuMemPool` family of APIs - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed7 + ..
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved145 - 4 channel unsigned normalized block-compressed (BC7 compression) format + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrOnlyPartialHostNativeAtomicSupported - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedBlockCompressed7SRGB + Link between the device and the host supports only some native atomic operations - 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMax - .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized1010102 +.. autoclass:: cuda.bindings.runtime.cudaMemPoolAttr + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies - 4 channel unsigned normalized (10-bit, 10-bit, 10-bit, 2-bit) format -.. autoclass:: cuda.bindings.runtime.cudaMemoryType + (value type = int) Allow cuMemAllocAsync to use memory asynchronously freed in another streams as long as a stream ordering dependency of the allocating stream on the free action exists. Cuda events and null stream interactions can create the required stream ordered dependencies. (default enabled) - .. autoattribute:: cuda.bindings.runtime.cudaMemoryType.cudaMemoryTypeUnregistered + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic - Unregistered memory + (value type = int) Allow reuse of already completed frees when there is no dependency between the free and allocation. (default enabled) - .. autoattribute:: cuda.bindings.runtime.cudaMemoryType.cudaMemoryTypeHost + .. 
autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies - Host memory + (value type = int) Allow cuMemAllocAsync to insert new stream dependencies in order to establish the stream ordering required to reuse a piece of memory released by cuFreeAsync (default enabled). - .. autoattribute:: cuda.bindings.runtime.cudaMemoryType.cudaMemoryTypeDevice + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold - Device memory + (value type = cuuint64_t) Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS. When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next call to stream, event or context synchronize. (default 0) - .. autoattribute:: cuda.bindings.runtime.cudaMemoryType.cudaMemoryTypeManaged + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrReservedMemCurrent - Managed memory -.. autoclass:: cuda.bindings.runtime.cudaMemcpyKind + (value type = cuuint64_t) Amount of backing memory currently allocated for the mempool. - .. autoattribute:: cuda.bindings.runtime.cudaMemcpyKind.cudaMemcpyHostToHost + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrReservedMemHigh - Host -> Host + (value type = cuuint64_t) High watermark of backing memory allocated for the mempool since the last time it was reset. High watermark can only be reset to zero. - .. autoattribute:: cuda.bindings.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrUsedMemCurrent - Host -> Device + (value type = cuuint64_t) Amount of memory from the pool that is currently in use by the application. - .. autoattribute:: cuda.bindings.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost + .. 
autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrUsedMemHigh - Device -> Host + (value type = cuuint64_t) High watermark of the amount of memory from the pool that was in use by the application since the last time it was reset. High watermark can only be reset to zero. - .. autoattribute:: cuda.bindings.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrAllocationType - Device -> Device + (value type = cudaMemAllocationType) The allocation type of the mempool - .. autoattribute:: cuda.bindings.runtime.cudaMemcpyKind.cudaMemcpyDefault + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrExportHandleTypes - Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing -.. autoclass:: cuda.bindings.runtime.cudaAccessProperty + (value type = cudaMemAllocationHandleType) Available export handle types for the mempool. For imported pools this value is always cudaMemHandleTypeNone as an imported pool cannot be re-exported - .. autoattribute:: cuda.bindings.runtime.cudaAccessProperty.cudaAccessPropertyNormal + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrLocationId - Normal cache persistence. + (value type = int) The location id for the mempool. If the location type for this pool is cudaMemLocationTypeInvisible then ID will be cudaInvalidDeviceId - .. autoattribute:: cuda.bindings.runtime.cudaAccessProperty.cudaAccessPropertyStreaming + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrLocationType - Streaming access is less likely to persit from cache. + (value type = cudaMemLocationType) The location type for the mempool. For imported memory pools where the device is not directly visible to the importing process or pools imported via fabric handles across nodes this will be cudaMemLocationTypeInvisible - .. 
autoattribute:: cuda.bindings.runtime.cudaAccessProperty.cudaAccessPropertyPersisting + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrMaxPoolSize - Persisting access is more likely to persist in cache. -.. autoclass:: cuda.bindings.runtime.cudaStreamCaptureStatus + (value type = cuuint64_t) Maximum size of the pool in bytes, this value may be higher than what was initially passed to cudaMemPoolCreate due to alignment requirements. A value of 0 indicates no maximum size. For cudaMemAllocationTypeManaged and IPC imported pools this value will be system dependent. - .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureStatus.cudaStreamCaptureStatusNone + .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrHwDecompressEnabled - Stream is not capturing + (value type = int) Indicates whether the pool has hardware compression enabled - .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureStatus.cudaStreamCaptureStatusActive +.. autoclass:: cuda.bindings.runtime.cudaMemLocationType + .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeInvalid - Stream is actively capturing + .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeNone - .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureStatus.cudaStreamCaptureStatusInvalidated + Location is unspecified. This is used when creating a managed memory pool to indicate no preferred location for the pool - Stream is part of a capture sequence that has been invalidated, but not terminated -.. autoclass:: cuda.bindings.runtime.cudaStreamCaptureMode + .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeDevice - .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal + Location is a device location, thus id is a device ordinal - .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal + ..
autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeHost - .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureMode.cudaStreamCaptureModeRelaxed -.. autoclass:: cuda.bindings.runtime.cudaSynchronizationPolicy + Location is host, id is ignored - .. autoattribute:: cuda.bindings.runtime.cudaSynchronizationPolicy.cudaSyncPolicyAuto + .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeHostNuma - .. autoattribute:: cuda.bindings.runtime.cudaSynchronizationPolicy.cudaSyncPolicySpin + Location is a host NUMA node, thus id is a host NUMA node id - .. autoattribute:: cuda.bindings.runtime.cudaSynchronizationPolicy.cudaSyncPolicyYield + .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeHostNumaCurrent - .. autoattribute:: cuda.bindings.runtime.cudaSynchronizationPolicy.cudaSyncPolicyBlockingSync -.. autoclass:: cuda.bindings.runtime.cudaClusterSchedulingPolicy + Location is the host NUMA node closest to the current thread's CPU, id is ignored - .. autoattribute:: cuda.bindings.runtime.cudaClusterSchedulingPolicy.cudaClusterSchedulingPolicyDefault + .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeInvisible - the default policy + Location is not visible but device is accessible, id is always cudaInvalidDeviceId - .. autoattribute:: cuda.bindings.runtime.cudaClusterSchedulingPolicy.cudaClusterSchedulingPolicySpread +.. autoclass:: cuda.bindings.runtime.cudaMemAccessFlags + .. autoattribute:: cuda.bindings.runtime.cudaMemAccessFlags.cudaMemAccessFlagsProtNone - spread the blocks within a cluster to the SMs + Default, make the address range not accessible - .. autoattribute:: cuda.bindings.runtime.cudaClusterSchedulingPolicy.cudaClusterSchedulingPolicyLoadBalancing + .. autoattribute:: cuda.bindings.runtime.cudaMemAccessFlags.cudaMemAccessFlagsProtRead - allow the hardware to load-balance the blocks in a cluster to the SMs -.. 
autoclass:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependenciesFlags + Make the address range read accessible - .. autoattribute:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependenciesFlags.cudaStreamAddCaptureDependencies + .. autoattribute:: cuda.bindings.runtime.cudaMemAccessFlags.cudaMemAccessFlagsProtReadWrite - Add new nodes to the dependency set + Make the address range read-write accessible - .. autoattribute:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependenciesFlags.cudaStreamSetCaptureDependencies +.. autoclass:: cuda.bindings.runtime.cudaMemAllocationType + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationType.cudaMemAllocationTypeInvalid - Replace the dependency set with the new nodes -.. autoclass:: cuda.bindings.runtime.cudaUserObjectFlags + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationType.cudaMemAllocationTypePinned - .. autoattribute:: cuda.bindings.runtime.cudaUserObjectFlags.cudaUserObjectNoDestructorSync + This allocation type is 'pinned', i.e. cannot migrate from its current location while the application is actively using it - Indicates the destructor execution is not synchronized by any CUDA handle. -.. autoclass:: cuda.bindings.runtime.cudaUserObjectRetainFlags + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationType.cudaMemAllocationTypeManaged - .. autoattribute:: cuda.bindings.runtime.cudaUserObjectRetainFlags.cudaGraphUserObjectMove + This allocation type is managed memory - Transfer references from the caller rather than creating new references. -.. autoclass:: cuda.bindings.runtime.cudaHostTaskSyncMode + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationType.cudaMemAllocationTypeMax - .. autoattribute:: cuda.bindings.runtime.cudaHostTaskSyncMode.cudaHostTaskBlocking +.. autoclass:: cuda.bindings.runtime.cudaMemAllocationHandleType + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypeNone - .. 
autoattribute:: cuda.bindings.runtime.cudaHostTaskSyncMode.cudaHostTaskSpinWait -.. autoclass:: cuda.bindings.runtime.cudaGraphicsRegisterFlags + Does not allow any export mechanism. > - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsNone + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypePosixFileDescriptor - Default + Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsReadOnly + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypeWin32 - CUDA will not write to this resource + Allows a Win32 NT handle to be used for exporting. (HANDLE) - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsWriteDiscard + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypeWin32Kmt - CUDA will only write to and will not read from this resource + Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsSurfaceLoadStore + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypeFabric - CUDA will bind this resource to a surface reference + Allows a fabric handle to be used for exporting. (cudaMemFabricHandle_t) - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsTextureGather +.. autoclass:: cuda.bindings.runtime.cudaGraphMemAttributeType + .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrUsedMemCurrent - CUDA will perform texture gather operations on this resource -.. autoclass:: cuda.bindings.runtime.cudaGraphicsMapFlags + (value type = cuuint64_t) Amount of memory, in bytes, currently associated with graphs. - .. 
autoattribute:: cuda.bindings.runtime.cudaGraphicsMapFlags.cudaGraphicsMapFlagsNone + .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrUsedMemHigh - Default; Assume resource can be read/written + (value type = cuuint64_t) High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero. - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsMapFlags.cudaGraphicsMapFlagsReadOnly + .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrReservedMemCurrent - CUDA will not write to this resource + (value type = cuuint64_t) Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsMapFlags.cudaGraphicsMapFlagsWriteDiscard + .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrReservedMemHigh - CUDA will only write to and will not read from this resource -.. autoclass:: cuda.bindings.runtime.cudaGraphicsCubeFace + (value type = cuuint64_t) High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFacePositiveX +.. autoclass:: cuda.bindings.runtime.cudaMemcpyFlags + .. autoattribute:: cuda.bindings.runtime.cudaMemcpyFlags.cudaMemcpyFlagDefault - Positive X face of cubemap + .. autoattribute:: cuda.bindings.runtime.cudaMemcpyFlags.cudaMemcpyFlagPreferOverlapWithCompute - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFaceNegativeX + Hint to the driver to try and overlap the copy with compute work on the SMs. - Negative X face of cubemap +.. autoclass:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder + .. autoattribute:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderInvalid - .. 
autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFacePositiveY + Default invalid. - Positive Y face of cubemap + .. autoattribute:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderStream - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFaceNegativeY + Indicates that access to the source pointer must be in stream order. - Negative Y face of cubemap + .. autoattribute:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderDuringApiCall - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFacePositiveZ + Indicates that access to the source pointer can be out of stream order and all accesses must be complete before the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the need for the user to synchronize the stream after the API call. - Positive Z face of cubemap + .. autoattribute:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderAny - .. autoattribute:: cuda.bindings.runtime.cudaGraphicsCubeFace.cudaGraphicsCubeFaceNegativeZ + Indicates that access to the source pointer can be out of stream order and the accesses can happen even after the API call returns. This flag is suited for host pointers allocated outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory. Specifying this flag allows the driver to optimize the copy on certain platforms. - Negative Z face of cubemap -.. autoclass:: cuda.bindings.runtime.cudaResourceType + .. autoattribute:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderMax - .. 
autoattribute:: cuda.bindings.runtime.cudaResourceType.cudaResourceTypeArray +.. autoclass:: cuda.bindings.runtime.cudaMemcpy3DOperandType + .. autoattribute:: cuda.bindings.runtime.cudaMemcpy3DOperandType.cudaMemcpyOperandTypePointer - Array resource + Memcpy operand is a valid pointer. - .. autoattribute:: cuda.bindings.runtime.cudaResourceType.cudaResourceTypeMipmappedArray + .. autoattribute:: cuda.bindings.runtime.cudaMemcpy3DOperandType.cudaMemcpyOperandTypeArray - Mipmapped array resource + Memcpy operand is a CUarray. - .. autoattribute:: cuda.bindings.runtime.cudaResourceType.cudaResourceTypeLinear + .. autoattribute:: cuda.bindings.runtime.cudaMemcpy3DOperandType.cudaMemcpyOperandTypeMax - Linear resource +.. autoclass:: cuda.bindings.runtime.cudaDeviceP2PAttr + .. autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrPerformanceRank - .. autoattribute:: cuda.bindings.runtime.cudaResourceType.cudaResourceTypePitch2D + A relative value indicating the performance of the link between two devices - Pitch 2D resource -.. autoclass:: cuda.bindings.runtime.cudaResourceViewFormat + .. autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrAccessSupported - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatNone + Peer access is enabled - No resource view format (use underlying resource format) + .. autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrNativeAtomicSupported - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedChar1 + Native atomic operation over the link supported - 1 channel unsigned 8-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrCudaArrayAccessSupported - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedChar2 + Accessing CUDA arrays over the link supported - 2 channel unsigned 8-bit integers + .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrOnlyPartialNativeAtomicSupported - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedChar4 + Only some CUDA-valid atomic operations over the link are supported. - 4 channel unsigned 8-bit integers +.. autoclass:: cuda.bindings.runtime.cudaAtomicOperation + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerAdd - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedChar1 + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerMin - 1 channel signed 8-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerMax - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedChar2 + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerIncrement - 2 channel signed 8-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerDecrement - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedChar4 + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationAnd - 4 channel signed 8-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationOr - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedShort1 + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationXOR - 1 channel unsigned 16-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationExchange - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedShort2 + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationCAS - 2 channel unsigned 16-bit integers + .. 
autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationFloatAdd - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedShort4 + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationFloatMin - 4 channel unsigned 16-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationFloatMax - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedShort1 +.. autoclass:: cuda.bindings.runtime.cudaAtomicOperationCapability + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilitySigned - 1 channel signed 16-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityUnsigned - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedShort2 + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityReduction - 2 channel signed 16-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar32 - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedShort4 + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar64 - 4 channel signed 16-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar128 - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedInt1 + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityVector32x4 - 1 channel unsigned 32-bit integers +.. autoclass:: cuda.bindings.runtime.cudaExternalMemoryHandleType + .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeOpaqueFd - .. 
autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedInt2 + Handle is an opaque file descriptor - 2 channel unsigned 32-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeOpaqueWin32 - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedInt4 + Handle is an opaque shared NT handle - 4 channel unsigned 32-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeOpaqueWin32Kmt - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedInt1 + Handle is an opaque, globally shared handle - 1 channel signed 32-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeD3D12Heap - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedInt2 + Handle is a D3D12 heap object - 2 channel signed 32-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeD3D12Resource - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedInt4 + Handle is a D3D12 committed resource - 4 channel signed 32-bit integers + .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeD3D11Resource - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatHalf1 + Handle is a shared NT handle to a D3D11 resource - 1 channel 16-bit floating point + .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeD3D11ResourceKmt - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatHalf2 + Handle is a globally shared handle to a D3D11 resource - 2 channel 16-bit floating point + .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeNvSciBuf - .. 
autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatHalf4 + Handle is an NvSciBuf object - 4 channel 16-bit floating point +.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType + .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeOpaqueFd - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatFloat1 + Handle is an opaque file descriptor - 1 channel 32-bit floating point + .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeOpaqueWin32 - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatFloat2 + Handle is an opaque shared NT handle - 2 channel 32-bit floating point + .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatFloat4 + Handle is an opaque, globally shared handle - 4 channel 32-bit floating point + .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeD3D12Fence - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed1 + Handle is a shared NT handle referencing a D3D12 fence object - Block compressed 1 + .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeD3D11Fence - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed2 + Handle is a shared NT handle referencing a D3D11 fence object - Block compressed 2 + .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeNvSciSync - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed3 + Opaque handle to NvSciSync Object - Block compressed 3 + .. 
autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeKeyedMutex - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed4 + Handle is a shared NT handle referencing a D3D11 keyed mutex object - Block compressed 4 unsigned + .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeKeyedMutexKmt - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedBlockCompressed4 + Handle is a shared KMT handle referencing a D3D11 keyed mutex object - Block compressed 4 signed + .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed5 + Handle is an opaque handle file descriptor referencing a timeline semaphore - Block compressed 5 unsigned + .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedBlockCompressed5 + Handle is an opaque handle file descriptor referencing a timeline semaphore - Block compressed 5 signed +.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceGroup_flags + .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceGroup_flags.cudaDevSmResourceGroupDefault - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed6H + .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceGroup_flags.cudaDevSmResourceGroupBackfill - Block compressed 6 unsigned half-float + Lets smCount be a non-multiple of minCoscheduledCount, filling the difference with other SMs. - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatSignedBlockCompressed6H +.. 
autoclass:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount_flags + .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount_flags.cudaDevSmResourceSplitIgnoreSmCoscheduling - Block compressed 6 signed half-float + .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount_flags.cudaDevSmResourceSplitMaxPotentialClusterSize - .. autoattribute:: cuda.bindings.runtime.cudaResourceViewFormat.cudaResViewFormatUnsignedBlockCompressed7 +.. autoclass:: cuda.bindings.runtime.cudaDevResourceType + .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeInvalid - Block compressed 7 -.. autoclass:: cuda.bindings.runtime.cudaFuncAttribute + .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeSm - .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeMaxDynamicSharedMemorySize + Streaming multiprocessors related information - Maximum dynamic shared memory size + .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeWorkqueueConfig - .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributePreferredSharedMemoryCarveout + Workqueue configuration related information - Preferred shared memory-L1 cache split + .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeWorkqueue - .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeClusterDimMustBeSet + Pre-existing workqueue related information - Indicator to enforce valid cluster dimension specification on kernel launch +.. autoclass:: cuda.bindings.runtime.cudaDevWorkqueueConfigScope + .. autoattribute:: cuda.bindings.runtime.cudaDevWorkqueueConfigScope.cudaDevWorkqueueConfigScopeDeviceCtx - .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeRequiredClusterWidth + Use all shared workqueue resources on the device. Default driver behaviour. - Required cluster width + .. 
autoattribute:: cuda.bindings.runtime.cudaDevWorkqueueConfigScope.cudaDevWorkqueueConfigScopeGreenCtxBalanced - .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeRequiredClusterHeight + When possible, use non-overlapping workqueue resources with other balanced green contexts. - Required cluster height +.. autoclass:: cuda.bindings.runtime.cudaJitOption + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitMaxRegisters - .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeRequiredClusterDepth + Max number of registers that a thread may use. - Required cluster depth + Option type: unsigned int + Applies to: compiler only - .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeNonPortableClusterSizeAllowed + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitThreadsPerBlock - Whether non-portable cluster scheduling policy is supported + IN: Specifies minimum number of threads per block to target compilation for - .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeClusterSchedulingPolicyPreference + OUT: Returns the number of threads the compiler actually targeted. This restricts the resource utilization of the compiler (e.g. max registers) such that a block with the given number of threads should be able to launch based on register limitations. Note, this option does not currently take into account any other resource limitations, such as shared memory utilization. + Option type: unsigned int - Required cluster scheduling policy preference + Applies to: compiler only - .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeMax + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitWallTime -.. autoclass:: cuda.bindings.runtime.cudaFuncCache - .. 
autoattribute:: cuda.bindings.runtime.cudaFuncCache.cudaFuncCachePreferNone + Overwrites the option value with the total wall clock time, in milliseconds, spent in the compiler and linker + Option type: float - Default function cache configuration, no preference + Applies to: compiler and linker - .. autoattribute:: cuda.bindings.runtime.cudaFuncCache.cudaFuncCachePreferShared + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitInfoLogBuffer - Prefer larger shared memory and smaller L1 cache + Pointer to a buffer in which to print any log messages that are informational in nature (the buffer size is specified via option :py:obj:`~.cudaJitInfoLogBufferSizeBytes`) + Option type: char * - .. autoattribute:: cuda.bindings.runtime.cudaFuncCache.cudaFuncCachePreferL1 + Applies to: compiler and linker - Prefer larger L1 cache and smaller shared memory + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitInfoLogBufferSizeBytes - .. autoattribute:: cuda.bindings.runtime.cudaFuncCache.cudaFuncCachePreferEqual + IN: Log buffer size in bytes. Log messages will be capped at this size (including null terminator) + OUT: Amount of log buffer filled with messages - Prefer equal size L1 cache and shared memory + Option type: unsigned int -.. autoclass:: cuda.bindings.runtime.cudaSharedMemConfig + Applies to: compiler and linker - .. autoattribute:: cuda.bindings.runtime.cudaSharedMemConfig.cudaSharedMemBankSizeDefault + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitErrorLogBuffer - .. autoattribute:: cuda.bindings.runtime.cudaSharedMemConfig.cudaSharedMemBankSizeFourByte + Pointer to a buffer in which to print any log messages that reflect errors (the buffer size is specified via option :py:obj:`~.cudaJitErrorLogBufferSizeBytes`) - .. autoattribute:: cuda.bindings.runtime.cudaSharedMemConfig.cudaSharedMemBankSizeEightByte + Option type: char * -.. autoclass:: cuda.bindings.runtime.cudaSharedCarveout + Applies to: compiler and linker - .. 
autoattribute:: cuda.bindings.runtime.cudaSharedCarveout.cudaSharedmemCarveoutDefault + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitErrorLogBufferSizeBytes - No preference for shared memory or L1 (default) + IN: Log buffer size in bytes. Log messages will be capped at this size (including null terminator) - .. autoattribute:: cuda.bindings.runtime.cudaSharedCarveout.cudaSharedmemCarveoutMaxShared + OUT: Amount of log buffer filled with messages + Option type: unsigned int - Prefer maximum available shared memory, minimum L1 cache + Applies to: compiler and linker - .. autoattribute:: cuda.bindings.runtime.cudaSharedCarveout.cudaSharedmemCarveoutMaxL1 + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitOptimizationLevel - Prefer maximum available L1 cache, minimum shared memory + Level of optimizations to apply to generated code (0 - 4), with 4 being the default and highest level of optimizations. -.. autoclass:: cuda.bindings.runtime.cudaComputeMode + Option type: unsigned int - .. autoattribute:: cuda.bindings.runtime.cudaComputeMode.cudaComputeModeDefault + Applies to: compiler only - Default compute mode (Multiple threads can use :py:obj:`~.cudaSetDevice()` with this device) + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitFallbackStrategy - .. autoattribute:: cuda.bindings.runtime.cudaComputeMode.cudaComputeModeExclusive + Specifies choice of fallback strategy if matching cubin is not found. Choice is based on supplied :py:obj:`~.cudaJit_Fallback`. Option type: unsigned int for enumerated type :py:obj:`~.cudaJit_Fallback` + Applies to: compiler only - Compute-exclusive-thread mode (Only one thread in one process will be able to use :py:obj:`~.cudaSetDevice()` with this device) + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitGenerateDebugInfo - .. 
autoattribute:: cuda.bindings.runtime.cudaComputeMode.cudaComputeModeProhibited + Specifies whether to create debug information in output (-g) (0: false, default) - Compute-prohibited mode (No threads can use :py:obj:`~.cudaSetDevice()` with this device) + Option type: int + Applies to: compiler and linker - .. autoattribute:: cuda.bindings.runtime.cudaComputeMode.cudaComputeModeExclusiveProcess + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitLogVerbose - Compute-exclusive-process mode (Many threads in one process will be able to use :py:obj:`~.cudaSetDevice()` with this device) -.. autoclass:: cuda.bindings.runtime.cudaLimit + Generate verbose log messages (0: false, default) - .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitStackSize + Option type: int + Applies to: compiler and linker - GPU thread stack size + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitGenerateLineInfo - .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitPrintfFifoSize + Generate line number information (-lineinfo) (0: false, default) - GPU printf FIFO size + Option type: int + Applies to: compiler only - .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitMallocHeapSize + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitCacheMode - GPU malloc heap size + Specifies whether to enable caching explicitly (-dlcm) - .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitDevRuntimeSyncDepth + Choice is based on supplied :py:obj:`~.cudaJit_CacheMode`. + Option type: unsigned int for enumerated type :py:obj:`~.cudaJit_CacheMode` - GPU device runtime synchronize depth + Applies to: compiler only - .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitDevRuntimePendingLaunchCount + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitPositionIndependentCode - GPU device runtime pending launch count + Generate position independent code (0: false) + Option type: int - .. 
autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitMaxL2FetchGranularity + Applies to: compiler only - A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitMinCtaPerSm - .. autoattribute:: cuda.bindings.runtime.cudaLimit.cudaLimitPersistingL2CacheSize + This option hints to the JIT compiler the minimum number of CTAs from the kernel’s grid to be mapped to a SM. This option is ignored when used together with :py:obj:`~.cudaJitMaxRegisters` or :py:obj:`~.cudaJitThreadsPerBlock`. Optimizations based on this option need :py:obj:`~.cudaJitMaxThreadsPerBlock` to be specified as well. For kernels already using PTX directive .minnctapersm, this option will be ignored by default. Use :py:obj:`~.cudaJitOverrideDirectiveValues` to let this option take precedence over the PTX directive. Option type: unsigned int + Applies to: compiler only - A size in bytes for L2 persisting lines cache size -.. autoclass:: cuda.bindings.runtime.cudaMemoryAdvise + .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitMaxThreadsPerBlock - .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseSetReadMostly + Maximum number of threads in a thread block, computed as the product of the maximum extent specified for each dimension of the block. This limit is guaranteed not to be exceeded in any invocation of the kernel. Exceeding the maximum number of threads results in runtime error or kernel launch failure. For kernels already using PTX directive .maxntid, this option will be ignored by default. Use :py:obj:`~.cudaJitOverrideDirectiveValues` to let this option take precedence over the PTX directive. Option type: int - Data will mostly be read and only occassionally be written to + Applies to: compiler only - .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseUnsetReadMostly + ..
autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitOverrideDirectiveValues - Undo the effect of :py:obj:`~.cudaMemAdviseSetReadMostly` + This option lets the values specified using :py:obj:`~.cudaJitMaxRegisters`, :py:obj:`~.cudaJitThreadsPerBlock`, :py:obj:`~.cudaJitMaxThreadsPerBlock` and :py:obj:`~.cudaJitMinCtaPerSm` take precedence over any PTX directives. (0: Disable, default; 1: Enable) Option type: int + Applies to: compiler only - .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseSetPreferredLocation +.. autoclass:: cuda.bindings.runtime.cudaLibraryOption + .. autoattribute:: cuda.bindings.runtime.cudaLibraryOption.cudaLibraryHostUniversalFunctionAndDataTable - Set the preferred location for the data as the specified device + .. autoattribute:: cuda.bindings.runtime.cudaLibraryOption.cudaLibraryBinaryIsPreserved - .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseUnsetPreferredLocation + Specifies that the argument `code` passed to :py:obj:`~.cudaLibraryLoadData()` will be preserved. Specifying this option will let the driver know that `code` can be accessed at any point until :py:obj:`~.cudaLibraryUnload()`. The default behavior is for the driver to allocate and maintain its own copy of `code`. Note that this is only a memory usage optimization hint and the driver can choose to ignore it if required. Specifying this option with :py:obj:`~.cudaLibraryLoadFromFile()` is invalid and will return :py:obj:`~.cudaErrorInvalidValue`. - Clear the preferred location for the data +.. autoclass:: cuda.bindings.runtime.cudaJit_CacheMode + .. autoattribute:: cuda.bindings.runtime.cudaJit_CacheMode.cudaJitCacheOptionNone - .. autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy + Compile with no -dlcm flag specified - Data will be accessed by the specified device, so prevent page faults as much as possible + .. autoattribute:: cuda.bindings.runtime.cudaJit_CacheMode.cudaJitCacheOptionCG - ..
autoattribute:: cuda.bindings.runtime.cudaMemoryAdvise.cudaMemAdviseUnsetAccessedBy + Compile with L1 cache disabled - Let the Unified Memory subsystem decide on the page faulting policy for the specified device -.. autoclass:: cuda.bindings.runtime.cudaMemRangeAttribute + .. autoattribute:: cuda.bindings.runtime.cudaJit_CacheMode.cudaJitCacheOptionCA - .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributeReadMostly + Compile with L1 cache enabled - Whether the range will mostly be read and only occassionally be written to +.. autoclass:: cuda.bindings.runtime.cudaJit_Fallback + .. autoattribute:: cuda.bindings.runtime.cudaJit_Fallback.cudaPreferPtx - .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributePreferredLocation + Prefer to compile ptx if exact binary match not found - The preferred location of the range + .. autoattribute:: cuda.bindings.runtime.cudaJit_Fallback.cudaPreferBinary - .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributeAccessedBy + Prefer to fall back to compatible binary code if exact match not found - Memory range has :py:obj:`~.cudaMemAdviseSetAccessedBy` set for specified device +.. autoclass:: cuda.bindings.runtime.cudaCGScope + .. autoattribute:: cuda.bindings.runtime.cudaCGScope.cudaCGScopeInvalid - .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributeLastPrefetchLocation + Invalid cooperative group scope - The last location to which the range was prefetched + .. autoattribute:: cuda.bindings.runtime.cudaCGScope.cudaCGScopeGrid - .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributePreferredLocationType + Scope represented by a grid_group - The preferred location type of the range + .. autoattribute:: cuda.bindings.runtime.cudaCGScope.cudaCGScopeReserved - .. 
autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributePreferredLocationId + Reserved - The preferred location id of the range +.. autoclass:: cuda.bindings.runtime.cudaKernelFunctionType + .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeUnspecified - .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributeLastPrefetchLocationType + CUDA will attempt to deduce the type of the function handle - The last location type to which the range was prefetched + .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeDeviceEntry - .. autoattribute:: cuda.bindings.runtime.cudaMemRangeAttribute.cudaMemRangeAttributeLastPrefetchLocationId + Function handle is a device-entry function pointer(i.e. global function pointer) - The last location id to which the range was prefetched -.. autoclass:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesOptions + .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeKernel - .. autoattribute:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesOptions.cudaFlushGPUDirectRDMAWritesOptionHost + Function handle is a cudaKernel_t - :py:obj:`~.cudaDeviceFlushGPUDirectRDMAWrites()` and its CUDA Driver API counterpart are supported on the device. + .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeFunction - .. autoattribute:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesOptions.cudaFlushGPUDirectRDMAWritesOptionMemOps + Function handle is a cudaFunction_t - The :py:obj:`~.CU_STREAM_WAIT_VALUE_FLUSH` flag and the :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES` MemOp are supported on the CUDA device. +.. autoclass:: cuda.bindings.runtime.cudaGraphConditionalHandleFlags -.. autoclass:: cuda.bindings.runtime.cudaGPUDirectRDMAWritesOrdering + .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalHandleFlags.cudaGraphCondAssignDefault - .. 
autoattribute:: cuda.bindings.runtime.cudaGPUDirectRDMAWritesOrdering.cudaGPUDirectRDMAWritesOrderingNone + Apply default handle value when graph is launched. - The device does not natively support ordering of GPUDirect RDMA writes. :py:obj:`~.cudaFlushGPUDirectRDMAWrites()` can be leveraged if supported. +.. autoclass:: cuda.bindings.runtime.cudaGraphConditionalNodeType + .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalNodeType.cudaGraphCondTypeIf - .. autoattribute:: cuda.bindings.runtime.cudaGPUDirectRDMAWritesOrdering.cudaGPUDirectRDMAWritesOrderingOwner + Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If `size` == 2, an optional ELSE graph is created and this is executed if the condition is zero. - Natively, the device can consistently consume GPUDirect RDMA writes, although other CUDA devices may not. + .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalNodeType.cudaGraphCondTypeWhile - .. autoattribute:: cuda.bindings.runtime.cudaGPUDirectRDMAWritesOrdering.cudaGPUDirectRDMAWritesOrderingAllDevices + Conditional 'while' Node. Body executed repeatedly while condition value is non-zero. - Any CUDA device in the system can consistently consume GPUDirect RDMA writes to this device. -.. autoclass:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesScope + .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalNodeType.cudaGraphCondTypeSwitch - .. autoattribute:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesScope.cudaFlushGPUDirectRDMAWritesToOwner + Conditional 'switch' Node. Body[n] is executed once, where 'n' is the value of the condition. If the condition does not match a body index, no body is launched. - Blocks until remote writes are visible to the CUDA device context owning the data. +.. autoclass:: cuda.bindings.runtime.cudaGraphNodeType + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeKernel - .. 
autoattribute:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesScope.cudaFlushGPUDirectRDMAWritesToAllDevices + GPU kernel node - Blocks until remote writes are visible to all CUDA device contexts. -.. autoclass:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesTarget + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeMemcpy - .. autoattribute:: cuda.bindings.runtime.cudaFlushGPUDirectRDMAWritesTarget.cudaFlushGPUDirectRDMAWritesTargetCurrentDevice + Memcpy node - Sets the target for :py:obj:`~.cudaDeviceFlushGPUDirectRDMAWrites()` to the currently active CUDA device context. -.. autoclass:: cuda.bindings.runtime.cudaDeviceAttr + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeMemset - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock + Memset node - Maximum number of threads per block + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeHost - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxBlockDimX + Host (executable) node - Maximum block dimension X + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeGraph - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxBlockDimY + Node which executes an embedded graph - Maximum block dimension Y + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeEmpty - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxBlockDimZ + Empty (no-op) node - Maximum block dimension Z + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeWaitEvent - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxGridDimX + External event wait node - Maximum grid dimension X + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeEventRecord - .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxGridDimY + External event record node - Maximum grid dimension Y + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeExtSemaphoreSignal - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxGridDimZ + External semaphore signal node - Maximum grid dimension Z + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeExtSemaphoreWait - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSharedMemoryPerBlock + External semaphore wait node - Maximum shared memory available per block in bytes + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeMemAlloc - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTotalConstantMemory + Memory allocation node - Memory available on device for constant variables in a CUDA C kernel in bytes + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeMemFree - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrWarpSize + Memory free node - Warp size in threads + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeConditional - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxPitch + Conditional node May be used to implement a conditional execution path or loop - Maximum pitch in bytes allowed by memory copies + inside of a graph. The graph(s) contained within the body of the conditional node + can be selectively executed or iterated upon based on the value of a conditional - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxRegistersPerBlock + variable. - Maximum number of 32-bit registers available per block + Handles must be created in advance of creating the node - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrClockRate + using :py:obj:`~.cudaGraphConditionalHandleCreate`. 
- Peak clock frequency in kilohertz + The following restrictions apply to graphs which contain conditional nodes: - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTextureAlignment + The graph cannot be used in a child node. + Only one instantiation of the graph may exist at any point in time. - Alignment requirement for textures + The graph cannot be cloned. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGpuOverlap + To set the control value, supply a default value when creating the handle and/or - Device can possibly copy memory and execute a kernel concurrently + call :py:obj:`~.cudaGraphSetConditional` from device code. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMultiProcessorCount + .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeCount +.. autoclass:: cuda.bindings.runtime.cudaGraphChildGraphNodeOwnership - Number of multiprocessors on device + .. autoattribute:: cuda.bindings.runtime.cudaGraphChildGraphNodeOwnership.cudaGraphChildGraphOwnershipClone - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrKernelExecTimeout + Default behavior for a child graph node. Child graph is cloned into the parent and memory allocation/free nodes can't be present in the child graph. - Specifies whether there is a run time limit on kernels + .. autoattribute:: cuda.bindings.runtime.cudaGraphChildGraphNodeOwnership.cudaGraphChildGraphOwnershipMove - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrIntegrated + The child graph is moved to the parent. The handle to the child graph is owned by the parent and will be destroyed when the parent is destroyed. 
- Device is integrated with host memory + The following restrictions apply to child graphs after they have been moved: Cannot be independently instantiated or destroyed; Cannot be added as a child graph of a separate parent graph; Cannot be used as an argument to cudaGraphExecUpdate; Cannot have additional memory allocation or free nodes added. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrCanMapHostMemory + .. autoattribute:: cuda.bindings.runtime.cudaGraphChildGraphNodeOwnership.cudaGraphChildGraphOwnershipInvalid - Device can map host memory into CUDA address space + Invalid ownership flag. Set when params are queried to prevent accidentally reusing the driver-owned graph object - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrComputeMode +.. autoclass:: cuda.bindings.runtime.cudaGraphDependencyType + .. autoattribute:: cuda.bindings.runtime.cudaGraphDependencyType.cudaGraphDependencyTypeDefault - Compute mode (See :py:obj:`~.cudaComputeMode` for details) + This is an ordinary dependency. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture1DWidth + .. autoattribute:: cuda.bindings.runtime.cudaGraphDependencyType.cudaGraphDependencyTypeProgrammatic - Maximum 1D texture width + This dependency type allows the downstream node to use `cudaGridDependencySynchronize()`. It may only be used between kernel nodes, and must be used with either the :py:obj:`~.cudaGraphKernelNodePortProgrammatic` or :py:obj:`~.cudaGraphKernelNodePortLaunchCompletion` outgoing port. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DWidth +.. autoclass:: cuda.bindings.runtime.cudaGraphExecUpdateResult + .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateSuccess - Maximum 2D texture width + The update succeeded - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DHeight + .. 
autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateError - Maximum 2D texture height + The update failed for an unexpected reason which is described in the return value of the function - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DWidth + .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorTopologyChanged - Maximum 3D texture width + The update failed because the topology changed - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DHeight + .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorNodeTypeChanged - Maximum 3D texture height + The update failed because a node type changed - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DDepth + .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorFunctionChanged - Maximum 3D texture depth + The update failed because the function of a kernel node changed (CUDA driver < 11.2) - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLayeredWidth + .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorParametersChanged - Maximum 2D layered texture width + The update failed because the parameters changed in a way that is not supported - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLayeredHeight + .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorNotSupported - Maximum 2D layered texture height + The update failed because something about the node is not supported - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLayeredLayers + .. 
autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorUnsupportedFunctionChange - Maximum layers in a 2D layered texture + The update failed because the function of a kernel node changed in an unsupported way - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrSurfaceAlignment + .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorAttributesChanged - Alignment requirement for surfaces + The update failed because the node attributes changed in a way that is not supported - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrConcurrentKernels +.. autoclass:: cuda.bindings.runtime.cudaGraphInstantiateResult + .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateSuccess - Device can possibly execute multiple kernels concurrently + Instantiation succeeded - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrEccEnabled + .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateError - Device has ECC support enabled + Instantiation failed for an unexpected reason which is described in the return value of the function - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPciBusId + .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateInvalidStructure - PCI bus ID of the device + Instantiation failed due to invalid structure, such as cycles - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPciDeviceId + .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateNodeOperationNotSupported - PCI device ID of the device + Instantiation for device launch failed because the graph contained an unsupported operation - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTccDriver + .. 
autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateMultipleDevicesNotSupported - Device is using TCC driver model + Instantiation for device launch failed due to the nodes belonging to different contexts - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMemoryClockRate + .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateConditionalHandleUnused - Peak memory clock frequency in kilohertz + One or more conditional handles are not associated with conditional nodes - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGlobalMemoryBusWidth +.. autoclass:: cuda.bindings.runtime.cudaGraphKernelNodeField + .. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodeField.cudaGraphKernelNodeFieldInvalid - Global memory bus width in bits + Invalid field - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrL2CacheSize + .. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodeField.cudaGraphKernelNodeFieldGridDim - Size of L2 cache in bytes + Grid dimension update - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxThreadsPerMultiProcessor + .. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodeField.cudaGraphKernelNodeFieldParam - Maximum resident threads per multiprocessor + Kernel parameter update - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrAsyncEngineCount + .. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodeField.cudaGraphKernelNodeFieldEnabled - Number of asynchronous engines + Node enable/disable - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrUnifiedAddressing +.. autoclass:: cuda.bindings.runtime.cudaGetDriverEntryPointFlags + .. autoattribute:: cuda.bindings.runtime.cudaGetDriverEntryPointFlags.cudaEnableDefault - Device shares a unified address space with the host + Default search mode for driver symbols. - .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture1DLayeredWidth + .. autoattribute:: cuda.bindings.runtime.cudaGetDriverEntryPointFlags.cudaEnableLegacyStream - Maximum 1D layered texture width + Search for legacy versions of driver symbols. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture1DLayeredLayers + .. autoattribute:: cuda.bindings.runtime.cudaGetDriverEntryPointFlags.cudaEnablePerThreadDefaultStream - Maximum layers in a 1D layered texture + Search for per-thread versions of driver symbols. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DGatherWidth +.. autoclass:: cuda.bindings.runtime.cudaDriverEntryPointQueryResult + .. autoattribute:: cuda.bindings.runtime.cudaDriverEntryPointQueryResult.cudaDriverEntryPointSuccess - Maximum 2D texture width if cudaArrayTextureGather is set + Search for symbol found a match - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DGatherHeight + .. autoattribute:: cuda.bindings.runtime.cudaDriverEntryPointQueryResult.cudaDriverEntryPointSymbolNotFound - Maximum 2D texture height if cudaArrayTextureGather is set + Search for symbol was not found - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DWidthAlt + .. autoattribute:: cuda.bindings.runtime.cudaDriverEntryPointQueryResult.cudaDriverEntryPointVersionNotSufficent - Alternate maximum 3D texture width + Search for symbol was found but version wasn't great enough - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DHeightAlt +.. autoclass:: cuda.bindings.runtime.cudaGraphDebugDotFlags + .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsVerbose - Alternate maximum 3D texture height + Output all debug data as if every debug flag is enabled - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DDepthAlt + .. 
autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeParams - Alternate maximum 3D texture depth + Adds :py:obj:`~.cudaKernelNodeParams` to output - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPciDomainId + .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemcpyNodeParams - PCI domain ID of the device + Adds :py:obj:`~.cudaMemcpy3DParms` to output - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTexturePitchAlignment + .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemsetNodeParams - Pitch alignment requirement for textures + Adds :py:obj:`~.cudaMemsetParams` to output - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTextureCubemapWidth + .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsHostNodeParams - Maximum cubemap texture width/height + Adds :py:obj:`~.cudaHostNodeParams` to output - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTextureCubemapLayeredWidth + .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsEventNodeParams - Maximum cubemap layered texture width/height + Adds cudaEvent_t handle from record and wait nodes to output - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTextureCubemapLayeredLayers + .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasSignalNodeParams - Maximum layers in a cubemap layered texture + Adds :py:obj:`~.cudaExternalSemaphoreSignalNodeParams` values to output - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface1DWidth + .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasWaitNodeParams - Maximum 1D surface width + Adds :py:obj:`~.cudaExternalSemaphoreWaitNodeParams` to output - .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface2DWidth + .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeAttributes - Maximum 2D surface width + Adds cudaKernelNodeAttrID values to output - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface2DHeight + .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsHandles - Maximum 2D surface height + Adds node handles and every kernel function handle to output - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface3DWidth + .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsConditionalNodeParams - Maximum 3D surface width + Adds :py:obj:`~.cudaConditionalNodeParams` to output - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface3DHeight +.. autoclass:: cuda.bindings.runtime.cudaGraphInstantiateFlags + .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagAutoFreeOnLaunch - Maximum 3D surface height + Automatically free memory allocated in a graph before relaunching. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface3DDepth + .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUpload - Maximum 3D surface depth + Automatically upload the graph after instantiation. Only supported by - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface1DLayeredWidth + :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the + stream provided in `instantiateParams`. - Maximum 1D layered surface width + .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagDeviceLaunch - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface1DLayeredLayers + Instantiate the graph to be launchable from the device. 
This flag can only - Maximum layers in a 1D layered surface + be used on platforms which support unified addressing. This flag cannot be + used in conjunction with cudaGraphInstantiateFlagAutoFreeOnLaunch. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface2DLayeredWidth + .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUseNodePriority - Maximum 2D layered surface width + Run the graph using the per-node priority attributes rather than the priority of the stream it is launched into. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface2DLayeredHeight +.. autoclass:: cuda.bindings.runtime.cudaLaunchMemSyncDomain + .. autoattribute:: cuda.bindings.runtime.cudaLaunchMemSyncDomain.cudaLaunchMemSyncDomainDefault - Maximum 2D layered surface height + Launch kernels in the default domain - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurface2DLayeredLayers + .. autoattribute:: cuda.bindings.runtime.cudaLaunchMemSyncDomain.cudaLaunchMemSyncDomainRemote - Maximum layers in a 2D layered surface + Launch kernels in the remote domain - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurfaceCubemapWidth +.. autoclass:: cuda.bindings.runtime.cudaLaunchAttributePortableClusterMode + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributePortableClusterMode.cudaLaunchPortableClusterModeDefault - Maximum cubemap surface width + The default to use for allowing non-portable cluster size on launch - uses current function attribute for :py:obj:`~.cudaFuncAttributeNonPortableClusterSizeAllowed` - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurfaceCubemapLayeredWidth + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributePortableClusterMode.cudaLaunchPortableClusterModeRequirePortable - Maximum cubemap layered surface width + Specifies that the cluster size requested must be a portable size - .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSurfaceCubemapLayeredLayers + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributePortableClusterMode.cudaLaunchPortableClusterModeAllowNonPortable - Maximum layers in a cubemap layered surface + Specifies that the cluster size requested may be a non-portable size - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture1DLinearWidth +.. autoclass:: cuda.bindings.runtime.cudaSharedMemoryMode + .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeDefault - Maximum 1D linear texture width + The default to use for allowing non-portable shared memory size on launch - uses current function attributes for :py:obj:`~.cudaFuncAttributeMaxDynamicSharedMemorySize` - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLinearWidth + .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeRequirePortable - Maximum 2D linear texture width + Specifies that the shared memory size requested must be a portable size within :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlock` - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLinearHeight + .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeAllowNonPortable - Maximum 2D linear texture height + Specifies that the shared memory size requested may be a non-portable size up to :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlockOptin` - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DLinearPitch +.. autoclass:: cuda.bindings.runtime.cudaLaunchAttributeID + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeIgnore - Maximum 2D linear texture pitch in bytes + Ignored entry, for convenient composition - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DMipmappedWidth + .. 
autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow - Maximum mipmapped 2D texture width + Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DMipmappedHeight + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative - Maximum mipmapped 2D texture height + Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::cooperative. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy - Major compute capability version number + Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension - Minor compute capability version number + Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::clusterDim. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture1DMipmappedWidth + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference - Maximum mipmapped 1D texture width + Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrStreamPrioritiesSupported + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization - Device supports stream priorities + Valid for launches. 
Setting :py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed to non-0 signals that the kernel will use programmatic means to resolve its stream dependency, so that the CUDA runtime should opportunistically allow the grid's execution to overlap with the previous kernel in the stream, if that kernel requests the overlap. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGlobalL1CacheSupported + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent - Device supports caching globals in L1 + Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrLocalL1CacheSupported + The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. 
must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). - Device supports caching locals in L1 + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePriority - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSharedMemoryPerMultiprocessor + Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::priority. - Maximum shared memory available per multiprocessor in bytes + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxRegistersPerMultiprocessor + Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap. - Maximum number of 32-bit registers available per multiprocessor + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrManagedMemory + Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain. - Device can allocate managed memory on this system + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrIsMultiGpuBoard + Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. 
The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. + Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. - Device is on a multi-GPU board + This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMultiGpuBoardGroupID + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent - Unique identifier for a group of devices on the same multi-GPU board + Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the event. + Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. 
If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostNativeAtomicSupported + A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. + The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). - Link between the device and the host supports native atomic operations + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrSingleToDoublePrecisionPerfRatio + Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - Ratio of single precision performance (in floating-point operations per second) to double precision performance + :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. 
For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. + Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess + If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again. - Device supports coherently accessing pageable memory without calling cudaHostRegister on it + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrConcurrentManagedAccess + Valid for launches. On devices where the L1 cache and shared memory use the same hardware resources, setting :py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage between 0-100 signals sets the shared memory carveout preference in percent of the total shared memory for that kernel launch. 
This attribute takes precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`. This is only a hint, and the driver can choose a different configuration if required for the launch. - Device can coherently access managed memory concurrently with the CPU + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrComputePreemptionSupported + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. - Device supports Compute Preemption + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrCanUseHostPointerForRegisteredMem + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + Valid values for :py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0 (disabled) and 1 (enabled). - Device can access host registered memory at the same virtual address as the CPU + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved92 + Valid for graph nodes, launches. 
This indicates whether the kernel launch is allowed to use a non-portable cluster size. Valid values for :py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will return :py:obj:`~.cudaErrorInvalidValue` - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved93 + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeSharedMemoryMode - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved94 + Valid for graph nodes, launches. This indicates that the kernel launch is allowed to use a non-portable shared memory mode. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrCooperativeLaunch +.. autoclass:: cuda.bindings.runtime.cudaDeviceNumaConfig + .. autoattribute:: cuda.bindings.runtime.cudaDeviceNumaConfig.cudaDeviceNumaConfigNone - Device supports launching cooperative kernels via :py:obj:`~.cudaLaunchCooperativeKernel` + The GPU is not a NUMA node - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved96 + .. autoattribute:: cuda.bindings.runtime.cudaDeviceNumaConfig.cudaDeviceNumaConfigNumaNode - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSharedMemoryPerBlockOptin + The GPU is a NUMA node, cudaDevAttrNumaId contains its NUMA ID - The maximum optin shared memory per block. This value may vary by chip. See :py:obj:`~.cudaFuncSetAttribute` +.. autoclass:: cuda.bindings.runtime.cudaAsyncNotificationType + .. autoattribute:: cuda.bindings.runtime.cudaAsyncNotificationType.cudaAsyncNotificationTypeOverBudget - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrCanFlushRemoteWrites + Sent when the process has exceeded its device memory budget - Device supports flushing of outstanding remote writes. +.. autoclass:: cuda.bindings.runtime.cudaLogLevel + .. autoattribute:: cuda.bindings.runtime.cudaLogLevel.cudaLogLevelError - .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostRegisterSupported + .. autoattribute:: cuda.bindings.runtime.cudaLogLevel.cudaLogLevelWarning - Device supports host memory registration via :py:obj:`~.cudaHostRegister`. +.. autoclass:: cuda.bindings.runtime.cudaSurfaceBoundaryMode + .. autoattribute:: cuda.bindings.runtime.cudaSurfaceBoundaryMode.cudaBoundaryModeZero - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccessUsesHostPageTables + Zero boundary mode - Device accesses pageable memory via the host's page tables. + .. autoattribute:: cuda.bindings.runtime.cudaSurfaceBoundaryMode.cudaBoundaryModeClamp - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrDirectManagedMemAccessFromHost + Clamp boundary mode - Host can directly access managed memory on the device without migration. + .. autoattribute:: cuda.bindings.runtime.cudaSurfaceBoundaryMode.cudaBoundaryModeTrap - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxBlocksPerMultiprocessor + Trap boundary mode - Maximum number of blocks per multiprocessor +.. autoclass:: cuda.bindings.runtime.cudaSurfaceFormatMode + .. autoattribute:: cuda.bindings.runtime.cudaSurfaceFormatMode.cudaFormatModeForced - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxPersistingL2CacheSize + Forced format mode - Maximum L2 persisting lines capacity setting in bytes. + .. autoattribute:: cuda.bindings.runtime.cudaSurfaceFormatMode.cudaFormatModeAuto - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxAccessPolicyWindowSize + Auto format mode - Maximum value of :py:obj:`~.cudaAccessPolicyWindow.num_bytes`. +.. autoclass:: cuda.bindings.runtime.cudaTextureAddressMode + .. autoattribute:: cuda.bindings.runtime.cudaTextureAddressMode.cudaAddressModeWrap - .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReservedSharedMemoryPerBlock + Wrapping address mode - Shared memory reserved by CUDA driver per block in bytes + .. autoattribute:: cuda.bindings.runtime.cudaTextureAddressMode.cudaAddressModeClamp - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrSparseCudaArraySupported + Clamp to edge address mode - Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays + .. autoattribute:: cuda.bindings.runtime.cudaTextureAddressMode.cudaAddressModeMirror - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostRegisterReadOnlySupported + Mirror address mode - Device supports using the :py:obj:`~.cudaHostRegister` flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU + .. autoattribute:: cuda.bindings.runtime.cudaTextureAddressMode.cudaAddressModeBorder - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTimelineSemaphoreInteropSupported + Border address mode - External timeline semaphore interop is supported on the device +.. autoclass:: cuda.bindings.runtime.cudaTextureFilterMode + .. autoattribute:: cuda.bindings.runtime.cudaTextureFilterMode.cudaFilterModePoint - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported + Point filter mode - Device supports using the :py:obj:`~.cudaMallocAsync` and :py:obj:`~.cudaMemPool` family of APIs + .. autoattribute:: cuda.bindings.runtime.cudaTextureFilterMode.cudaFilterModeLinear - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGPUDirectRDMASupported + Linear filter mode - Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) +.. autoclass:: cuda.bindings.runtime.cudaTextureReadMode + .. autoattribute:: cuda.bindings.runtime.cudaTextureReadMode.cudaReadModeElementType - .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGPUDirectRDMAFlushWritesOptions + Read texture as specified element type - The returned attribute shall be interpreted as a bitmask, where the individual bits are listed in the :py:obj:`~.cudaFlushGPUDirectRDMAWritesOptions` enum + .. autoattribute:: cuda.bindings.runtime.cudaTextureReadMode.cudaReadModeNormalizedFloat - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGPUDirectRDMAWritesOrdering + Read texture as normalized float - GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See :py:obj:`~.cudaGPUDirectRDMAWritesOrdering` for the numerical values returned here. +.. autoclass:: cuda.bindings.runtime.cudaEglFrameType + .. autoattribute:: cuda.bindings.runtime.cudaEglFrameType.cudaEglFrameTypeArray - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolSupportedHandleTypes + Frame type CUDA array - Handle types supported with mempool based IPC + .. autoattribute:: cuda.bindings.runtime.cudaEglFrameType.cudaEglFrameTypePitch - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrClusterLaunch + Frame type CUDA pointer - Indicates device supports cluster launch +.. autoclass:: cuda.bindings.runtime.cudaEglResourceLocationFlags + .. autoattribute:: cuda.bindings.runtime.cudaEglResourceLocationFlags.cudaEglResourceLocationSysmem - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrDeferredMappingCudaArraySupported + Resource location sysmem - Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays + .. autoattribute:: cuda.bindings.runtime.cudaEglResourceLocationFlags.cudaEglResourceLocationVidmem - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved122 + Resource location vidmem - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved123 +.. 
autoclass:: cuda.bindings.runtime.cudaEglColorFormat + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420Planar - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved124 + Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrIpcEventSupport + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420SemiPlanar - Device supports IPC Events. + Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMemSyncDomainCount + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV422Planar - Number of memory synchronization domains the device supports. + Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved127 + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV422SemiPlanar - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved128 + Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved129 + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatARGB - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrNumaConfig + R/G/B/A four channels in one surface with BGRA byte ordering. - NUMA configuration of a device: value is of type :py:obj:`~.cudaDeviceNumaConfig` enum + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatRGBA - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrNumaId + R/G/B/A four channels in one surface with ABGR byte ordering. - NUMA node ID of the GPU memory + .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatL - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved132 + single luminance channel in one surface. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMpsEnabled + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatR - Contexts created on this device will be shared via MPS + single color channel in one surface. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostNumaId + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV444Planar - NUMA ID of the host node closest to the device or -1 when system does not support NUMA + Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrD3D12CigSupported + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV444SemiPlanar - Device supports CIG with D3D12. + Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrVulkanCigSupported + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUYV422 - Device supports CIG with Vulkan. + Y, U, V in one surface, interleaved as UYVY in one channel. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGpuPciDeviceId + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatUYVY422 - The combined 16-bit PCI device ID and 16-bit PCI vendor ID. + Y, U, V in one surface, interleaved as YUYV in one channel. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGpuPciSubsystemId + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatABGR - The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. 
+ R/G/B/A four channels in one surface with RGBA byte ordering. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved141 + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBGRA - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostNumaMemoryPoolsSupported + R/G/B/A four channels in one surface with ARGB byte ordering. - Device supports HOST_NUMA location with the :py:obj:`~.cudaMallocAsync` and :py:obj:`~.cudaMemPool` family of APIs + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatA - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostNumaMultinodeIpcSupported + Alpha color format - one channel in one surface. - Device supports HostNuma location IPC between nodes in a multi-node system. + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatRG - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostMemoryPoolsSupported + R/G color format - two channels in one surface with GR byte ordering - Device suports HOST location with the :py:obj:`~.cuMemAllocAsync` and :py:obj:`~.cuMemPool` family of APIs + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatAYUV - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved145 + Y, U, V, A four channels in one surface, interleaved as VUYA. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrOnlyPartialHostNativeAtomicSupported + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU444SemiPlanar - Link between the device and the host supports only some native atomic operations + Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMax -.. autoclass:: cuda.bindings.runtime.cudaMemPoolAttr + .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU422SemiPlanar - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies + Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. - (value type = int) Allow cuMemAllocAsync to use memory asynchronously freed in another streams as long as a stream ordering dependency of the allocating stream on the free action exists. Cuda events and null stream interactions can create the required stream ordered dependencies. (default enabled) + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420SemiPlanar - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic + Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - (value type = int) Allow reuse of already completed frees when there is no dependency between the free and allocation. (default enabled) + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_444SemiPlanar - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies + Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. - (value type = int) Allow cuMemAllocAsync to insert new stream dependencies in order to establish the stream ordering required to reuse a piece of memory released by cuFreeAsync (default enabled). + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_420SemiPlanar - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold + Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. 
- (value type = cuuint64_t) Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS. When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next call to stream, event or context synchronize. (default 0) + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_444SemiPlanar - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrReservedMemCurrent + Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. - (value type = cuuint64_t) Amount of backing memory currently allocated for the mempool. + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_420SemiPlanar - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrReservedMemHigh + Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - (value type = cuuint64_t) High watermark of backing memory allocated for the mempool since the last time it was reset. High watermark can only be reset to zero. + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatVYUY_ER - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrUsedMemCurrent + Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. - (value type = cuuint64_t) Amount of memory from the pool that is currently in use by the application. + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatUYVY_ER - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrUsedMemHigh + Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. - (value type = cuuint64_t) High watermark of the amount of memory from the pool that was in use by the application since the last time it was reset. 
High watermark can only be reset to zero. + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUYV_ER - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrAllocationType + Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. - (value type = cudaMemAllocationType) The allocation type of the mempool + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVYU_ER - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrExportHandleTypes + Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. - (value type = cudaMemAllocationHandleType) Available export handle types for the mempool. For imported pools this value is always cudaMemHandleTypeNone as an imported pool cannot be re-exported + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUVA_ER - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrLocationId + Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. - (value type = int) The location id for the mempool. If the location type for this pool is cudaMemLocationTypeInvisible then ID will be cudaInvalidDeviceId + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatAYUV_ER - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrLocationType + Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. - (value type = cudaMemLocationType) The location type for the mempool. For imported memory pools where the device is not directly visible to the importing process or pools imported via fabric handles across nodes this will be cudaMemLocationTypeInvisible + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV444Planar_ER - .. 
autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrMaxPoolSize + Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. - (value type = cuuint64_t) Maximum size of the pool in bytes, this value may be higher than what was initially passed to cudaMemPoolCreate due to alignment requirements. A value of 0 indicates no maximum size. For cudaMemAllocationTypeManaged and IPC imported pools this value will be system dependent. + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV422Planar_ER - .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrHwDecompressEnabled + Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. - (value type = int) Indicates whether the pool has hardware compresssion enabled -.. autoclass:: cuda.bindings.runtime.cudaMemLocationType + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420Planar_ER - .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeInvalid + Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeNone + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV444SemiPlanar_ER - Location is unspecified. This is used when creating a managed memory pool to indicate no preferred location for the pool + Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. - .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeDevice + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV422SemiPlanar_ER - Location is a device location, thus id is a device ordinal + Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. - .. 
autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeHost + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420SemiPlanar_ER - Location is host, id is ignored + Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeHostNuma + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU444Planar_ER - Location is a host NUMA node, thus id is a host NUMA node id + Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. - .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeHostNumaCurrent + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU422Planar_ER - Location is the host NUMA node closest to the current thread's CPU, id is ignored + Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. - .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeInvisible + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420Planar_ER - Location is not visible but device is accessible, id is always cudaInvalidDeviceId -.. autoclass:: cuda.bindings.runtime.cudaMemAccessFlags + Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - .. autoattribute:: cuda.bindings.runtime.cudaMemAccessFlags.cudaMemAccessFlagsProtNone + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU444SemiPlanar_ER - Default, make the address range not accessible + Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. - .. autoattribute:: cuda.bindings.runtime.cudaMemAccessFlags.cudaMemAccessFlagsProtRead + .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU422SemiPlanar_ER - Make the address range read accessible + Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. - .. autoattribute:: cuda.bindings.runtime.cudaMemAccessFlags.cudaMemAccessFlagsProtReadWrite + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420SemiPlanar_ER - Make the address range read-write accessible -.. autoclass:: cuda.bindings.runtime.cudaMemAllocationType + Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationType.cudaMemAllocationTypeInvalid + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerRGGB - .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationType.cudaMemAllocationTypePinned + Bayer format - one channel in one surface with interleaved RGGB ordering. - This allocation type is 'pinned', i.e. cannot migrate from its current location while the application is actively using it + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerBGGR - .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationType.cudaMemAllocationTypeManaged + Bayer format - one channel in one surface with interleaved BGGR ordering. - This allocation type is managed memory + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerGRBG - .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationType.cudaMemAllocationTypeMax -.. autoclass:: cuda.bindings.runtime.cudaMemAllocationHandleType + Bayer format - one channel in one surface with interleaved GRBG ordering. - .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypeNone + .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerGBRG - Does not allow any export mechanism. > + Bayer format - one channel in one surface with interleaved GBRG ordering. - .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypePosixFileDescriptor + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer10RGGB - Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) + Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypeWin32 + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer10BGGR - Allows a Win32 NT handle to be used for exporting. (HANDLE) + Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypeWin32Kmt + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer10GRBG - Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) + Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypeFabric + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer10GBRG - Allows a fabric handle to be used for exporting. (cudaMemFabricHandle_t) -.. autoclass:: cuda.bindings.runtime.cudaGraphMemAttributeType + Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrUsedMemCurrent + .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12RGGB - (value type = cuuint64_t) Amount of memory, in bytes, currently associated with graphs. + Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrUsedMemHigh + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12BGGR - (value type = cuuint64_t) High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero. + Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrReservedMemCurrent + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12GRBG - (value type = cuuint64_t) Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. + Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrReservedMemHigh + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12GBRG - (value type = cuuint64_t) High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. -.. autoclass:: cuda.bindings.runtime.cudaMemcpyFlags + Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaMemcpyFlags.cudaMemcpyFlagDefault + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer14RGGB - .. 
autoattribute:: cuda.bindings.runtime.cudaMemcpyFlags.cudaMemcpyFlagPreferOverlapWithCompute + Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. - Hint to the driver to try and overlap the copy with compute work on the SMs. -.. autoclass:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer14BGGR - .. autoattribute:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderInvalid + Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. - Default invalid. + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer14GRBG - .. autoattribute:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderStream + Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. - Indicates that access to the source pointer must be in stream order. + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer14GBRG - .. autoattribute:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderDuringApiCall + Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. - Indicates that access to the source pointer can be out of stream order and all accesses must be complete before the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the need for the user to synchronize the stream after the API call. + .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer20RGGB - .. autoattribute:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderAny + Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. - Indicates that access to the source pointer can be out of stream order and the accesses can happen even after the API call returns. This flag is suited for host pointers allocated outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory. Specifying this flag allows the driver to optimize the copy on certain platforms. + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer20BGGR - .. autoattribute:: cuda.bindings.runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderMax -.. autoclass:: cuda.bindings.runtime.cudaMemcpy3DOperandType + Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaMemcpy3DOperandType.cudaMemcpyOperandTypePointer + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer20GRBG - Memcpy operand is a valid pointer. + Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaMemcpy3DOperandType.cudaMemcpyOperandTypeArray + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer20GBRG - Memcpy operand is a CUarray. + Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaMemcpy3DOperandType.cudaMemcpyOperandTypeMax -.. autoclass:: cuda.bindings.runtime.cudaDeviceP2PAttr + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU444Planar - .. 
autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrPerformanceRank + Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. - A relative value indicating the performance of the link between two devices + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU422Planar - .. autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrAccessSupported + Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. - Peer access is enabled + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420Planar - .. autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrNativeAtomicSupported + Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Native atomic operation over the link supported + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerIspRGGB - .. autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrCudaArrayAccessSupported + Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. - Accessing CUDA arrays over the link supported + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerIspBGGR - .. autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrOnlyPartialNativeAtomicSupported + Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. - Only some CUDA-valid atomic operations over the link are supported. -.. autoclass:: cuda.bindings.runtime.cudaAtomicOperation + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerIspGRBG - .. 
autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerAdd + Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerMin + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerIspGBRG - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerMax + Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerIncrement + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerBCCR - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerDecrement + Bayer format - one channel in one surface with interleaved BCCR ordering. - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationAnd + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerRCCB - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationOr + Bayer format - one channel in one surface with interleaved RCCB ordering. - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationXOR + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerCRBC - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationExchange + Bayer format - one channel in one surface with interleaved CRBC ordering. - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationCAS + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayerCBRC - .. 
autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationFloatAdd + Bayer format - one channel in one surface with interleaved CBRC ordering. - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationFloatMin + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer10CCCC - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationFloatMax -.. autoclass:: cuda.bindings.runtime.cudaAtomicOperationCapability + Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilitySigned + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12BCCR - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityUnsigned + Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityReduction + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12RCCB - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar32 + Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar64 + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12CRBC - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar128 + Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. - .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityVector32x4 -.. 
autoclass:: cuda.bindings.runtime.cudaExternalMemoryHandleType + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12CBRC - .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeOpaqueFd + Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. - Handle is an opaque file descriptor + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatBayer12CCCC - .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeOpaqueWin32 + Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. - Handle is an opaque shared NT handle + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY - .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeOpaqueWin32Kmt + Color format for single Y plane. - Handle is an opaque, globally shared handle + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420SemiPlanar_2020 - .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeD3D12Heap + Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Handle is a D3D12 heap object + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420SemiPlanar_2020 - .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeD3D12Resource + Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Handle is a D3D12 committed resource + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420Planar_2020 - .. 
autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeD3D11Resource + Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Handle is a shared NT handle to a D3D11 resource + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420Planar_2020 - .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeD3D11ResourceKmt + Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Handle is a globally shared handle to a D3D11 resource + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420SemiPlanar_709 - .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeNvSciBuf + Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Handle is an NvSciBuf object -.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420SemiPlanar_709 - .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeOpaqueFd + Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Handle is an opaque file descriptor + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUV420Planar_709 - .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeOpaqueWin32 + Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Handle is an opaque shared NT handle + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVU420Planar_709 - .. 
autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt + Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Handle is an opaque, globally shared handle + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_420SemiPlanar_709 - .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeD3D12Fence + Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Handle is a shared NT handle referencing a D3D12 fence object + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_420SemiPlanar_2020 - .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeD3D11Fence + Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Handle is a shared NT handle referencing a D3D11 fence object + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_422SemiPlanar_2020 - .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeNvSciSync + Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. - Opaque handle to NvSciSync Object + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_422SemiPlanar - .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeKeyedMutex + Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. - Handle is a shared NT handle referencing a D3D11 keyed mutex object + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_422SemiPlanar_709 - .. 
autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeKeyedMutexKmt + Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. - Handle is a shared KMT handle referencing a D3D11 keyed mutex object + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY_ER - .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd + Extended Range Color format for single Y plane. - Handle is an opaque handle file descriptor referencing a timeline semaphore + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY_709_ER - .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreHandleType.cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 + Extended Range Color format for single Y plane. - Handle is an opaque handle file descriptor referencing a timeline semaphore -.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceGroup_flags + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10_ER - .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceGroup_flags.cudaDevSmResourceGroupDefault + Extended Range Color format for single Y10 plane. - .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceGroup_flags.cudaDevSmResourceGroupBackfill + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10_709_ER - Lets smCount be a non-multiple of minCoscheduledCount, filling the difference with other SMs. -.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount_flags + Extended Range Color format for single Y10 plane. - .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount_flags.cudaDevSmResourceSplitIgnoreSmCoscheduling + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12_ER - .. 
autoattribute:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount_flags.cudaDevSmResourceSplitMaxPotentialClusterSize -.. autoclass:: cuda.bindings.runtime.cudaDevResourceType + Extended Range Color format for single Y12 plane. - .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeInvalid + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12_709_ER - .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeSm + Extended Range Color format for single Y12 plane. - Streaming multiprocessors related information + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYUVA - .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeWorkqueueConfig + Y, U, V, A four channels in one surface, interleaved as AVUY. - Workqueue configuration related information + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatYVYU - .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeWorkqueue + Y, U, V in one surface, interleaved as YVYU in one channel. - Pre-existing workqueue related information -.. autoclass:: cuda.bindings.runtime.cudaDevWorkqueueConfigScope + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatVYUY - .. autoattribute:: cuda.bindings.runtime.cudaDevWorkqueueConfigScope.cudaDevWorkqueueConfigScopeDeviceCtx + Y, U, V in one surface, interleaved as VYUY in one channel. - Use all shared workqueue resources on the device. Default driver behaviour. + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_420SemiPlanar_ER - .. autoattribute:: cuda.bindings.runtime.cudaDevWorkqueueConfigScope.cudaDevWorkqueueConfigScopeGreenCtxBalanced + Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. 
- When possible, use non-overlapping workqueue resources with other balanced green contexts. -.. autoclass:: cuda.bindings.runtime.cudaJitOption + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitMaxRegisters + Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Max number of registers that a thread may use. - Option type: unsigned int + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_444SemiPlanar_ER - Applies to: compiler only + Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitThreadsPerBlock + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER - IN: Specifies minimum number of threads per block to target compilation for - OUT: Returns the number of threads the compiler actually targeted. This restricts the resource utilization of the compiler (e.g. max registers) such that a block with the given number of threads should be able to launch based on register limitations. Note, this option does not currently take into account any other resource limitations, such as shared memory utilization. + Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. - Option type: unsigned int - Applies to: compiler only + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_420SemiPlanar_ER - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitWallTime + Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. - Overwrites the option value with the total wall clock time, in milliseconds, spent in the compiler and linker + .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER - Option type: float - Applies to: compiler and linker + Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitInfoLogBuffer + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_444SemiPlanar_ER - Pointer to a buffer in which to print any log messages that are informational in nature (the buffer size is specified via option :py:obj:`~.cudaJitInfoLogBufferSizeBytes`) + Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. - Option type: char * - Applies to: compiler and linker + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitInfoLogBufferSizeBytes + Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. - IN: Log buffer size in bytes. Log messages will be capped at this size (including null terminator) + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatUYVY709 - OUT: Amount of log buffer filled with messages - Option type: unsigned int + Y, U, V in one surface, interleaved as UYVY in one channel. - Applies to: compiler and linker + .. autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatUYVY709_ER - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitErrorLogBuffer + Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. - Pointer to a buffer in which to print any log messages that reflect errors (the buffer size is specified via option :py:obj:`~.cudaJitErrorLogBufferSizeBytes`) - Option type: char * + .. 
autoattribute:: cuda.bindings.runtime.cudaEglColorFormat.cudaEglColorFormatUYVY2020 - Applies to: compiler and linker + Y, U, V in one surface, interleaved as UYVY in one channel. - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitErrorLogBufferSizeBytes +.. autoclass:: cuda.bindings.runtime.cudaDevResourceDesc_t +.. autoclass:: cuda.bindings.runtime.cudaExecutionContext_t +.. autoclass:: cuda.bindings.runtime.cudaArray_t +.. autoclass:: cuda.bindings.runtime.cudaArray_const_t +.. autoclass:: cuda.bindings.runtime.cudaMipmappedArray_t +.. autoclass:: cuda.bindings.runtime.cudaMipmappedArray_const_t +.. autoclass:: cuda.bindings.runtime.cudaHostFn_t +.. autoclass:: cuda.bindings.runtime.CUuuid +.. autoclass:: cuda.bindings.runtime.cudaUUID_t +.. autoclass:: cuda.bindings.runtime.cudaIpcEventHandle_t +.. autoclass:: cuda.bindings.runtime.cudaIpcMemHandle_t +.. autoclass:: cuda.bindings.runtime.cudaMemFabricHandle_t +.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceGroupParams +.. autoclass:: cuda.bindings.runtime.cudaDevResource +.. autoclass:: cuda.bindings.runtime.cudaStream_t +.. autoclass:: cuda.bindings.runtime.cudaEvent_t +.. autoclass:: cuda.bindings.runtime.cudaGraphicsResource_t +.. autoclass:: cuda.bindings.runtime.cudaExternalMemory_t +.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphore_t +.. autoclass:: cuda.bindings.runtime.cudaGraph_t +.. autoclass:: cuda.bindings.runtime.cudaGraphNode_t +.. autoclass:: cuda.bindings.runtime.cudaUserObject_t +.. autoclass:: cuda.bindings.runtime.cudaGraphConditionalHandle +.. autoclass:: cuda.bindings.runtime.cudaFunction_t +.. autoclass:: cuda.bindings.runtime.cudaKernel_t +.. autoclass:: cuda.bindings.runtime.cudaLibrary_t +.. autoclass:: cuda.bindings.runtime.cudaMemPool_t +.. autoclass:: cuda.bindings.runtime.cudaGraphEdgeData +.. autoclass:: cuda.bindings.runtime.cudaGraphExec_t +.. autoclass:: cuda.bindings.runtime.cudaGraphInstantiateParams +.. 
autoclass:: cuda.bindings.runtime.cudaGraphExecUpdateResultInfo +.. autoclass:: cuda.bindings.runtime.cudaGraphDeviceNode_t +.. autoclass:: cuda.bindings.runtime.cudaLaunchMemSyncDomainMap +.. autoclass:: cuda.bindings.runtime.cudaLaunchAttributeValue +.. autoclass:: cuda.bindings.runtime.cudaLaunchAttribute +.. autoclass:: cuda.bindings.runtime.cudaAsyncCallbackHandle_t +.. autoclass:: cuda.bindings.runtime.cudaAsyncNotificationInfo_t +.. autoclass:: cuda.bindings.runtime.cudaAsyncCallback +.. autoclass:: cuda.bindings.runtime.cudaLogsCallbackHandle +.. autoclass:: cuda.bindings.runtime.cudaLogIterator +.. autoclass:: cuda.bindings.runtime.cudaSurfaceObject_t +.. autoclass:: cuda.bindings.runtime.cudaTextureObject_t +.. autoclass:: cuda.bindings.runtime.cudaEglPlaneDesc +.. autoclass:: cuda.bindings.runtime.cudaEglFrame +.. autoclass:: cuda.bindings.runtime.cudaEglStreamConnection +.. autoattribute:: cuda.bindings.runtime.cudaHostAllocDefault + Default page-locked allocation flag - IN: Log buffer size in bytes. Log messages will be capped at this size (including null terminator) +.. autoattribute:: cuda.bindings.runtime.cudaHostAllocPortable - OUT: Amount of log buffer filled with messages + Pinned memory accessible by all CUDA contexts - Option type: unsigned int +.. autoattribute:: cuda.bindings.runtime.cudaHostAllocMapped - Applies to: compiler and linker + Map allocation into device space +.. autoattribute:: cuda.bindings.runtime.cudaHostAllocWriteCombined - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitOptimizationLevel + Write-combined memory +.. autoattribute:: cuda.bindings.runtime.cudaHostRegisterDefault - Level of optimizations to apply to generated code (0 - 4), with 4 being the default and highest level of optimizations. + Default host memory registration flag - Option type: unsigned int +.. autoattribute:: cuda.bindings.runtime.cudaHostRegisterPortable - Applies to: compiler only + Pinned memory accessible by all CUDA contexts +.. 
autoattribute:: cuda.bindings.runtime.cudaHostRegisterMapped - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitFallbackStrategy + Map registered memory into device space +.. autoattribute:: cuda.bindings.runtime.cudaHostRegisterIoMemory - Specifies choice of fallback strategy if matching cubin is not found. Choice is based on supplied :py:obj:`~.cudaJit_Fallback`. Option type: unsigned int for enumerated type :py:obj:`~.cudaJit_Fallback` + Memory-mapped I/O space - Applies to: compiler only +.. autoattribute:: cuda.bindings.runtime.cudaHostRegisterReadOnly + Memory-mapped read-only - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitGenerateDebugInfo +.. autoattribute:: cuda.bindings.runtime.cudaPeerAccessDefault + Default peer addressing enable flag - Specifies whether to create debug information in output (-g) (0: false, default) +.. autoattribute:: cuda.bindings.runtime.cudaStreamDefault - Option type: int + Default stream flag - Applies to: compiler and linker +.. autoattribute:: cuda.bindings.runtime.cudaStreamNonBlocking + Stream does not synchronize with stream 0 (the NULL stream) - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitLogVerbose +.. autoattribute:: cuda.bindings.runtime.cudaStreamLegacy + Legacy stream handle - Generate verbose log messages (0: false, default) - Option type: int - Applies to: compiler and linker + Stream handle that can be passed as a cudaStream_t to use an implicit stream with legacy synchronization behavior. - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitGenerateLineInfo + See details of the \link_sync_behavior - Generate line number information (-lineinfo) (0: false, default) +.. autoattribute:: cuda.bindings.runtime.cudaStreamPerThread - Option type: int + Per-thread stream handle - Applies to: compiler only - .. 
autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitCacheMode + Stream handle that can be passed as a cudaStream_t to use an implicit stream with per-thread synchronization behavior. - Specifies whether to enable caching explicitly (-dlcm) - Choice is based on supplied :py:obj:`~.cudaJit_CacheMode`. + See details of the \link_sync_behavior - Option type: unsigned int for enumerated type :py:obj:`~.cudaJit_CacheMode` +.. autoattribute:: cuda.bindings.runtime.cudaEventDefault - Applies to: compiler only + Default event flag +.. autoattribute:: cuda.bindings.runtime.cudaEventBlockingSync - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitPositionIndependentCode + Event uses blocking synchronization +.. autoattribute:: cuda.bindings.runtime.cudaEventDisableTiming - Generate position independent code (0: false) + Event will not record timing data - Option type: int +.. autoattribute:: cuda.bindings.runtime.cudaEventInterprocess - Applies to: compiler only + Event is suitable for interprocess use. cudaEventDisableTiming must be set +.. autoattribute:: cuda.bindings.runtime.cudaEventRecordDefault - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitMinCtaPerSm + Default event record flag +.. autoattribute:: cuda.bindings.runtime.cudaEventRecordExternal - This option hints to the JIT compiler the minimum number of CTAs from the kernel’s grid to be mapped to a SM. This option is ignored when used together with :py:obj:`~.cudaJitMaxRegisters` or :py:obj:`~.cudaJitThreadsPerBlock`. Optimizations based on this option need :py:obj:`~.cudaJitMaxThreadsPerBlock` to be specified as well. For kernels already using PTX directive .minnctapersm, this option will be ignored by default. Use :py:obj:`~.cudaJitOverrideDirectiveValues` to let this option take precedence over the PTX directive. Option type: unsigned int + Event is captured in the graph as an external event node when performing stream capture - Applies to: compiler only +.. 
autoattribute:: cuda.bindings.runtime.cudaEventWaitDefault + Default event wait flag - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitMaxThreadsPerBlock +.. autoattribute:: cuda.bindings.runtime.cudaEventWaitExternal + Event is captured in the graph as an external event node when performing stream capture - Maximum number threads in a thread block, computed as the product of the maximum extent specifed for each dimension of the block. This limit is guaranteed not to be exeeded in any invocation of the kernel. Exceeding the the maximum number of threads results in runtime error or kernel launch failure. For kernels already using PTX directive .maxntid, this option will be ignored by default. Use :py:obj:`~.cudaJitOverrideDirectiveValues` to let this option take precedence over the PTX directive. Option type: int +.. autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleAuto - Applies to: compiler only + Device flag - Automatic scheduling +.. autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleSpin - .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitOverrideDirectiveValues + Device flag - Spin default scheduling +.. autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleYield - This option lets the values specified using :py:obj:`~.cudaJitMaxRegisters`, :py:obj:`~.cudaJitThreadsPerBlock`, :py:obj:`~.cudaJitMaxThreadsPerBlock` and :py:obj:`~.cudaJitMinCtaPerSm` take precedence over any PTX directives. (0: Disable, default; 1: Enable) Option type: int + Device flag - Yield default scheduling - Applies to: compiler only +.. autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleBlockingSync -.. autoclass:: cuda.bindings.runtime.cudaLibraryOption + Device flag - Use blocking synchronization - .. autoattribute:: cuda.bindings.runtime.cudaLibraryOption.cudaLibraryHostUniversalFunctionAndDataTable +.. autoattribute:: cuda.bindings.runtime.cudaDeviceBlockingSync + Device flag - Use blocking synchronization [Deprecated] - .. 
autoattribute:: cuda.bindings.runtime.cudaLibraryOption.cudaLibraryBinaryIsPreserved +.. autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleMask + Device schedule flags mask - Specifes that the argument `code` passed to :py:obj:`~.cudaLibraryLoadData()` will be preserved. Specifying this option will let the driver know that `code` can be accessed at any point until :py:obj:`~.cudaLibraryUnload()`. The default behavior is for the driver to allocate and maintain its own copy of `code`. Note that this is only a memory usage optimization hint and the driver can choose to ignore it if required. Specifying this option with :py:obj:`~.cudaLibraryLoadFromFile()` is invalid and will return :py:obj:`~.cudaErrorInvalidValue`. +.. autoattribute:: cuda.bindings.runtime.cudaDeviceMapHost -.. autoclass:: cuda.bindings.runtime.cudaJit_CacheMode + Device flag - Support mapped pinned allocations - .. autoattribute:: cuda.bindings.runtime.cudaJit_CacheMode.cudaJitCacheOptionNone +.. autoattribute:: cuda.bindings.runtime.cudaDeviceLmemResizeToMax + Device flag - Keep local memory allocation after launch - Compile with no -dlcm flag specified +.. autoattribute:: cuda.bindings.runtime.cudaDeviceSyncMemops + Device flag - Ensure synchronous memory operations on this context will synchronize - .. autoattribute:: cuda.bindings.runtime.cudaJit_CacheMode.cudaJitCacheOptionCG +.. autoattribute:: cuda.bindings.runtime.cudaDeviceMask + Device flags mask - Compile with L1 cache disabled +.. autoattribute:: cuda.bindings.runtime.cudaArrayDefault + Default CUDA array allocation flag - .. autoattribute:: cuda.bindings.runtime.cudaJit_CacheMode.cudaJitCacheOptionCA +.. autoattribute:: cuda.bindings.runtime.cudaArrayLayered + Must be set in cudaMalloc3DArray to create a layered CUDA array - Compile with L1 cache enabled +.. autoattribute:: cuda.bindings.runtime.cudaArraySurfaceLoadStore -.. 
autoclass:: cuda.bindings.runtime.cudaJit_Fallback + Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array - .. autoattribute:: cuda.bindings.runtime.cudaJit_Fallback.cudaPreferPtx +.. autoattribute:: cuda.bindings.runtime.cudaArrayCubemap + Must be set in cudaMalloc3DArray to create a cubemap CUDA array - Prefer to compile ptx if exact binary match not found +.. autoattribute:: cuda.bindings.runtime.cudaArrayTextureGather + Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array - .. autoattribute:: cuda.bindings.runtime.cudaJit_Fallback.cudaPreferBinary +.. autoattribute:: cuda.bindings.runtime.cudaArrayColorAttachment + Must be set in cudaExternalMemoryGetMappedMipmappedArray if the mipmapped array is used as a color target in a graphics API - Prefer to fall back to compatible binary code if exact match not found +.. autoattribute:: cuda.bindings.runtime.cudaArraySparse -.. autoclass:: cuda.bindings.runtime.cudaCGScope + Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a sparse CUDA array or CUDA mipmapped array - .. autoattribute:: cuda.bindings.runtime.cudaCGScope.cudaCGScopeInvalid +.. autoattribute:: cuda.bindings.runtime.cudaArrayDeferredMapping + Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a deferred mapping CUDA array or CUDA mipmapped array - Invalid cooperative group scope +.. autoattribute:: cuda.bindings.runtime.cudaIpcMemLazyEnablePeerAccess + Automatically enable peer access between remote devices as needed - .. autoattribute:: cuda.bindings.runtime.cudaCGScope.cudaCGScopeGrid +.. autoattribute:: cuda.bindings.runtime.cudaMemAttachGlobal + Memory can be accessed by any stream on any device - Scope represented by a grid_group +.. autoattribute:: cuda.bindings.runtime.cudaMemAttachHost + Memory cannot be accessed by any stream on any device - .. 
autoattribute:: cuda.bindings.runtime.cudaCGScope.cudaCGScopeReserved +.. autoattribute:: cuda.bindings.runtime.cudaMemAttachSingle + Memory can only be accessed by a single stream on the associated device - Reserved +.. autoattribute:: cuda.bindings.runtime.cudaOccupancyDefault -.. autoclass:: cuda.bindings.runtime.cudaKernelFunctionType + Default behavior - .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeUnspecified +.. autoattribute:: cuda.bindings.runtime.cudaOccupancyDisableCachingOverride + Assume global caching is enabled and cannot be automatically turned off - CUDA will attempt to deduce the type of the function handle +.. autoattribute:: cuda.bindings.runtime.cudaCpuDeviceId + Device id that represents the CPU - .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeDeviceEntry +.. autoattribute:: cuda.bindings.runtime.cudaInvalidDeviceId + Device id that represents an invalid device - Function handle is a device-entry function pointer(i.e. global function pointer) +.. autoattribute:: cuda.bindings.runtime.cudaInitDeviceFlagsAreValid + Tell the CUDA runtime that DeviceFlags is being set in cudaInitDevice call - .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeKernel +.. autoattribute:: cuda.bindings.runtime.cudaArraySparsePropertiesSingleMipTail + Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers - Function handle is a cudaKernel_t +.. autoattribute:: cuda.bindings.runtime.CUDART_CB +.. autoattribute:: cuda.bindings.runtime.cudaMemPoolCreateUsageHwDecompress + This flag, if set, indicates that the memory will be used as a buffer for hardware accelerated decompression. - .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeFunction +.. autoattribute:: cuda.bindings.runtime.CU_UUID_HAS_BEEN_DEFINED + CUDA UUID types - Function handle is a cudaFunction_t +.. 
autoattribute:: cuda.bindings.runtime.CUDA_IPC_HANDLE_SIZE -.. autoclass:: cuda.bindings.runtime.cudaGraphConditionalHandleFlags + CUDA IPC Handle Size - .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalHandleFlags.cudaGraphCondAssignDefault +.. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryDedicated + Indicates that the external memory object is a dedicated resource - Apply default handle value when graph is launched. +.. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreSignalSkipNvSciBufMemSync -.. autoclass:: cuda.bindings.runtime.cudaGraphConditionalNodeType + When the /p flags parameter of :py:obj:`~.cudaExternalSemaphoreSignalParams` contains this flag, it indicates that signaling an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects. - .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalNodeType.cudaGraphCondTypeIf +.. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreWaitSkipNvSciBufMemSync + When the /p flags parameter of :py:obj:`~.cudaExternalSemaphoreWaitParams` contains this flag, it indicates that waiting an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects. - Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If `size` == 2, an optional ELSE graph is created and this is executed if the condition is zero. +.. 
autoattribute:: cuda.bindings.runtime.cudaNvSciSyncAttrSignal + When /p flags of :py:obj:`~.cudaDeviceGetNvSciSyncAttributes` is set to this, it indicates that application need signaler specific NvSciSyncAttr to be filled by :py:obj:`~.cudaDeviceGetNvSciSyncAttributes`. - .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalNodeType.cudaGraphCondTypeWhile +.. autoattribute:: cuda.bindings.runtime.cudaNvSciSyncAttrWait + When /p flags of :py:obj:`~.cudaDeviceGetNvSciSyncAttributes` is set to this, it indicates that application need waiter specific NvSciSyncAttr to be filled by :py:obj:`~.cudaDeviceGetNvSciSyncAttributes`. - Conditional 'while' Node. Body executed repeatedly while condition value is non-zero. +.. autoattribute:: cuda.bindings.runtime.RESOURCE_ABI_BYTES +.. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodePortDefault + This port activates when the kernel has finished executing. - .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalNodeType.cudaGraphCondTypeSwitch +.. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodePortProgrammatic + This port activates when all blocks of the kernel have performed cudaTriggerProgrammaticLaunchCompletion() or have terminated. It must be used with edge type :py:obj:`~.cudaGraphDependencyTypeProgrammatic`. See also :py:obj:`~.cudaLaunchAttributeProgrammaticEvent`. - Conditional 'switch' Node. Body[n] is executed once, where 'n' is the value of the condition. If the condition does not match a body index, no body is launched. +.. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodePortLaunchCompletion -.. autoclass:: cuda.bindings.runtime.cudaGraphNodeType + This port activates when all blocks of the kernel have begun execution. See also :py:obj:`~.cudaLaunchAttributeLaunchCompletionEvent`. - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeKernel +.. autoattribute:: cuda.bindings.runtime.cudaStreamAttrID +.. 
autoattribute:: cuda.bindings.runtime.cudaStreamAttributeAccessPolicyWindow +.. autoattribute:: cuda.bindings.runtime.cudaStreamAttributeSynchronizationPolicy +.. autoattribute:: cuda.bindings.runtime.cudaStreamAttributeMemSyncDomainMap +.. autoattribute:: cuda.bindings.runtime.cudaStreamAttributeMemSyncDomain +.. autoattribute:: cuda.bindings.runtime.cudaStreamAttributePriority +.. autoattribute:: cuda.bindings.runtime.cudaStreamAttrValue +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttrID +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeAccessPolicyWindow +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeCooperative +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributePriority +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeClusterDimension +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeClusterSchedulingPolicyPreference +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeMemSyncDomainMap +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeMemSyncDomain +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributePreferredSharedMemoryCarveout +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeDeviceUpdatableKernelNode +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeNvlinkUtilCentricScheduling +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttrValue +.. autoattribute:: cuda.bindings.runtime.cudaSurfaceType1D +.. autoattribute:: cuda.bindings.runtime.cudaSurfaceType2D +.. autoattribute:: cuda.bindings.runtime.cudaSurfaceType3D +.. autoattribute:: cuda.bindings.runtime.cudaSurfaceTypeCubemap +.. autoattribute:: cuda.bindings.runtime.cudaSurfaceType1DLayered +.. autoattribute:: cuda.bindings.runtime.cudaSurfaceType2DLayered +.. autoattribute:: cuda.bindings.runtime.cudaSurfaceTypeCubemapLayered +.. autoattribute:: cuda.bindings.runtime.cudaTextureType1D +.. 
autoattribute:: cuda.bindings.runtime.cudaTextureType2D +.. autoattribute:: cuda.bindings.runtime.cudaTextureType3D +.. autoattribute:: cuda.bindings.runtime.cudaTextureTypeCubemap +.. autoattribute:: cuda.bindings.runtime.cudaTextureType1DLayered +.. autoattribute:: cuda.bindings.runtime.cudaTextureType2DLayered +.. autoattribute:: cuda.bindings.runtime.cudaTextureTypeCubemapLayered +.. autoattribute:: cuda.bindings.runtime.CUDA_EGL_MAX_PLANES + Maximum number of planes per frame - GPU kernel node +Device Management +----------------- - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeMemcpy +impl_private - Memcpy node - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeMemset - Memset node +This section describes the device management functions of the CUDA runtime application programming interface. - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeHost +.. autofunction:: cuda.bindings.runtime.cudaDeviceReset +.. autofunction:: cuda.bindings.runtime.cudaDeviceSynchronize +.. autofunction:: cuda.bindings.runtime.cudaDeviceSetLimit +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetLimit +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetCacheConfig +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetStreamPriorityRange +.. autofunction:: cuda.bindings.runtime.cudaDeviceSetCacheConfig +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetByPCIBusId +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetPCIBusId +.. autofunction:: cuda.bindings.runtime.cudaIpcGetEventHandle +.. autofunction:: cuda.bindings.runtime.cudaIpcOpenEventHandle +.. autofunction:: cuda.bindings.runtime.cudaIpcGetMemHandle +.. autofunction:: cuda.bindings.runtime.cudaIpcOpenMemHandle +.. autofunction:: cuda.bindings.runtime.cudaIpcCloseMemHandle +.. autofunction:: cuda.bindings.runtime.cudaDeviceRegisterAsyncNotification +.. autofunction:: cuda.bindings.runtime.cudaDeviceUnregisterAsyncNotification +.. 
autofunction:: cuda.bindings.runtime.cudaGetDeviceCount +.. autofunction:: cuda.bindings.runtime.cudaGetDeviceProperties +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetAttribute +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetHostAtomicCapabilities +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetDefaultMemPool +.. autofunction:: cuda.bindings.runtime.cudaDeviceSetMemPool +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetMemPool +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetNvSciSyncAttributes +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetP2PAttribute +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetP2PAtomicCapabilities +.. autofunction:: cuda.bindings.runtime.cudaChooseDevice +.. autofunction:: cuda.bindings.runtime.cudaInitDevice +.. autofunction:: cuda.bindings.runtime.cudaSetDevice +.. autofunction:: cuda.bindings.runtime.cudaGetDevice +.. autofunction:: cuda.bindings.runtime.cudaSetDeviceFlags +.. autofunction:: cuda.bindings.runtime.cudaGetDeviceFlags +Error Handling +-------------- - Host (executable) node +This section describes the error handling functions of the CUDA runtime application programming interface. +.. autofunction:: cuda.bindings.runtime.cudaGetLastError +.. autofunction:: cuda.bindings.runtime.cudaPeekAtLastError +.. autofunction:: cuda.bindings.runtime.cudaGetErrorName +.. autofunction:: cuda.bindings.runtime.cudaGetErrorString - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeGraph +Stream Management +----------------- +This section describes the stream management functions of the CUDA runtime application programming interface. - Node which executes an embedded graph +.. autoclass:: cuda.bindings.runtime.cudaStreamCallback_t +.. autofunction:: cuda.bindings.runtime.cudaStreamCreate +.. autofunction:: cuda.bindings.runtime.cudaStreamCreateWithFlags +.. autofunction:: cuda.bindings.runtime.cudaStreamCreateWithPriority +.. 
autofunction:: cuda.bindings.runtime.cudaStreamGetPriority +.. autofunction:: cuda.bindings.runtime.cudaStreamGetFlags +.. autofunction:: cuda.bindings.runtime.cudaStreamGetId +.. autofunction:: cuda.bindings.runtime.cudaStreamGetDevice +.. autofunction:: cuda.bindings.runtime.cudaCtxResetPersistingL2Cache +.. autofunction:: cuda.bindings.runtime.cudaStreamCopyAttributes +.. autofunction:: cuda.bindings.runtime.cudaStreamGetAttribute +.. autofunction:: cuda.bindings.runtime.cudaStreamSetAttribute +.. autofunction:: cuda.bindings.runtime.cudaStreamDestroy +.. autofunction:: cuda.bindings.runtime.cudaStreamWaitEvent +.. autofunction:: cuda.bindings.runtime.cudaStreamAddCallback +.. autofunction:: cuda.bindings.runtime.cudaStreamSynchronize +.. autofunction:: cuda.bindings.runtime.cudaStreamQuery +.. autofunction:: cuda.bindings.runtime.cudaStreamAttachMemAsync +.. autofunction:: cuda.bindings.runtime.cudaStreamBeginCapture +.. autofunction:: cuda.bindings.runtime.cudaStreamBeginCaptureToGraph +.. autofunction:: cuda.bindings.runtime.cudaThreadExchangeStreamCaptureMode +.. autofunction:: cuda.bindings.runtime.cudaStreamEndCapture +.. autofunction:: cuda.bindings.runtime.cudaStreamIsCapturing +.. autofunction:: cuda.bindings.runtime.cudaStreamGetCaptureInfo +.. autofunction:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependencies +Event Management +---------------- - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeEmpty +This section describes the event management functions of the CUDA runtime application programming interface. +.. autofunction:: cuda.bindings.runtime.cudaEventCreate +.. autofunction:: cuda.bindings.runtime.cudaEventCreateWithFlags +.. autofunction:: cuda.bindings.runtime.cudaEventRecord +.. autofunction:: cuda.bindings.runtime.cudaEventQuery +.. autofunction:: cuda.bindings.runtime.cudaEventSynchronize +.. autofunction:: cuda.bindings.runtime.cudaEventDestroy +.. 
autofunction:: cuda.bindings.runtime.cudaEventElapsedTime - Empty (no-op) node +External Resource Interoperability +---------------------------------- +This section describes the external resource interoperability functions of the CUDA runtime application programming interface. - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeWaitEvent +.. autofunction:: cuda.bindings.runtime.cudaImportExternalMemory +.. autofunction:: cuda.bindings.runtime.cudaExternalMemoryGetMappedBuffer +.. autofunction:: cuda.bindings.runtime.cudaExternalMemoryGetMappedMipmappedArray +.. autofunction:: cuda.bindings.runtime.cudaDestroyExternalMemory +.. autofunction:: cuda.bindings.runtime.cudaImportExternalSemaphore +.. autofunction:: cuda.bindings.runtime.cudaSignalExternalSemaphoresAsync +.. autofunction:: cuda.bindings.runtime.cudaWaitExternalSemaphoresAsync +.. autofunction:: cuda.bindings.runtime.cudaDestroyExternalSemaphore +Execution Control +----------------- - External event wait node +This section describes the execution control functions of the CUDA runtime application programming interface. - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeEventRecord +Some functions have overloaded C++ API template versions documented separately in the C++ API Routines module. - External event record node +.. autofunction:: cuda.bindings.runtime.cudaFuncSetCacheConfig +.. autofunction:: cuda.bindings.runtime.cudaFuncGetAttributes +.. autofunction:: cuda.bindings.runtime.cudaFuncSetAttribute +.. autofunction:: cuda.bindings.runtime.cudaFuncGetParamCount +.. autofunction:: cuda.bindings.runtime.cudaLaunchHostFunc +.. autofunction:: cuda.bindings.runtime.cudaLaunchHostFunc_v2 +Occupancy +--------- - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeExtSemaphoreSignal +This section describes the occupancy calculation functions of the CUDA runtime application programming interface. 
- External semaphore signal node +Besides the occupancy calculator functions (cudaOccupancyMaxActiveBlocksPerMultiprocessor and cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags), there are also C++ only occupancy-based launch configuration functions documented in C++ API Routines module. - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeExtSemaphoreWait - External semaphore wait node +See cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API) cudaOccupancyAvailableDynamicSMemPerBlock (C++ API), +.. autofunction:: cuda.bindings.runtime.cudaOccupancyMaxActiveBlocksPerMultiprocessor +.. autofunction:: cuda.bindings.runtime.cudaOccupancyAvailableDynamicSMemPerBlock +.. autofunction:: cuda.bindings.runtime.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeMemAlloc +Memory Management +----------------- +This section describes the memory management functions of the CUDA runtime application programming interface. - Memory allocation node - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeMemFree +Some functions have overloaded C++ API template versions documented separately in the C++ API Routines module. +.. autofunction:: cuda.bindings.runtime.cudaMallocManaged +.. autofunction:: cuda.bindings.runtime.cudaMalloc +.. autofunction:: cuda.bindings.runtime.cudaMallocHost +.. autofunction:: cuda.bindings.runtime.cudaMallocPitch +.. autofunction:: cuda.bindings.runtime.cudaMallocArray +.. autofunction:: cuda.bindings.runtime.cudaFree +.. autofunction:: cuda.bindings.runtime.cudaFreeHost +.. autofunction:: cuda.bindings.runtime.cudaFreeArray +.. autofunction:: cuda.bindings.runtime.cudaFreeMipmappedArray +.. autofunction:: cuda.bindings.runtime.cudaHostAlloc +.. 
autofunction:: cuda.bindings.runtime.cudaHostRegister +.. autofunction:: cuda.bindings.runtime.cudaHostUnregister +.. autofunction:: cuda.bindings.runtime.cudaHostGetDevicePointer +.. autofunction:: cuda.bindings.runtime.cudaHostGetFlags +.. autofunction:: cuda.bindings.runtime.cudaMalloc3D +.. autofunction:: cuda.bindings.runtime.cudaMalloc3DArray +.. autofunction:: cuda.bindings.runtime.cudaMallocMipmappedArray +.. autofunction:: cuda.bindings.runtime.cudaGetMipmappedArrayLevel +.. autofunction:: cuda.bindings.runtime.cudaMemcpy3D +.. autofunction:: cuda.bindings.runtime.cudaMemcpy3DPeer +.. autofunction:: cuda.bindings.runtime.cudaMemcpy3DAsync +.. autofunction:: cuda.bindings.runtime.cudaMemcpy3DPeerAsync +.. autofunction:: cuda.bindings.runtime.cudaMemGetInfo +.. autofunction:: cuda.bindings.runtime.cudaArrayGetInfo +.. autofunction:: cuda.bindings.runtime.cudaArrayGetPlane +.. autofunction:: cuda.bindings.runtime.cudaArrayGetMemoryRequirements +.. autofunction:: cuda.bindings.runtime.cudaMipmappedArrayGetMemoryRequirements +.. autofunction:: cuda.bindings.runtime.cudaMemcpy +.. autofunction:: cuda.bindings.runtime.cudaMemcpyPeer +.. autofunction:: cuda.bindings.runtime.cudaMemcpy2D +.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DToArray +.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DFromArray +.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DArrayToArray +.. autofunction:: cuda.bindings.runtime.cudaMemcpyAsync +.. autofunction:: cuda.bindings.runtime.cudaMemcpyPeerAsync +.. autofunction:: cuda.bindings.runtime.cudaMemcpyBatchAsync +.. autofunction:: cuda.bindings.runtime.cudaMemcpy3DBatchAsync +.. autofunction:: cuda.bindings.runtime.cudaMemcpyWithAttributesAsync +.. autofunction:: cuda.bindings.runtime.cudaMemcpy3DWithAttributesAsync +.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DAsync +.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DToArrayAsync +.. autofunction:: cuda.bindings.runtime.cudaMemcpy2DFromArrayAsync +.. 
autofunction:: cuda.bindings.runtime.cudaMemset +.. autofunction:: cuda.bindings.runtime.cudaMemset2D +.. autofunction:: cuda.bindings.runtime.cudaMemset3D +.. autofunction:: cuda.bindings.runtime.cudaMemsetAsync +.. autofunction:: cuda.bindings.runtime.cudaMemset2DAsync +.. autofunction:: cuda.bindings.runtime.cudaMemset3DAsync +.. autofunction:: cuda.bindings.runtime.cudaMemPrefetchAsync +.. autofunction:: cuda.bindings.runtime.cudaMemPrefetchBatchAsync +.. autofunction:: cuda.bindings.runtime.cudaMemDiscardBatchAsync +.. autofunction:: cuda.bindings.runtime.cudaMemDiscardAndPrefetchBatchAsync +.. autofunction:: cuda.bindings.runtime.cudaMemAdvise +.. autofunction:: cuda.bindings.runtime.cudaMemRangeGetAttribute +.. autofunction:: cuda.bindings.runtime.cudaMemRangeGetAttributes +.. autofunction:: cuda.bindings.runtime.make_cudaPitchedPtr +.. autofunction:: cuda.bindings.runtime.make_cudaPos +.. autofunction:: cuda.bindings.runtime.make_cudaExtent - Memory free node +Stream Ordered Memory Allocator +------------------------------- +**overview** - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeConditional - Conditional node May be used to implement a conditional execution path or loop +The asynchronous allocator allows the user to allocate and free in stream order. All asynchronous accesses of the allocation must happen between the stream executions of the allocation and the free. If the memory is accessed outside of the promised stream order, a use before allocation / use after free error will cause undefined behavior. - inside of a graph. The graph(s) contained within the body of the conditional node +The allocator is free to reallocate the memory as long as it can guarantee that compliant memory accesses will not overlap temporally. The allocator may refer to internal stream ordering as well as inter-stream dependencies (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. 
The allocator may also insert inter-stream dependencies to establish the temporal guarantee. - can be selectively executed or iterated upon based on the value of a conditional - variable. - Handles must be created in advance of creating the node +**Supported Platforms** - using :py:obj:`~.cudaGraphConditionalHandleCreate`. +Whether or not a device supports the integrated stream ordered memory allocator may be queried by calling cudaDeviceGetAttribute() with the device attribute cudaDevAttrMemoryPoolsSupported. - The following restrictions apply to graphs which contain conditional nodes: +.. autofunction:: cuda.bindings.runtime.cudaMallocAsync +.. autofunction:: cuda.bindings.runtime.cudaFreeAsync +.. autofunction:: cuda.bindings.runtime.cudaMemPoolTrimTo +.. autofunction:: cuda.bindings.runtime.cudaMemPoolSetAttribute +.. autofunction:: cuda.bindings.runtime.cudaMemPoolGetAttribute +.. autofunction:: cuda.bindings.runtime.cudaMemPoolSetAccess +.. autofunction:: cuda.bindings.runtime.cudaMemPoolGetAccess +.. autofunction:: cuda.bindings.runtime.cudaMemPoolCreate +.. autofunction:: cuda.bindings.runtime.cudaMemPoolDestroy +.. autofunction:: cuda.bindings.runtime.cudaMemGetDefaultMemPool +.. autofunction:: cuda.bindings.runtime.cudaMemGetMemPool +.. autofunction:: cuda.bindings.runtime.cudaMemSetMemPool +.. autofunction:: cuda.bindings.runtime.cudaMallocFromPoolAsync +.. autofunction:: cuda.bindings.runtime.cudaMemPoolExportToShareableHandle +.. autofunction:: cuda.bindings.runtime.cudaMemPoolImportFromShareableHandle +.. autofunction:: cuda.bindings.runtime.cudaMemPoolExportPointer +.. autofunction:: cuda.bindings.runtime.cudaMemPoolImportPointer - The graph cannot be used in a child node. +Unified Addressing +------------------ - Only one instantiation of the graph may exist at any point in time. +This section describes the unified addressing functions of the CUDA runtime application programming interface. - The graph cannot be cloned. 
- To set the control value, supply a default value when creating the handle and/or - call :py:obj:`~.cudaGraphSetConditional` from device code. +**Overview** - .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeCount -.. autoclass:: cuda.bindings.runtime.cudaGraphChildGraphNodeOwnership +CUDA devices can share a unified address space with the host. - .. autoattribute:: cuda.bindings.runtime.cudaGraphChildGraphNodeOwnership.cudaGraphChildGraphOwnershipClone + For these devices there is no distinction between a device pointer and a host pointer -- the same pointer value may be used to access memory from the host program and from a kernel running on the device (with exceptions enumerated below). - Default behavior for a child graph node. Child graph is cloned into the parent and memory allocation/free nodes can't be present in the child graph. - .. autoattribute:: cuda.bindings.runtime.cudaGraphChildGraphNodeOwnership.cudaGraphChildGraphOwnershipMove +**Supported Platforms** - The child graph is moved to the parent. The handle to the child graph is owned by the parent and will be destroyed when the parent is destroyed. +Whether or not a device supports unified addressing may be queried by calling cudaGetDeviceProperties() with the device property cudaDeviceProp::unifiedAddressing. - The following restrictions apply to child graphs after they have been moved: Cannot be independently instantiated or destroyed; Cannot be added as a child graph of a separate parent graph; Cannot be used as an argument to cudaGraphExecUpdate; Cannot have additional memory allocation or free nodes added. +Unified addressing is automatically enabled in 64-bit processes . - .. autoattribute:: cuda.bindings.runtime.cudaGraphChildGraphNodeOwnership.cudaGraphChildGraphOwnershipInvalid - Invalid ownership flag. Set when params are queried to prevent accidentally reusing the driver-owned graph object -.. 
autoclass:: cuda.bindings.runtime.cudaGraphDependencyType +**Looking Up Information from Pointer Values** - .. autoattribute:: cuda.bindings.runtime.cudaGraphDependencyType.cudaGraphDependencyTypeDefault - This is an ordinary dependency. +It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function cudaPointerGetAttributes() +Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. - .. autoattribute:: cuda.bindings.runtime.cudaGraphDependencyType.cudaGraphDependencyTypeProgrammatic + The copy direction cudaMemcpyDefault may be used to specify that the CUDA runtime should infer the location of the pointer from its value. - This dependency type allows the downstream node to use `cudaGridDependencySynchronize()`. It may only be used between kernel nodes, and must be used with either the :py:obj:`~.cudaGraphKernelNodePortProgrammatic` or :py:obj:`~.cudaGraphKernelNodePortLaunchCompletion` outgoing port. -.. autoclass:: cuda.bindings.runtime.cudaGraphExecUpdateResult - .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateSuccess +**Automatic Mapping of Host Allocated Host Memory** - The update succeeded - .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateError +All host memory allocated through all devices using cudaMallocHost() and cudaHostAlloc() is always directly accessible from all devices that support unified addressing. This is the case regardless of whether or not the flags cudaHostAllocPortable and cudaHostAllocMapped are specified. 
+The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. - The update failed for an unexpected reason which is described in the return value of the function - .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorTopologyChanged +Note that this is not the case for memory allocated using the flag cudaHostAllocWriteCombined, as discussed below. - The update failed because the topology changed - .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorNodeTypeChanged +**Direct Access of Peer Memory** - The update failed because a node type changed - .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorFunctionChanged +Upon enabling direct access from a device that supports unified addressing to another peer device that supports unified addressing using cudaDeviceEnablePeerAccess() all memory allocated in the peer device using cudaMalloc() and cudaMallocPitch() will immediately be accessible by the current device. The device pointer value through which any peer's memory may be accessed in the current device is the same pointer value through which that memory may be accessed from the peer device. - The update failed because the function of a kernel node changed (CUDA driver < 11.2) - .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorParametersChanged +**Exceptions, Disjoint Addressing** - The update failed because the parameters changed in a way that is not supported - .. 
autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorNotSupported +Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. - The update failed because something about the node is not supported +This device address may be queried using cudaHostGetDevicePointer() when a device using unified addressing is current. Either the host or the unified device pointer value may be used to refer to this memory in cudaMemcpy() and similar functions using the cudaMemcpyDefault memory direction. - .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorUnsupportedFunctionChange +.. autofunction:: cuda.bindings.runtime.cudaPointerGetAttributes +Peer Device Memory Access +------------------------- - The update failed because the function of a kernel node changed in an unsupported way +This section describes the peer device memory access functions of the CUDA runtime application programming interface. +.. autofunction:: cuda.bindings.runtime.cudaDeviceCanAccessPeer +.. autofunction:: cuda.bindings.runtime.cudaDeviceEnablePeerAccess +.. autofunction:: cuda.bindings.runtime.cudaDeviceDisablePeerAccess - .. autoattribute:: cuda.bindings.runtime.cudaGraphExecUpdateResult.cudaGraphExecUpdateErrorAttributesChanged +OpenGL Interoperability +----------------------- +impl_private - The update failed because the node attributes changed in a way that is not supported -.. autoclass:: cuda.bindings.runtime.cudaGraphInstantiateResult - .. 
autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateSuccess +This section describes the OpenGL interoperability functions of the CUDA runtime application programming interface. Note that mapping of OpenGL resources is performed with the graphics API agnostic, resource mapping interface described in Graphics Interoperability. +.. autoclass:: cuda.bindings.runtime.cudaGLDeviceList - Instantiation succeeded + .. autoattribute:: cuda.bindings.runtime.cudaGLDeviceList.cudaGLDeviceListAll - .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateError + The CUDA devices for all GPUs used by the current OpenGL context - Instantiation failed for an unexpected reason which is described in the return value of the function + .. autoattribute:: cuda.bindings.runtime.cudaGLDeviceList.cudaGLDeviceListCurrentFrame - .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateInvalidStructure + The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame - Instantiation failed due to invalid structure, such as cycles + .. autoattribute:: cuda.bindings.runtime.cudaGLDeviceList.cudaGLDeviceListNextFrame - .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateNodeOperationNotSupported + The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame +.. autofunction:: cuda.bindings.runtime.cudaGLGetDevices +.. autofunction:: cuda.bindings.runtime.cudaGraphicsGLRegisterImage +.. autofunction:: cuda.bindings.runtime.cudaGraphicsGLRegisterBuffer - Instantiation for device launch failed because the graph contained an unsupported operation +Direct3D 9 Interoperability +--------------------------- - ..
autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateMultipleDevicesNotSupported - Instantiation for device launch failed due to the nodes belonging to different contexts +Direct3D 10 Interoperability +---------------------------- - .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateResult.cudaGraphInstantiateConditionalHandleUnused - One or more conditional handles are not associated with conditional nodes +Direct3D 11 Interoperability +---------------------------- -.. autoclass:: cuda.bindings.runtime.cudaGraphKernelNodeField - .. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodeField.cudaGraphKernelNodeFieldInvalid - Invalid field +VDPAU Interoperability +---------------------- +This section describes the VDPAU interoperability functions of the CUDA runtime application programming interface. - .. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodeField.cudaGraphKernelNodeFieldGridDim +.. autofunction:: cuda.bindings.runtime.cudaVDPAUGetDevice +.. autofunction:: cuda.bindings.runtime.cudaVDPAUSetVDPAUDevice +.. autofunction:: cuda.bindings.runtime.cudaGraphicsVDPAURegisterVideoSurface +.. autofunction:: cuda.bindings.runtime.cudaGraphicsVDPAURegisterOutputSurface +EGL Interoperability +-------------------- - Grid dimension update +This section describes the EGL interoperability functions of the CUDA runtime application programming interface. +.. autofunction:: cuda.bindings.runtime.cudaGraphicsEGLRegisterImage +.. autofunction:: cuda.bindings.runtime.cudaEGLStreamConsumerConnect +.. autofunction:: cuda.bindings.runtime.cudaEGLStreamConsumerConnectWithFlags +.. autofunction:: cuda.bindings.runtime.cudaEGLStreamConsumerDisconnect +.. autofunction:: cuda.bindings.runtime.cudaEGLStreamConsumerAcquireFrame +.. autofunction:: cuda.bindings.runtime.cudaEGLStreamConsumerReleaseFrame +.. autofunction:: cuda.bindings.runtime.cudaEGLStreamProducerConnect +.. 
autofunction:: cuda.bindings.runtime.cudaEGLStreamProducerDisconnect +.. autofunction:: cuda.bindings.runtime.cudaEGLStreamProducerPresentFrame +.. autofunction:: cuda.bindings.runtime.cudaEGLStreamProducerReturnFrame +.. autofunction:: cuda.bindings.runtime.cudaGraphicsResourceGetMappedEglFrame +.. autofunction:: cuda.bindings.runtime.cudaEventCreateFromEGLSync - .. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodeField.cudaGraphKernelNodeFieldParam +Graphics Interoperability +------------------------- +This section describes the graphics interoperability functions of the CUDA runtime application programming interface. - Kernel parameter update +.. autofunction:: cuda.bindings.runtime.cudaGraphicsUnregisterResource +.. autofunction:: cuda.bindings.runtime.cudaGraphicsResourceSetMapFlags +.. autofunction:: cuda.bindings.runtime.cudaGraphicsMapResources +.. autofunction:: cuda.bindings.runtime.cudaGraphicsUnmapResources +.. autofunction:: cuda.bindings.runtime.cudaGraphicsResourceGetMappedPointer +.. autofunction:: cuda.bindings.runtime.cudaGraphicsSubResourceGetMappedArray +.. autofunction:: cuda.bindings.runtime.cudaGraphicsResourceGetMappedMipmappedArray +Texture Object Management +------------------------- - .. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodeField.cudaGraphKernelNodeFieldEnabled +This section describes the low level texture object management functions of the CUDA runtime application programming interface. The texture object API is only supported on devices of compute capability 3.0 or higher. +.. autofunction:: cuda.bindings.runtime.cudaGetChannelDesc +.. autofunction:: cuda.bindings.runtime.cudaCreateChannelDesc +.. autofunction:: cuda.bindings.runtime.cudaCreateTextureObject +.. autofunction:: cuda.bindings.runtime.cudaDestroyTextureObject +.. autofunction:: cuda.bindings.runtime.cudaGetTextureObjectResourceDesc +.. autofunction:: cuda.bindings.runtime.cudaGetTextureObjectTextureDesc +.. 
autofunction:: cuda.bindings.runtime.cudaGetTextureObjectResourceViewDesc - Node enable/disable +Surface Object Management +------------------------- -.. autoclass:: cuda.bindings.runtime.cudaGetDriverEntryPointFlags +This section describes the low level texture object management functions of the CUDA runtime application programming interface. The surface object API is only supported on devices of compute capability 3.0 or higher. - .. autoattribute:: cuda.bindings.runtime.cudaGetDriverEntryPointFlags.cudaEnableDefault +.. autofunction:: cuda.bindings.runtime.cudaCreateSurfaceObject +.. autofunction:: cuda.bindings.runtime.cudaDestroySurfaceObject +.. autofunction:: cuda.bindings.runtime.cudaGetSurfaceObjectResourceDesc +Version Management +------------------ - Default search mode for driver symbols. - .. autoattribute:: cuda.bindings.runtime.cudaGetDriverEntryPointFlags.cudaEnableLegacyStream +.. autofunction:: cuda.bindings.runtime.cudaDriverGetVersion +.. autofunction:: cuda.bindings.runtime.cudaRuntimeGetVersion +.. autofunction:: cuda.bindings.runtime.getLocalRuntimeVersion +Error Log Management Functions +------------------------------ - Search for legacy versions of driver symbols. +This section describes the error log management functions of the CUDA runtime application programming interface. The Error Log Management interface will operate on both the CUDA Driver and CUDA Runtime. +.. autoclass:: cuda.bindings.runtime.cudaLogsCallback_t +.. autofunction:: cuda.bindings.runtime.cudaLogsRegisterCallback +.. autofunction:: cuda.bindings.runtime.cudaLogsUnregisterCallback +.. autofunction:: cuda.bindings.runtime.cudaLogsCurrent +.. autofunction:: cuda.bindings.runtime.cudaLogsDumpToFile +.. autofunction:: cuda.bindings.runtime.cudaLogsDumpToMemory - .. 
autoattribute:: cuda.bindings.runtime.cudaGetDriverEntryPointFlags.cudaEnablePerThreadDefaultStream +Graph Management +---------------- +This section describes the graph management functions of CUDA runtime application programming interface. - Search for per-thread versions of driver symbols. +.. autofunction:: cuda.bindings.runtime.cudaGraphCreate +.. autofunction:: cuda.bindings.runtime.cudaGraphAddKernelNode +.. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeGetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeSetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeCopyAttributes +.. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeGetAttribute +.. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeSetAttribute +.. autofunction:: cuda.bindings.runtime.cudaGraphAddMemcpyNode +.. autofunction:: cuda.bindings.runtime.cudaGraphMemcpyNodeGetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphMemcpyNodeSetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphAddMemsetNode +.. autofunction:: cuda.bindings.runtime.cudaGraphMemsetNodeGetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphMemsetNodeSetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphAddHostNode +.. autofunction:: cuda.bindings.runtime.cudaGraphHostNodeGetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphHostNodeSetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphAddChildGraphNode +.. autofunction:: cuda.bindings.runtime.cudaGraphChildGraphNodeGetGraph +.. autofunction:: cuda.bindings.runtime.cudaGraphAddEmptyNode +.. autofunction:: cuda.bindings.runtime.cudaGraphClone +.. autofunction:: cuda.bindings.runtime.cudaGraphNodeFindInClone +.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetType +.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetContainingGraph +.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetLocalId +.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetToolsId +.. 
autofunction:: cuda.bindings.runtime.cudaGraphGetId +.. autofunction:: cuda.bindings.runtime.cudaGraphExecGetId +.. autofunction:: cuda.bindings.runtime.cudaGraphGetNodes +.. autofunction:: cuda.bindings.runtime.cudaGraphGetRootNodes +.. autofunction:: cuda.bindings.runtime.cudaGraphGetEdges +.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetDependencies +.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetDependentNodes +.. autofunction:: cuda.bindings.runtime.cudaGraphAddDependencies +.. autofunction:: cuda.bindings.runtime.cudaGraphRemoveDependencies +.. autofunction:: cuda.bindings.runtime.cudaGraphDestroyNode +.. autofunction:: cuda.bindings.runtime.cudaGraphInstantiate +.. autofunction:: cuda.bindings.runtime.cudaGraphInstantiateWithParams +.. autofunction:: cuda.bindings.runtime.cudaGraphExecGetFlags +.. autofunction:: cuda.bindings.runtime.cudaGraphExecKernelNodeSetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphExecMemcpyNodeSetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphExecMemsetNodeSetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphExecHostNodeSetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphExecUpdate +.. autofunction:: cuda.bindings.runtime.cudaGraphLaunch +.. autofunction:: cuda.bindings.runtime.cudaGraphExecDestroy +.. autofunction:: cuda.bindings.runtime.cudaGraphDestroy +.. autofunction:: cuda.bindings.runtime.cudaGraphDebugDotPrint +.. autofunction:: cuda.bindings.runtime.cudaUserObjectCreate +.. autofunction:: cuda.bindings.runtime.cudaUserObjectRetain +.. autofunction:: cuda.bindings.runtime.cudaUserObjectRelease +.. autofunction:: cuda.bindings.runtime.cudaGraphRetainUserObject +.. autofunction:: cuda.bindings.runtime.cudaGraphReleaseUserObject +.. autofunction:: cuda.bindings.runtime.cudaGraphAddNode +.. autofunction:: cuda.bindings.runtime.cudaGraphNodeSetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetParams +.. 
autofunction:: cuda.bindings.runtime.cudaGraphExecNodeSetParams +.. autofunction:: cuda.bindings.runtime.cudaGraphConditionalHandleCreate +.. autofunction:: cuda.bindings.runtime.cudaGraphConditionalHandleCreate_v2 -.. autoclass:: cuda.bindings.runtime.cudaDriverEntryPointQueryResult +Driver Entry Point Access +------------------------- - .. autoattribute:: cuda.bindings.runtime.cudaDriverEntryPointQueryResult.cudaDriverEntryPointSuccess +This section describes the driver entry point access functions of CUDA runtime application programming interface. +.. autofunction:: cuda.bindings.runtime.cudaGetDriverEntryPoint +.. autofunction:: cuda.bindings.runtime.cudaGetDriverEntryPointByVersion - Search for symbol found a match +Library Management +------------------ +This section describes the library management functions of the CUDA runtime application programming interface. - .. autoattribute:: cuda.bindings.runtime.cudaDriverEntryPointQueryResult.cudaDriverEntryPointSymbolNotFound +.. autofunction:: cuda.bindings.runtime.cudaLibraryLoadData +.. autofunction:: cuda.bindings.runtime.cudaLibraryLoadFromFile +.. autofunction:: cuda.bindings.runtime.cudaLibraryUnload +.. autofunction:: cuda.bindings.runtime.cudaLibraryGetKernel +.. autofunction:: cuda.bindings.runtime.cudaLibraryGetGlobal +.. autofunction:: cuda.bindings.runtime.cudaLibraryGetManaged +.. autofunction:: cuda.bindings.runtime.cudaLibraryGetUnifiedFunction +.. autofunction:: cuda.bindings.runtime.cudaLibraryGetKernelCount +.. autofunction:: cuda.bindings.runtime.cudaLibraryEnumerateKernels +.. autofunction:: cuda.bindings.runtime.cudaKernelSetAttributeForDevice +Execution Context Management +---------------------------- - Search for symbol was not found +This section describes the execution context management functions of the CUDA runtime application programming interface. - .. 
autoattribute:: cuda.bindings.runtime.cudaDriverEntryPointQueryResult.cudaDriverEntryPointVersionNotSufficent - Search for symbol was found but version wasn't great enough -.. autoclass:: cuda.bindings.runtime.cudaGraphDebugDotFlags +**Overview** - .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsVerbose - Output all debug data as if every debug flag is enabled +A CUDA execution context cudaExecutionContext_t serves as an abstraction for the contexts exposed by the CUDA Runtime, specifically green contexts and the primary context, and provides a unified programming model and API interface for contexts in the Runtime. +There are two primary ways today to obtain an execution context: - .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeParams +- cudaDeviceGetExecutionCtx: Returns the execution context that corresponds to the primary context of the specified device. - Adds :py:obj:`~.cudaKernelNodeParams` to output - .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemcpyNodeParams - Adds :py:obj:`~.cudaMemcpy3DParms` to output +- cudaGreenCtxCreate: Creates a green context with the specified resources and returns an execution context. - .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemsetNodeParams - Adds :py:obj:`~.cudaMemsetParams` to output - .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsHostNodeParams - Adds :py:obj:`~.cudaHostNodeParams` to output - .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsEventNodeParams +Once you have an execution context at hand, you can perform context-level operations via the CUDA Runtime APIs. This includes: +- Submitting work via streams created with cudaExecutionCtxStreamCreate. - Adds cudaEvent_t handle from record and wait nodes to output - .. 
autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasSignalNodeParams - Adds :py:obj:`~.cudaExternalSemaphoreSignalNodeParams` values to output - .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasWaitNodeParams +- Querying context via cudaExecutionCtxGetDevResource, cudaExecutionCtxGetDevice, etc. - Adds :py:obj:`~.cudaExternalSemaphoreWaitNodeParams` to output - .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeAttributes - Adds cudaKernelNodeAttrID values to output +- Synchronizing and tracking context-level operations via cudaExecutionCtxSynchronize, cudaExecutionCtxRecordEvent, cudaExecutionCtxWaitEvent. - .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsHandles - Adds node handles and every kernel function handle to output - .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsConditionalNodeParams - Adds :py:obj:`~.cudaConditionalNodeParams` to output +- Performing context-level graph node operations via cudaGraphAddNode by specifying the context in ``nodeParams``\ . Note that individual node creation APIs, such as cudaGraphAddKernelNode, do not support specifying an execution context. -.. autoclass:: cuda.bindings.runtime.cudaGraphInstantiateFlags - .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagAutoFreeOnLaunch - Automatically free memory allocated in a graph before relaunching. - .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUpload - Automatically upload the graph after instantiation. Only supported by - :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the +Note: The above APIs take in an explicit cudaExecutionContext_t handle and ignores the context that is current to the calling thread. 
This enables explicit context-based programming without relying on thread-local state. If no context is specified, the APIs return cudaErrorInvalidValue. - stream provided in `instantiateParams`. +Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into driver-level contexts, such as ::CUcontext or ::CUgreenCtx. - .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagDeviceLaunch - Instantiate the graph to be launchable from the device. This flag can only - be used on platforms which support unified addressing. This flag cannot be +**Lifetime of CUDA Resources** - used in conjunction with cudaGraphInstantiateFlagAutoFreeOnLaunch. - .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUseNodePriority +The lifetime of CUDA resources (memory, streams, events, modules, etc) is not tied to the lifetime of the execution context. Their lifetime is tied to the device against which they were created. As such, usage of cudaDeviceReset() should be avoided to persist the lifetime of these resources. - Run the graph using the per-node priority attributes rather than the priority of the stream it is launched into. -.. autoclass:: cuda.bindings.runtime.cudaLaunchMemSyncDomain - .. autoattribute:: cuda.bindings.runtime.cudaLaunchMemSyncDomain.cudaLaunchMemSyncDomainDefault +**APIs Operating on Current Context** - Launch kernels in the default domain - .. autoattribute:: cuda.bindings.runtime.cudaLaunchMemSyncDomain.cudaLaunchMemSyncDomainRemote +The CUDA runtime does not provide a way to set an execution context as current. Since the majority of the runtime APIs operate on the current context, we document below how the developer can work with these APIs. - Launch kernels in the remote domain -.. 
autoclass:: cuda.bindings.runtime.cudaLaunchAttributePortableClusterMode +**APIs Operating on Device Resources** - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributePortableClusterMode.cudaLaunchPortableClusterModeDefault - The default to use for allowing non-portable cluster size on launch - uses current function attribute for :py:obj:`~.cudaFuncAttributeNonPortableClusterSizeAllowed` +To work with these APIs (for example, cudaMalloc, cudaEventCreate, etc), developers are expected to call cudaSetDevice() prior to invoking them. Doing so does not impact functional correctness as these APIs operate on resources that are device-wide. If users have a context handle at hand, they can get the device handle from the context handle using cudaExecutionCtxGetDevice(). - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributePortableClusterMode.cudaLaunchPortableClusterModeRequirePortable - Specifies that the cluster size requested must be a portable size +**APIs Operating on Context Resources** - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributePortableClusterMode.cudaLaunchPortableClusterModeAllowNonPortable - Specifies that the cluster size requested may be a non-portable size +These APIs (for example, cudaLaunchKernel, cudaMemcpyAsync, cudaMemsetAsync, etc) take in a stream and resources are inferred from the context bound to the stream at creation. See cudaExecutionCtxStreamCreate for more details. Developers are expected to use the stream-based APIs for context awareness and always pass an explicit stream handle to ensure context-awareness, and avoid reliance on the default NULL stream, which implicitly binds to the current context. -.. autoclass:: cuda.bindings.runtime.cudaSharedMemoryMode - .. 
autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeDefault - The default to use for allowing non-portable shared memory size on launch - uses current function attributes for :py:obj:`~.cudaFuncAttributeMaxDynamicSharedMemorySize` - .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeRequirePortable +**Green Contexts** - Specifies that the shared memory size requested must be a portable size within :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlock` - .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeAllowNonPortable +Green contexts are a lightweight alternative to traditional contexts, that can be used to select a subset of device resources. This allows the developer to, for example, select SMs from distinct spatial partitions of the GPU and target them via CUDA stream operations, kernel launches, etc. +Here are the broad initial steps to follow to get started: - Specifies that the shared memory size requested may be a non-portable size up to :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlockOptin` +- (1) Start with an initial set of resources. For SM resources, they can be fetched via cudaDeviceGetDevResource. In case of workqueues, a new configuration can be used or an existing one queried via the cudaDeviceGetDevResource API. -.. autoclass:: cuda.bindings.runtime.cudaLaunchAttributeID - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeIgnore - Ignored entry, for convenient composition - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow +- (2) Modify these resources by either partitioning them (in case of SMs) or changing the configuration (in case of workqueues). To partition SMs, we recommend cudaDevSmResourceSplit. Changing the workqueue configuration can be done directly in place. - Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`. - .. 
autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative - Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.cooperative`. - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy +- (3) Finalize the specification of resources by creating a descriptor via cudaDevResourceGenerateDesc. - Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`. - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension - Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.clusterDim`. +- (4) Create a green context via cudaGreenCtxCreate. This provisions the resource, such as workqueues (until this step it was only a configuration specification). - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference - Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`. - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization - Valid for launches. Setting :py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed` to non-0 signals that the kernel will use programmatic means to resolve its stream dependency, so that the CUDA runtime should opportunistically allow the grid's execution to overlap with the previous kernel in the stream, if that kernel requests the overlap. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). +- (5) Create a stream via cudaExecutionCtxStreamCreate, and use it throughout your application. - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent - Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. 
Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. - The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePriority - Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.priority`. +SMs +There are two possible partition operations - with cudaDevSmResourceSplitByCount the partitions created have to follow default SM count granularity requirements, so it will often be rounded up and aligned to a default value. On the other hand, cudaDevSmResourceSplit is explicit and allows for creation of non-equal groups. It will not round up automatically - instead it is the developer’s responsibility to query and set the correct values. These requirements can be queried with cudaDeviceGetDevResource to determine the alignment granularity (sm.smCoscheduledAlignment). A general guideline on the default values for each compute architecture: - .. 
autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap +- On Compute Architecture 7.X, 8.X, and all Tegra SoC: - Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`. - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain + - The smCount must be a multiple of 2. - Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`. - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension - Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. - Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). 
In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. - This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted. + - The alignment (and default value of coscheduledSmCount) is 2. - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent - Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event. - Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. - A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. - The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). - .. 
autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode +- On Compute Architecture 9.0+: - Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. - Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. - If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again. 
+ - The smCount must be a multiple of 8, or coscheduledSmCount if provided. - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout - Valid for launches. On devices where the L1 cache and shared memory use the same hardware resources, setting :py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage between 0-100 signals sets the shared memory carveout preference in percent of the total shared memory for that kernel launch. This attribute takes precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`. This is only a hint, and the driver can choose a different configuration if required for the launch. - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling + - The alignment (and default value of coscheduledSmCount) is 8. While the maximum value for coscheduled SM count is 32 on all Compute Architecture 9.0+, it's recommended to follow cluster size requirements. The portable cluster size and the max cluster size should be used in order to benefit from this co-scheduling. - Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. - When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. - This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. 
applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. - Valid values for :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled). - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode - Valid for graph nodes, launches. This indicates whether the kernel launch is allowed to use a non-portable cluster size. Valid values for :py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will return :py:obj:`~.cudaErrorInvalidValue` +Workqueues - .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeSharedMemoryMode +For ``cudaDevResourceTypeWorkqueueConfig``\ , the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit``\ field. The ``sharingScope``\ field determines how workqueue resources are shared: +- ``cudaDevWorkqueueConfigScopeDeviceCtx:``\ Use all shared workqueue resources across all contexts (default driver behavior). - Valid for graph nodes, launches. This indicates that the kernel launch is allowed to use a non-portable shared memory mode. -.. autoclass:: cuda.bindings.runtime.cudaDeviceNumaConfig - .. autoattribute:: cuda.bindings.runtime.cudaDeviceNumaConfig.cudaDeviceNumaConfigNone - The GPU is not a NUMA node - .. autoattribute:: cuda.bindings.runtime.cudaDeviceNumaConfig.cudaDeviceNumaConfigNumaNode +- ``cudaDevWorkqueueConfigScopeGreenCtxBalanced:``\ When possible, use non-overlapping workqueue resources with other balanced green contexts. - The GPU is a NUMA node, cudaDevAttrNumaId contains its NUMA ID -.. autoclass:: cuda.bindings.runtime.cudaAsyncNotificationType - .. 
autoattribute:: cuda.bindings.runtime.cudaAsyncNotificationType.cudaAsyncNotificationTypeOverBudget - Sent when the process has exceeded its device memory budget -.. autoclass:: cuda.bindings.runtime.cudaLogLevel - .. autoattribute:: cuda.bindings.runtime.cudaLogLevel.cudaLogLevelError +The maximum concurrency limit depends on ::CUDA_DEVICE_MAX_CONNECTIONS and can be queried from the device via cudaDeviceGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping. - .. autoattribute:: cuda.bindings.runtime.cudaLogLevel.cudaLogLevelWarning +For ``cudaDevResourceTypeWorkqueue``\ , the resource represents a pre-existing workqueue that can be retrieved from existing execution contexts. This allows reusing workqueue resources across different execution contexts. -.. autoclass:: cuda.bindings.runtime.cudaTextureObject_t -.. autoclass:: cuda.bindings.runtime.cudaSurfaceObject_t -.. autoclass:: cuda.bindings.runtime.cudaEglPlaneDesc -.. autoclass:: cuda.bindings.runtime.cudaEglFrame -.. autoclass:: cuda.bindings.runtime.cudaEglStreamConnection -.. autoclass:: cuda.bindings.runtime.cudaDevResourceDesc_t -.. autoclass:: cuda.bindings.runtime.cudaExecutionContext_t -.. autoclass:: cuda.bindings.runtime.cudaArray_t -.. autoclass:: cuda.bindings.runtime.cudaArray_const_t -.. autoclass:: cuda.bindings.runtime.cudaMipmappedArray_t -.. autoclass:: cuda.bindings.runtime.cudaMipmappedArray_const_t -.. autoclass:: cuda.bindings.runtime.cudaHostFn_t -.. autoclass:: cuda.bindings.runtime.CUuuid -.. autoclass:: cuda.bindings.runtime.cudaUUID_t -.. autoclass:: cuda.bindings.runtime.cudaIpcEventHandle_t -.. autoclass:: cuda.bindings.runtime.cudaIpcMemHandle_t -.. autoclass:: cuda.bindings.runtime.cudaMemFabricHandle_t -.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceGroupParams -.. autoclass:: cuda.bindings.runtime.cudaDevResource -.. autoclass:: cuda.bindings.runtime.cudaStream_t -.. 
autoclass:: cuda.bindings.runtime.cudaEvent_t -.. autoclass:: cuda.bindings.runtime.cudaGraphicsResource_t -.. autoclass:: cuda.bindings.runtime.cudaExternalMemory_t -.. autoclass:: cuda.bindings.runtime.cudaExternalSemaphore_t -.. autoclass:: cuda.bindings.runtime.cudaGraph_t -.. autoclass:: cuda.bindings.runtime.cudaGraphNode_t -.. autoclass:: cuda.bindings.runtime.cudaUserObject_t -.. autoclass:: cuda.bindings.runtime.cudaGraphConditionalHandle -.. autoclass:: cuda.bindings.runtime.cudaFunction_t -.. autoclass:: cuda.bindings.runtime.cudaKernel_t -.. autoclass:: cuda.bindings.runtime.cudaLibrary_t -.. autoclass:: cuda.bindings.runtime.cudaMemPool_t -.. autoclass:: cuda.bindings.runtime.cudaGraphEdgeData -.. autoclass:: cuda.bindings.runtime.cudaGraphExec_t -.. autoclass:: cuda.bindings.runtime.cudaGraphInstantiateParams -.. autoclass:: cuda.bindings.runtime.cudaGraphExecUpdateResultInfo -.. autoclass:: cuda.bindings.runtime.cudaGraphDeviceNode_t -.. autoclass:: cuda.bindings.runtime.cudaLaunchMemSyncDomainMap -.. autoclass:: cuda.bindings.runtime.cudaLaunchAttributeValue -.. autoclass:: cuda.bindings.runtime.cudaLaunchAttribute -.. autoclass:: cuda.bindings.runtime.cudaAsyncCallbackHandle_t -.. autoclass:: cuda.bindings.runtime.cudaAsyncNotificationInfo_t -.. autoclass:: cuda.bindings.runtime.cudaAsyncCallback -.. autoclass:: cuda.bindings.runtime.cudaLogsCallbackHandle -.. autoclass:: cuda.bindings.runtime.cudaLogIterator -.. autoattribute:: cuda.bindings.runtime.cudaTextureType1D -.. autoattribute:: cuda.bindings.runtime.cudaTextureType2D -.. autoattribute:: cuda.bindings.runtime.cudaTextureType3D -.. autoattribute:: cuda.bindings.runtime.cudaTextureTypeCubemap -.. autoattribute:: cuda.bindings.runtime.cudaTextureType1DLayered -.. autoattribute:: cuda.bindings.runtime.cudaTextureType2DLayered -.. autoattribute:: cuda.bindings.runtime.cudaTextureTypeCubemapLayered -.. autoattribute:: cuda.bindings.runtime.cudaSurfaceType1D -.. 
autoattribute:: cuda.bindings.runtime.cudaSurfaceType2D -.. autoattribute:: cuda.bindings.runtime.cudaSurfaceType3D -.. autoattribute:: cuda.bindings.runtime.cudaSurfaceTypeCubemap -.. autoattribute:: cuda.bindings.runtime.cudaSurfaceType1DLayered -.. autoattribute:: cuda.bindings.runtime.cudaSurfaceType2DLayered -.. autoattribute:: cuda.bindings.runtime.cudaSurfaceTypeCubemapLayered -.. autoattribute:: cuda.bindings.runtime.CUDA_EGL_MAX_PLANES +On Concurrency - Maximum number of planes per frame +Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources that could cause a dependency. Using a combination of disjoint SMs and ``cudaDevWorkqueueConfigScopeGreenCtxBalanced``\ workqueue configurations can provide the best chance of avoiding interference. More resources will be added in the future to provide stronger guarantees. -.. autoattribute:: cuda.bindings.runtime.cudaHostAllocDefault +Additionally, there are two known scenarios where it's possible for the workload to run on more SMs than was provisioned (but never less). - Default page-locked allocation flag -.. autoattribute:: cuda.bindings.runtime.cudaHostAllocPortable - Pinned memory accessible by all CUDA contexts +- On Volta+ MPS: When ``CUDA_MPS_ACTIVE_THREAD_PERCENTAGE``\ is used, the set of SMs that are used for running kernels can be scaled up to the value of SMs used for the MPS client. -.. autoattribute:: cuda.bindings.runtime.cudaHostAllocMapped - Map allocation into device space -.. autoattribute:: cuda.bindings.runtime.cudaHostAllocWriteCombined - Write-combined memory -.. autoattribute:: cuda.bindings.runtime.cudaHostRegisterDefault - Default host memory registration flag -.. 
autoattribute:: cuda.bindings.runtime.cudaHostRegisterPortable +- On Compute Architecture 9.x: When a module with dynamic parallelism (CDP) is loaded, all future kernels running under green contexts may use and share an additional set of 2 SMs. - Pinned memory accessible by all CUDA contexts +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetDevResource +.. autofunction:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount +.. autofunction:: cuda.bindings.runtime.cudaDevSmResourceSplit +.. autofunction:: cuda.bindings.runtime.cudaDevResourceGenerateDesc +.. autofunction:: cuda.bindings.runtime.cudaGreenCtxCreate +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxDestroy +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxGetDevResource +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxGetDevice +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxGetId +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxStreamCreate +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxSynchronize +.. autofunction:: cuda.bindings.runtime.cudaStreamGetDevResource +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxRecordEvent +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxWaitEvent +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetExecutionCtx -.. autoattribute:: cuda.bindings.runtime.cudaHostRegisterMapped +C++ API Routines +---------------- +C++-style interface built on top of CUDA runtime API. +impl_private - Map registered memory into device space -.. autoattribute:: cuda.bindings.runtime.cudaHostRegisterIoMemory - Memory-mapped I/O space -.. autoattribute:: cuda.bindings.runtime.cudaHostRegisterReadOnly - Memory-mapped read-only -.. autoattribute:: cuda.bindings.runtime.cudaPeerAccessDefault - Default peer addressing enable flag +This section describes the C++ high level API functions of the CUDA runtime application programming interface. 
To use these functions, your application needs to be compiled with the ``nvcc``\ compiler. -.. autoattribute:: cuda.bindings.runtime.cudaStreamDefault - Default stream flag +Interactions with the CUDA Driver API +------------------------------------- -.. autoattribute:: cuda.bindings.runtime.cudaStreamNonBlocking +This section describes the interactions between the CUDA Driver API and the CUDA Runtime API - Stream does not synchronize with stream 0 (the NULL stream) -.. autoattribute:: cuda.bindings.runtime.cudaStreamLegacy - Legacy stream handle +**Execution Contexts** - Stream handle that can be passed as a cudaStream_t to use an implicit stream with legacy synchronization behavior. +The CUDA Runtime provides cudaExecutionContext_t as an abstraction over driver-level contexts—specifically, green contexts and the primary context. - See details of the \link_sync_behavior +There are two primary ways to obtain an execution context: -.. autoattribute:: cuda.bindings.runtime.cudaStreamPerThread +- cudaDeviceGetExecutionCtx: Returns the execution context that corresponds to the primary context of the specified device. - Per-thread stream handle - Stream handle that can be passed as a cudaStream_t to use an implicit stream with per-thread synchronization behavior. - See details of the \link_sync_behavior +- cudaGreenCtxCreate: Creates a green context with the specified resources and returns an execution context. -.. autoattribute:: cuda.bindings.runtime.cudaEventDefault - Default event flag -.. autoattribute:: cuda.bindings.runtime.cudaEventBlockingSync - Event uses blocking synchronization -.. autoattribute:: cuda.bindings.runtime.cudaEventDisableTiming - Event will not record timing data -.. autoattribute:: cuda.bindings.runtime.cudaEventInterprocess - Event is suitable for interprocess use. cudaEventDisableTiming must be set -.. 
autoattribute:: cuda.bindings.runtime.cudaEventRecordDefault +Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into a ::CUcontext or ::CUgreenCtx. - Default event record flag -.. autoattribute:: cuda.bindings.runtime.cudaEventRecordExternal - Event is captured in the graph as an external event node when performing stream capture -.. autoattribute:: cuda.bindings.runtime.cudaEventWaitDefault - Default event wait flag +**Primary Context (aka Device Execution Context)** -.. autoattribute:: cuda.bindings.runtime.cudaEventWaitExternal - Event is captured in the graph as an external event node when performing stream capture -.. autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleAuto +The primary context is the default execution context associated with a device in the Runtime. It can be obtained via a call to cudaDeviceGetExecutionCtx(). There is a one-to-one mapping between CUDA devices in the runtime and their primary contexts within a process. - Device flag - Automatic scheduling +From the CUDA Runtime’s perspective, a device and its primary context are functionally synonymous. -.. autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleSpin +Unless explicitly overridden, either by making a different context current via the Driver API (e.g., ::cuCtxSetCurrent()) or by using an explicit execution context handle, the Runtime will implicitly initialize and use the primary context for API calls as needed. - Device flag - Spin default scheduling -.. autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleYield - Device flag - Yield default scheduling -.. autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleBlockingSync - Device flag - Use blocking synchronization +**Initialization and Tear-Down** -.. autoattribute:: cuda.bindings.runtime.cudaDeviceBlockingSync - Device flag - Use blocking synchronization [Deprecated] -.. 
autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleMask +Unless an explicit execution context is specified (see “Execution Context Management” for APIs), CUDA Runtime API calls operate on the CUDA Driver ::CUcontext which is current to the calling host thread. If no ::CUcontext is current to the calling thread when a CUDA Runtime API call which requires an active context is made, then the primary context (device execution context) for a device will be selected, made current to the calling thread, and initialized. The context will be initialized using the parameters specified by the CUDA Runtime API functions cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), ::cudaD3D10SetDirect3DDevice(), ::cudaD3D11SetDirect3DDevice(), cudaGLSetGLDevice(), and cudaVDPAUSetVDPAUDevice(). Note that these functions will fail with cudaErrorSetOnActiveProcess if they are called when the primary context for the specified device has already been initialized, except for cudaSetDeviceFlags() which will simply overwrite the previous settings. - Device schedule flags mask +The function cudaInitDevice() ensures that the primary context is initialized for the requested device but does not make it current to the calling thread. -.. autoattribute:: cuda.bindings.runtime.cudaDeviceMapHost +The function cudaSetDevice() initializes the primary context for the specified device and makes it current to the calling thread by calling ::cuCtxSetCurrent(). - Device flag - Support mapped pinned allocations +Primary contexts will remain active until they are explicitly deinitialized using cudaDeviceReset(). The function cudaDeviceReset() will deinitialize the primary context for the calling thread's current device immediately. The context will remain current to all of the threads that it was current to. The next CUDA Runtime API call on any thread which requires an active context will trigger the reinitialization of that device's primary context. -.. 
autoattribute:: cuda.bindings.runtime.cudaDeviceLmemResizeToMax +Note that primary contexts are shared resources. It is recommended that the primary context not be reset except just before exit or to recover from an unspecified launch failure. - Device flag - Keep local memory allocation after launch -.. autoattribute:: cuda.bindings.runtime.cudaDeviceSyncMemops - Device flag - Ensure synchronous memory operations on this context will synchronize -.. autoattribute:: cuda.bindings.runtime.cudaDeviceMask - Device flags mask +**CUcontext Interoperability** -.. autoattribute:: cuda.bindings.runtime.cudaArrayDefault - Default CUDA array allocation flag -.. autoattribute:: cuda.bindings.runtime.cudaArrayLayered +Note that the use of multiple ::CUcontext s per device within a single process will substantially degrade performance and is strongly discouraged. Instead, it is highly recommended to either use execution contexts cudaExecutionContext_t or the implicit one-to-one device-to-primary context mapping for the process provided by the CUDA Runtime API. - Must be set in cudaMalloc3DArray to create a layered CUDA array +If a non-primary ::CUcontext created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ::CUcontext, with some exceptions listed below. Interoperability between data types is discussed in the following sections. -.. autoattribute:: cuda.bindings.runtime.cudaArraySurfaceLoadStore +The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary CUcontext is current. To use the peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features. - Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array +All CUDA Runtime API state (e.g., global variables' addresses and values) travels with its underlying ::CUcontext.
In particular, if a ::CUcontext is moved from one thread to another then all CUDA Runtime API state will move to that thread as well. -.. autoattribute:: cuda.bindings.runtime.cudaArrayCubemap +Please note that attaching to legacy CUcontext (those with a version of 3010 as returned by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return cudaErrorIncompatibleDriverContext in such cases. - Must be set in cudaMalloc3DArray to create a cubemap CUDA array -.. autoattribute:: cuda.bindings.runtime.cudaArrayTextureGather - Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array -.. autoattribute:: cuda.bindings.runtime.cudaArrayColorAttachment - Must be set in cudaExternalMemoryGetMappedMipmappedArray if the mipmapped array is used as a color target in a graphics API +**Interactions between CUstream and cudaStream_t** -.. autoattribute:: cuda.bindings.runtime.cudaArraySparse - Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a sparse CUDA array or CUDA mipmapped array -.. autoattribute:: cuda.bindings.runtime.cudaArrayDeferredMapping +The types ::CUstream and cudaStream_t are identical and may be used interchangeably. - Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a deferred mapping CUDA array or CUDA mipmapped array -.. autoattribute:: cuda.bindings.runtime.cudaIpcMemLazyEnablePeerAccess - Automatically enable peer access between remote devices as needed -.. autoattribute:: cuda.bindings.runtime.cudaMemAttachGlobal - Memory can be accessed by any stream on any device +**Interactions between CUevent and cudaEvent_t** -.. autoattribute:: cuda.bindings.runtime.cudaMemAttachHost - Memory cannot be accessed by any stream on any device -.. autoattribute:: cuda.bindings.runtime.cudaMemAttachSingle +The types ::CUevent and cudaEvent_t are identical and may be used interchangeably. 
- Memory can only be accessed by a single stream on the associated device -.. autoattribute:: cuda.bindings.runtime.cudaOccupancyDefault - Default behavior -.. autoattribute:: cuda.bindings.runtime.cudaOccupancyDisableCachingOverride - Assume global caching is enabled and cannot be automatically turned off +**Interactions between CUarray and cudaArray_t** -.. autoattribute:: cuda.bindings.runtime.cudaCpuDeviceId - Device id that represents the CPU -.. autoattribute:: cuda.bindings.runtime.cudaInvalidDeviceId +The types ::CUarray and struct ::cudaArray * represent the same data type and may be used interchangeably by casting the two types between each other. - Device id that represents an invalid device +In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *, it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *. -.. autoattribute:: cuda.bindings.runtime.cudaInitDeviceFlagsAreValid +In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray, it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray . - Tell the CUDA runtime that DeviceFlags is being set in cudaInitDevice call -.. autoattribute:: cuda.bindings.runtime.cudaArraySparsePropertiesSingleMipTail - Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers -.. autoattribute:: cuda.bindings.runtime.CUDART_CB -.. autoattribute:: cuda.bindings.runtime.cudaMemPoolCreateUsageHwDecompress - This flag, if set, indicates that the memory will be used as a buffer for hardware accelerated decompression. +**Interactions between CUgraphicsResource and cudaGraphicsResource_t** -.. autoattribute:: cuda.bindings.runtime.CU_UUID_HAS_BEEN_DEFINED - CUDA UUID types -.. 
autoattribute:: cuda.bindings.runtime.CUDA_IPC_HANDLE_SIZE +The types ::CUgraphicsResource and cudaGraphicsResource_t represent the same data type and may be used interchangeably by casting the two types between each other. - CUDA IPC Handle Size +In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource to a cudaGraphicsResource_t. -.. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryDedicated +In order to use a cudaGraphicsResource_t in a CUDA Driver API function which takes a ::CUgraphicsResource, it is necessary to explicitly cast the cudaGraphicsResource_t to a ::CUgraphicsResource. - Indicates that the external memory object is a dedicated resource -.. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreSignalSkipNvSciBufMemSync - When the /p flags parameter of :py:obj:`~.cudaExternalSemaphoreSignalParams` contains this flag, it indicates that signaling an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects. -.. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreWaitSkipNvSciBufMemSync - When the /p flags parameter of :py:obj:`~.cudaExternalSemaphoreWaitParams` contains this flag, it indicates that waiting an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects. +**Interactions between CUtexObject and cudaTextureObject_t** -.. 
autoattribute:: cuda.bindings.runtime.cudaNvSciSyncAttrSignal - When /p flags of :py:obj:`~.cudaDeviceGetNvSciSyncAttributes` is set to this, it indicates that application need signaler specific NvSciSyncAttr to be filled by :py:obj:`~.cudaDeviceGetNvSciSyncAttributes`. -.. autoattribute:: cuda.bindings.runtime.cudaNvSciSyncAttrWait +The types ::CUtexObject and cudaTextureObject_t represent the same data type and may be used interchangeably by casting the two types between each other. - When /p flags of :py:obj:`~.cudaDeviceGetNvSciSyncAttributes` is set to this, it indicates that application need waiter specific NvSciSyncAttr to be filled by :py:obj:`~.cudaDeviceGetNvSciSyncAttributes`. +In order to use a ::CUtexObject in a CUDA Runtime API function which takes a cudaTextureObject_t, it is necessary to explicitly cast the ::CUtexObject to a cudaTextureObject_t. -.. autoattribute:: cuda.bindings.runtime.RESOURCE_ABI_BYTES -.. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodePortDefault +In order to use a cudaTextureObject_t in a CUDA Driver API function which takes a ::CUtexObject, it is necessary to explicitly cast the cudaTextureObject_t to a ::CUtexObject. - This port activates when the kernel has finished executing. -.. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodePortProgrammatic - This port activates when all blocks of the kernel have performed cudaTriggerProgrammaticLaunchCompletion() or have terminated. It must be used with edge type :py:obj:`~.cudaGraphDependencyTypeProgrammatic`. See also :py:obj:`~.cudaLaunchAttributeProgrammaticEvent`. -.. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodePortLaunchCompletion - This port activates when all blocks of the kernel have begun execution. See also :py:obj:`~.cudaLaunchAttributeLaunchCompletionEvent`. +**Interactions between CUsurfObject and cudaSurfaceObject_t** -.. autoattribute:: cuda.bindings.runtime.cudaStreamAttrID -.. 
autoattribute:: cuda.bindings.runtime.cudaStreamAttributeAccessPolicyWindow -.. autoattribute:: cuda.bindings.runtime.cudaStreamAttributeSynchronizationPolicy -.. autoattribute:: cuda.bindings.runtime.cudaStreamAttributeMemSyncDomainMap -.. autoattribute:: cuda.bindings.runtime.cudaStreamAttributeMemSyncDomain -.. autoattribute:: cuda.bindings.runtime.cudaStreamAttributePriority -.. autoattribute:: cuda.bindings.runtime.cudaStreamAttrValue -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttrID -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeAccessPolicyWindow -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeCooperative -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributePriority -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeClusterDimension -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeClusterSchedulingPolicyPreference -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeMemSyncDomainMap -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeMemSyncDomain -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributePreferredSharedMemoryCarveout -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeDeviceUpdatableKernelNode -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeNvlinkUtilCentricScheduling -.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttrValue + + +The types ::CUsurfObject and cudaSurfaceObject_t represent the same data type and may be used interchangeably by casting the two types between each other. + +In order to use a ::CUsurfObject in a CUDA Runtime API function which takes a cudaSurfaceObject_t, it is necessary to explicitly cast the ::CUsurfObject to a cudaSurfaceObject_t. + +In order to use a cudaSurfaceObject_t in a CUDA Driver API function which takes a ::CUsurfObject, it is necessary to explicitly cast the cudaSurfaceObject_t to a ::CUsurfObject. 
+ + + + + +**Interactions between CUfunction and cudaFunction_t** + + + +The types ::CUfunction and cudaFunction_t represent the same data type and may be used interchangeably by casting the two types between each other. + +In order to use a cudaFunction_t in a CUDA Driver API function which takes a ::CUfunction, it is necessary to explicitly cast the cudaFunction_t to a ::CUfunction. + + + + + +**Interactions between CUkernel and cudaKernel_t** + + + +The types ::CUkernel and cudaKernel_t represent the same data type and may be used interchangeably by casting the two types between each other. + +In order to use a cudaKernel_t in a CUDA Driver API function which takes a ::CUkernel, it is necessary to explicitly cast the cudaKernel_t to a ::CUkernel. + +.. autofunction:: cuda.bindings.runtime.cudaGetKernel + +Profiler Control +---------------- + +This section describes the profiler control functions of the CUDA runtime application programming interface. + +.. autofunction:: cuda.bindings.runtime.cudaProfilerStart +.. autofunction:: cuda.bindings.runtime.cudaProfilerStop