diff --git a/CMakeLists.txt b/CMakeLists.txt index ca3016a..4eb81c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,7 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") +set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xptxas -O3") ################ # Core Library # diff --git a/genmetaballs/src/cuda/bindings.cu b/genmetaballs/src/cuda/bindings.cu index ac31a1c..b305c72 100644 --- a/genmetaballs/src/cuda/bindings.cu +++ b/genmetaballs/src/cuda/bindings.cu @@ -228,6 +228,17 @@ NB_MODULE(_genmetaballs_bindings, m) { bind_array2d(utils, "CPUFloatArray2D"); bind_array2d(utils, "GPUFloatArray2D"); + // bind dim3, which is used to specify the launch configuration for the kernel + nb::class_(utils, "dim3") + .def(nb::init(), nb::arg("x") = 1, nb::arg("y") = 1, + nb::arg("z") = 1) + .def_prop_ro("x", [](const dim3& self) { return self.x; }) + .def_prop_ro("y", [](const dim3& self) { return self.y; }) + .def_prop_ro("z", [](const dim3& self) { return self.z; }) + .def("__repr__", [](const dim3& self) { + return nb::str("dim3(x={}, y={}, z={})").format(self.x, self.y, self.z); + }); + } // NB_MODULE(_genmetaballs_bindings) template @@ -319,5 +330,5 @@ void bind_render_fmbs(nb::module_& m, const char* name) { &render_fmbs, LinearIntersector, Blender, Confidence>, "Render the given FMB scene into the provided image view", nb::arg("fmbs"), nb::arg("blender"), nb::arg("confidence"), nb::arg("intr"), nb::arg("extr"), - nb::arg("img")); + nb::arg("img"), nb::arg("grid_size"), nb::arg("block_size")); } diff --git a/genmetaballs/src/cuda/core/camera.cu b/genmetaballs/src/cuda/core/camera.cu index a3d133e..def98a3 100644 --- a/genmetaballs/src/cuda/core/camera.cu +++ b/genmetaballs/src/cuda/core/camera.cu @@ -26,9 +26,10 @@ CUDA_CALLABLE PixelCoordRange::Iterator& PixelCoordRange::Iterator::operator++() return *this; } -CUDA_CALLABLE bool PixelCoordRange::Sentinel::operator==(const Iterator& it) const { +CUDA_CALLABLE bool operator!=(const PixelCoordRange::Iterator& it, + const PixelCoordRange::Sentinel& sentinel) { // stop if we reach the end of rows, or if the range is empty - return it.py >= py_end || it.px_start >= it.px_end || it.py_start >= py_end; + return it.py < sentinel.py_end && it.px_start < it.px_end && it.py_start < sentinel.py_end; } CUDA_CALLABLE PixelCoordRange::Iterator PixelCoordRange::begin() const { diff --git a/genmetaballs/src/cuda/core/camera.cuh b/genmetaballs/src/cuda/core/camera.cuh index eaac978..c395247 100644 --- a/genmetaballs/src/cuda/core/camera.cuh +++ b/genmetaballs/src/cuda/core/camera.cuh @@ -49,11 +49,11 @@ struct PixelCoordRange { // the Sentinel class only needs to hold the stop value (i.e. final row) struct Sentinel { uint32_t py_end; - - // stopping criterion: true if current row (py) reaches py_end - CUDA_CALLABLE bool operator==(const Iterator& it) const; }; + // stopping criterion: true if current row (py) reaches py_end + friend CUDA_CALLABLE bool operator!=(const Iterator& it, const Sentinel& sentinel); + // range methods CUDA_CALLABLE Iterator begin() const; CUDA_CALLABLE Sentinel end() const; diff --git a/genmetaballs/src/cuda/core/forward.cuh b/genmetaballs/src/cuda/core/forward.cuh index 2e03170..164ba0c 100644 --- a/genmetaballs/src/cuda/core/forward.cuh +++ b/genmetaballs/src/cuda/core/forward.cuh @@ -9,10 +9,6 @@ #include "image.cuh" #include "utils.cuh" -// TODO: tune this number -constexpr auto NUM_BLOCKS = dim3(4, 4); -constexpr auto THREADS_PER_BLOCK = dim3(16, 16); - CUDA_CALLABLE PixelCoordRange get_pixel_coords(const dim3 thread_idx, const dim3 block_idx, const dim3 block_dim, const dim3 grid_dim, const Intrinsics& intr); @@ -49,7 +45,8 @@ __global__ void render_kernel(const FMBScene& fmbs, cons template void render_fmbs(const FMBScene& fmbs, const Blender& blender, const Confidence& confidence, const Intrinsics& intr, const Pose& extr, - ImageView img) { + ImageView img, const dim3 grid_size, + const dim3 block_size) { render_kernel - <<>>(fmbs, blender, confidence, intr, extr, img); + <<>>(fmbs, blender, confidence, intr, extr, img); } diff --git a/genmetaballs/src/genmetaballs/core/__init__.py b/genmetaballs/src/genmetaballs/core/__init__.py index a90aa5d..51a2afb 100644 --- a/genmetaballs/src/genmetaballs/core/__init__.py +++ b/genmetaballs/src/genmetaballs/core/__init__.py @@ -12,7 +12,12 @@ ) from genmetaballs._genmetaballs_bindings.fmb import FMB, CPUFMBScene, GPUFMBScene from genmetaballs._genmetaballs_bindings.image import CPUImage, GPUImage -from genmetaballs._genmetaballs_bindings.utils import CPUFloatArray2D, GPUFloatArray2D, sigmoid +from genmetaballs._genmetaballs_bindings.utils import ( + CPUFloatArray2D, + GPUFloatArray2D, + dim3, + sigmoid, +) type DeviceType = Literal["cpu", "gpu"] @@ -83,6 +88,8 @@ def render_fmbs( intr: Intrinsics, extr: geometry.Pose, img: GPUImage | None = None, + grid_size: dim3 = dim3(4, 4), + block_size: dim3 = dim3(16, 16), ) -> GPUImage: """Render the given FMB scene into the provided image view. @@ -105,7 +112,7 @@ def render_fmbs( else: raise TypeError("Unsupported blender and confidence combination.") - render_func(fmbs, blender, confidence, intr, extr, img.as_view()) + render_func(fmbs, blender, confidence, intr, extr, img.as_view(), grid_size, block_size) return img @@ -122,6 +129,7 @@ def render_fmbs( "Camera", "FourParameterBlender", "FMB", + "dim3", "Intrinsics", "ThreeParameterBlender", "TwoParameterConfidence", diff --git a/tests/cpp_tests/test_confidence.cu b/tests/cpp_tests/test_confidence.cu index bc7b990..6693628 100644 --- a/tests/cpp_tests/test_confidence.cu +++ b/tests/cpp_tests/test_confidence.cu @@ -101,7 +101,7 @@ TEST(GpuConfidenceTest, ConfidenceMultipleValuesGPU_AllTypes) { std::vector actual; if (conf_case.is_two_param) { - TwoParameterConfidence conf(conf_case.beta4, conf_case.beta5); + TwoParameterConfidence conf{conf_case.beta4, conf_case.beta5}; actual = gpu_get_confidence(sumexpd_vec, conf); } else { ZeroParameterConfidence conf;