diff --git a/source/isaaclab/isaaclab/physics/physics_manager.py b/source/isaaclab/isaaclab/physics/physics_manager.py index 5b94ede481d5..5676fa8ef02b 100644 --- a/source/isaaclab/isaaclab/physics/physics_manager.py +++ b/source/isaaclab/isaaclab/physics/physics_manager.py @@ -265,6 +265,17 @@ def step(cls) -> None: """Step physics simulation by one timestep (physics only, no rendering).""" pass + @classmethod + def pre_render(cls) -> None: + """Sync deferred physics state to the rendering backend. + + Called by :meth:`~isaaclab.sim.SimulationContext.render` before cameras + and visualizers read scene data. The default implementation is a no-op. + Backends that defer transform writes (e.g. Newton's dirty-flag pattern) + should override this to flush pending updates. + """ + pass + @classmethod def close(cls) -> None: """Clean up physics resources. diff --git a/source/isaaclab/isaaclab/sim/simulation_context.py b/source/isaaclab/isaaclab/sim/simulation_context.py index 1d8551137f93..bb1daf6b0e5d 100644 --- a/source/isaaclab/isaaclab/sim/simulation_context.py +++ b/source/isaaclab/isaaclab/sim/simulation_context.py @@ -620,6 +620,7 @@ def render(self, mode: int | None = None) -> None: every physics step). Camera sensors drive their configured renderer when fetching data, so this method remains backend-agnostic. """ + self.physics_manager.pre_render() self.update_visualizers(self.get_rendering_dt()) # Call render callbacks diff --git a/source/isaaclab_newton/isaaclab_newton/physics/_cubric.py b/source/isaaclab_newton/isaaclab_newton/physics/_cubric.py new file mode 100644 index 000000000000..f9c4d0da4454 --- /dev/null +++ b/source/isaaclab_newton/isaaclab_newton/physics/_cubric.py @@ -0,0 +1,268 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Pure-Python ctypes bindings for the cubric GPU transform-hierarchy API. + +Acquires the ``omni::cubric::IAdapter`` carb interface directly from the +Carbonite framework and wraps its function-pointer methods so that Newton +can call cubric's GPU transform propagation without C++ pybind11 changes. + +The flow mirrors PhysX's ``DirectGpuHelper::updateXForms_GPU()``: + +1. ``IAdapter::create`` → allocate a cubric adapter ID +2. ``IAdapter::bindToStage`` → bind to the current Fabric stage +3. ``IAdapter::compute`` → GPU kernel: propagate world transforms +4. ``IAdapter::release`` → free the adapter + +When cubric is unavailable (e.g. CPU-only machine, plugin not loaded), the +caller falls back to the CPU ``update_world_xforms()`` path. +""" + +from __future__ import annotations + +import ctypes +import logging + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Carb Framework struct layout (CARB_ABI function-pointer offsets, x86_64) +# --------------------------------------------------------------------------- +# Counting only CARB_ABI fields from the top of ``struct Framework``: +# 0: loadPluginsEx +# 8: unloadAllPlugins +# 16: acquireInterfaceWithClient +# 24: tryAcquireInterfaceWithClient ← we use this one +_FW_OFF_TRY_ACQUIRE = 24 + +# --------------------------------------------------------------------------- +# IAdapter struct layout (from omni/cubric/IAdapter.h) +# --------------------------------------------------------------------------- +# 0: getAttribute +# 8: create(AdapterId*) +# 16: refcount +# 24: retain +# 32: release(AdapterId) +# 40: bindToStage(AdapterId, const FabricId&) +# 48: unbind +# 56: compute(AdapterId, options, dirtyMode, outFlags*) +_IA_OFF_CREATE = 8 +_IA_OFF_RELEASE = 32 +_IA_OFF_BIND = 40 +_IA_OFF_COMPUTE = 56 + +# AdapterId sentinel +_INVALID_ADAPTER_ID = ctypes.c_uint64(~0).value + +# AdapterComputeOptions flags (from IAdapter.h) +_OPT_FORCE_UPDATE = 1 << 0 # Force update, ignoring invalidation status +_OPT_FORCE_STATE_RECONSTRUCTION = 1 << 1 # Force full rebuild of internal accel structures +_OPT_SKIP_ISOLATED = 1 << 2 # Skip prims with connectivity degree 0 +_OPT_RIGID_BODY = 1 << 3 # Use PhysicsRigidBodyAPI tag for inverse propagation + +# Newton prims get tagged with PhysicsRigidBodyAPI at init time so +# cubric's eRigidBody mode can distinguish rigid-body buckets +# (Inverse: preserve world matrix written by Newton, derive local) +# from non-rigid-body buckets (Forward: propagate to children). +# eForceUpdate is ORed in to bypass the change-listener check. +_OPT_DEFAULT = _OPT_RIGID_BODY | _OPT_FORCE_UPDATE + +# AdapterDirtyMode +_DIRTY_ALL = 0 # eAll — dirty all prims in the stage +_DIRTY_COARSE = 1 # eCoarse — dirty all prims in visited buckets + + +# --------------------------------------------------------------------------- +# ctypes struct mirrors +# --------------------------------------------------------------------------- +class _Version(ctypes.Structure): + _fields_ = [("major", ctypes.c_uint32), ("minor", ctypes.c_uint32)] + + +class _InterfaceDesc(ctypes.Structure): + """``carb::InterfaceDesc`` — {const char* name, Version version}.""" + _fields_ = [ + ("name", ctypes.c_char_p), + ("version", _Version), + ] + + +def _read_u64(addr: int) -> int: + return ctypes.c_uint64.from_address(addr).value + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- +class CubricBindings: + """Typed wrappers around the cubric ``IAdapter`` API. + + Call :meth:`initialize` once; if it returns ``True``, the four adapter + methods are available. + """ + + def __init__(self) -> None: + self._ia_ptr: int = 0 + self._create_fn = None + self._release_fn = None + self._bind_fn = None + self._compute_fn = None + + # -- lifecycle ----------------------------------------------------------- + + def initialize(self) -> bool: + """Acquire the cubric ``IAdapter`` from the carb framework.""" + # Ensure the omni.cubric extension (native carb plugin) is loaded. + try: + import omni.kit.app + + ext_mgr = omni.kit.app.get_app().get_extension_manager() + if not ext_mgr.is_extension_enabled("omni.cubric"): + ext_mgr.set_extension_enabled_immediate("omni.cubric", True) + if not ext_mgr.is_extension_enabled("omni.cubric"): + logger.warning("Failed to enable omni.cubric extension") + return False + except Exception as exc: + logger.warning("Cannot enable omni.cubric: %s", exc) + return False + + # Get Framework* via libcarb.so acquireFramework (singleton). + try: + libcarb = ctypes.CDLL("libcarb.so") + except OSError: + logger.warning("Could not load libcarb.so") + return False + + libcarb.acquireFramework.restype = ctypes.c_void_p + libcarb.acquireFramework.argtypes = [ctypes.c_char_p, _Version] + fw_ptr = libcarb.acquireFramework(b"isaaclab.cubric", _Version(0, 0)) + if not fw_ptr: + logger.warning("acquireFramework returned null") + return False + + # Read tryAcquireInterfaceWithClient fn-ptr from Framework vtable. + try_acquire_addr = _read_u64(fw_ptr + _FW_OFF_TRY_ACQUIRE) + if try_acquire_addr == 0: + logger.warning("tryAcquireInterfaceWithClient is null in Framework") + return False + + try_acquire_fn = ctypes.CFUNCTYPE( + ctypes.c_void_p, # return: void* (IAdapter*) + ctypes.c_char_p, # clientName + _InterfaceDesc, # desc (by value) + ctypes.c_char_p, # pluginName + )(try_acquire_addr) + + desc = _InterfaceDesc( + name=b"omni::cubric::IAdapter", + version=_Version(0, 1), + ) + + # Try several acquisition strategies — the required client name + # varies across Kit configurations. + ia_ptr = try_acquire_fn(b"carb.scripting-python.plugin", desc, None) + if not ia_ptr: + ia_ptr = try_acquire_fn(None, desc, None) + if not ia_ptr: + acquire_addr = _read_u64(fw_ptr + 16) # acquireInterfaceWithClient + if acquire_addr: + acquire_fn = ctypes.CFUNCTYPE( + ctypes.c_void_p, + ctypes.c_char_p, + _InterfaceDesc, + ctypes.c_char_p, + )(acquire_addr) + ia_ptr = acquire_fn(b"isaaclab.cubric", desc, None) + if not ia_ptr: + logger.warning( + "Could not acquire omni::cubric::IAdapter — " + "cubric plugin may not be registered or interface version mismatch" + ) + return False + self._ia_ptr = ia_ptr + + # Wrap the four IAdapter function pointers we need. + create_addr = _read_u64(ia_ptr + _IA_OFF_CREATE) + release_addr = _read_u64(ia_ptr + _IA_OFF_RELEASE) + bind_addr = _read_u64(ia_ptr + _IA_OFF_BIND) + compute_addr = _read_u64(ia_ptr + _IA_OFF_COMPUTE) + + if not all([create_addr, release_addr, bind_addr, compute_addr]): + logger.warning("One or more IAdapter function pointers are null") + return False + + self._create_fn = ctypes.CFUNCTYPE( + ctypes.c_bool, ctypes.POINTER(ctypes.c_uint64), + )(create_addr) + + self._release_fn = ctypes.CFUNCTYPE( + ctypes.c_bool, ctypes.c_uint64, + )(release_addr) + + # FabricId is uint64, passed by const-ref -> pointer on x86_64 + self._bind_fn = ctypes.CFUNCTYPE( + ctypes.c_bool, ctypes.c_uint64, ctypes.POINTER(ctypes.c_uint64), + )(bind_addr) + + self._compute_fn = ctypes.CFUNCTYPE( + ctypes.c_bool, + ctypes.c_uint64, # adapterId + ctypes.c_uint32, # options (AdapterComputeOptions) + ctypes.c_int32, # dirtyMode (AdapterDirtyMode) + ctypes.c_void_p, # outAccountFlags* (nullable) + )(compute_addr) + + logger.info("cubric IAdapter bindings ready") + return True + + @property + def available(self) -> bool: + return self._ia_ptr != 0 + + # -- cubric adapter methods ---------------------------------------------- + + def create_adapter(self) -> int | None: + """Create a cubric adapter. Returns an adapter ID or ``None``.""" + if not self._create_fn: + return None + adapter_id = ctypes.c_uint64(_INVALID_ADAPTER_ID) + ok = self._create_fn(ctypes.byref(adapter_id)) + if not ok or adapter_id.value == _INVALID_ADAPTER_ID: + logger.warning("IAdapter::create failed") + return None + return adapter_id.value + + def bind_to_stage(self, adapter_id: int, fabric_id: int) -> bool: + """Bind the adapter to a Fabric stage.""" + if not self._bind_fn: + return False + fid = ctypes.c_uint64(fabric_id) + ok = self._bind_fn(adapter_id, ctypes.byref(fid)) + if not ok: + logger.warning("IAdapter::bindToStage failed (adapter=%d, fabricId=%d)", adapter_id, fabric_id) + return ok + + def compute(self, adapter_id: int) -> bool: + """Run the GPU transform-hierarchy compute pass. + + Uses ``eRigidBody | eForceUpdate`` with ``eAll`` dirty mode. + ``eRigidBody`` makes cubric apply Inverse propagation on buckets + tagged with ``PhysicsRigidBodyAPI`` (keeps Newton's world transforms, + derives local) and Forward on everything else (propagates to children). + ``eForceUpdate`` bypasses the change-listener dirty check. + """ + if not self._compute_fn: + return False + flags = ctypes.c_uint32(0) + ok = self._compute_fn(adapter_id, _OPT_DEFAULT, _DIRTY_ALL, ctypes.byref(flags)) + if not ok: + logger.warning("IAdapter::compute returned false (flags=0x%x)", flags.value) + return ok + + def release_adapter(self, adapter_id: int) -> None: + """Release an adapter.""" + if not adapter_id or not self._release_fn: + return + self._release_fn(adapter_id) diff --git a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py index a8f7d1ba786c..ca0c8b52eac1 100644 --- a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py +++ b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py @@ -7,12 +7,24 @@ from __future__ import annotations +import ctypes import inspect import logging from typing import TYPE_CHECKING import numpy as np import warp as wp + +# Load CUDA runtime for relaxed-mode graph capture (RTX-compatible). +# cudaStreamCaptureModeRelaxed (2) allows the RTX compositor's background +# CUDA stream to keep running during capture without invalidating it. +try: + _cudart = ctypes.CDLL("libcudart.so.12") +except OSError: + try: + _cudart = ctypes.CDLL("libcudart.so") + except OSError: + _cudart = None from newton import Axis, CollisionPipeline, Contacts, Control, Model, ModelBuilder, State, eval_fk from newton._src.usd.schemas import SchemaResolverNewton, SchemaResolverPhysx from newton.sensors import SensorContact as NewtonContactSensor @@ -83,12 +95,18 @@ class NewtonManager(PhysicsManager): # CUDA graphing _graph = None + _graph_capture_pending: bool = False # USD/Fabric sync _newton_stage_path = None _usdrt_stage = None _newton_index_attr = "newton:index" _clone_physics_only = False + _transforms_dirty: bool = False + + # cubric GPU transform hierarchy (replaces CPU update_world_xforms) + _cubric = None + _cubric_adapter: int | None = None # Model changes (callbacks use unified system from PhysicsManager) _model_changes: set[int] = set() @@ -139,49 +157,114 @@ def forward(cls) -> None: """Update articulation kinematics without stepping physics.""" eval_fk(cls._model, cls._state_0.joint_q, cls._state_0.joint_qd, cls._state_0, None) + @classmethod + def pre_render(cls) -> None: + """Flush deferred Fabric writes before cameras/visualizers read the scene.""" + cls.sync_transforms_to_usd() + @classmethod def sync_transforms_to_usd(cls) -> None: """Write Newton body_q to USD Fabric world matrices for Kit viewport / RTX rendering. - No-op when ``_usdrt_stage`` is None (i.e. Kit visualizer is not active). - Called by :class:`~isaaclab.sim.scene_data_providers.NewtonSceneDataProvider` at render - cadence (Kit), and after each physics step when using Newton+RTX so the renderer sees - updated poses. + No-op when ``_usdrt_stage`` is None (i.e. Kit visualizer is not active) + or when transforms have not changed since the last sync. + + Called at render cadence by :meth:`pre_render` (via + :meth:`~isaaclab.sim.SimulationContext.render`). + Physics stepping marks transforms dirty via :meth:`_mark_transforms_dirty` + so that the expensive Fabric hierarchy update only runs once per render + frame rather than after every physics step. Uses ``wp.fabricarray`` directly (no ``isaacsim.physics.newton`` extension needed). The Warp kernel reads ``state_0.body_q[newton_index[i]]`` and writes the corresponding ``mat44d`` to ``omni:fabric:worldMatrix`` for each prim. + + When cubric is available the method mirrors PhysX's ``DirectGpuHelper`` + pattern: pause Fabric change tracking, write transforms, resume tracking, + then call ``IAdapter::compute`` on the GPU to propagate the hierarchy and + notify the Fabric Scene Delegate. Otherwise it falls back to the CPU + ``update_world_xforms()`` path. """ if cls._usdrt_stage is None or cls._model is None or cls._state_0 is None: return + if not cls._transforms_dirty: + return try: import usdrt - selection = cls._usdrt_stage.SelectPrims( - require_attrs=[ - (usdrt.Sdf.ValueTypeNames.Matrix4d, "omni:fabric:worldMatrix", usdrt.Usd.Access.ReadWrite), - (usdrt.Sdf.ValueTypeNames.UInt, cls._newton_index_attr, usdrt.Usd.Access.Read), - ], - device=str(PhysicsManager._device), - ) - if selection.GetCount() == 0: - return - fabric_transforms = wp.fabricarray(selection, "omni:fabric:worldMatrix") - newton_indices = wp.fabricarray(selection, cls._newton_index_attr) - wp.launch( - _set_fabric_transforms, - dim=newton_indices.shape[0], - inputs=[fabric_transforms, newton_indices, cls._state_0.body_q], - device=PhysicsManager._device, - ) - wp.synchronize_device(PhysicsManager._device) + # Lazy adapter creation: deferred from initialize_solver() to avoid + # startup-ordering issues with the cubric plugin. + if cls._cubric is not None and cls._cubric.available and cls._cubric_adapter is None: + cls._cubric_adapter = cls._cubric.create_adapter() + if cls._cubric_adapter is not None: + logger.info("cubric GPU transform hierarchy enabled") + else: + logger.warning("cubric adapter creation failed; falling back to update_world_xforms()") + cls._cubric = None + + use_cubric = cls._cubric is not None and cls._cubric_adapter is not None + + fabric_hierarchy = None if hasattr(usdrt, "hierarchy"): fabric_hierarchy = usdrt.hierarchy.IFabricHierarchy().get_fabric_hierarchy( cls._usdrt_stage.GetFabricId(), cls._usdrt_stage.GetStageIdAsStageId() ) - fabric_hierarchy.update_world_xforms() - except Exception as exc: - logger.debug("[NewtonManager] sync_transforms_to_usd: %s", exc) + + # Pause hierarchy change tracking BEFORE SelectPrims. + # SelectPrims with ReadWrite access calls getAttributeArrayGpu + # internally, which marks Fabric buffers dirty. If tracking is + # still active at that point the hierarchy records the change and + # Kit's updateWorldXforms will do an expensive connectivity + # rebuild every frame. PhysX avoids this via ScopedUSDRT which + # pauses tracking before any Fabric writes. + if use_cubric and fabric_hierarchy is not None: + fabric_hierarchy.track_world_xform_changes(False) + fabric_hierarchy.track_local_xform_changes(False) + + try: + selection = cls._usdrt_stage.SelectPrims( + require_attrs=[ + (usdrt.Sdf.ValueTypeNames.Matrix4d, "omni:fabric:worldMatrix", usdrt.Usd.Access.ReadWrite), + (usdrt.Sdf.ValueTypeNames.UInt, cls._newton_index_attr, usdrt.Usd.Access.Read), + ], + device=str(PhysicsManager._device), + ) + if selection.GetCount() == 0: + return + + fabric_transforms = wp.fabricarray(selection, "omni:fabric:worldMatrix") + newton_indices = wp.fabricarray(selection, cls._newton_index_attr) + wp.launch( + _set_fabric_transforms, + dim=newton_indices.shape[0], + inputs=[fabric_transforms, newton_indices, cls._state_0.body_q], + device=PhysicsManager._device, + ) + wp.synchronize_device(PhysicsManager._device) + + cls._transforms_dirty = False + + if use_cubric and fabric_hierarchy is not None: + fabric_id = cls._usdrt_stage.GetFabricId().id + cls._cubric.bind_to_stage(cls._cubric_adapter, fabric_id) + cls._cubric.compute(cls._cubric_adapter) + elif fabric_hierarchy is not None: + fabric_hierarchy.update_world_xforms() + finally: + if use_cubric and fabric_hierarchy is not None: + fabric_hierarchy.track_world_xform_changes(True) + fabric_hierarchy.track_local_xform_changes(True) + except Exception: + logger.exception("[NewtonManager] sync_transforms_to_usd FAILED") + + @classmethod + def _mark_transforms_dirty(cls) -> None: + """Flag that physics state has changed and Fabric needs re-sync. + + Called by :meth:`_simulate` after stepping. The actual sync is deferred + to :meth:`sync_transforms_to_usd`, which runs at render cadence. + """ + cls._transforms_dirty = True @classmethod def step(cls) -> None: @@ -197,12 +280,26 @@ def step(cls) -> None: cls._solver.notify_model_changed(change) cls._model_changes = set() - # Step simulation (graphed or not; _graph is None when RTX/Fabric sync is active or on CPU) + # Lazy CUDA graph capture: deferred from initialize_solver() when RTX was active. + # By the time step() is first called, RTX has fully initialized (all cudaImportExternalMemory + # calls are done) and is idle between render frames — giving us a clean capture window. cfg = PhysicsManager._cfg - if cfg is not None and cfg.use_cuda_graph and cls._graph is not None and "cuda" in PhysicsManager._device: # type: ignore[union-attr] + device = PhysicsManager._device + if cls._graph_capture_pending and cfg is not None and cfg.use_cuda_graph and "cuda" in device: # type: ignore[union-attr] + cls._graph_capture_pending = False + cls._graph = cls._capture_relaxed_graph(device) + if cls._graph is not None: + logger.info("Newton CUDA graph captured (deferred relaxed mode, RTX-compatible)") + else: + logger.warning("Newton deferred CUDA graph capture failed; using eager execution") + + # Step simulation (graphed or not; _graph is None when capture is disabled or failed) + if cfg is not None and cfg.use_cuda_graph and cls._graph is not None and "cuda" in device: # type: ignore[union-attr] wp.capture_launch(cls._graph) + if cls._usdrt_stage is not None: + cls._mark_transforms_dirty() else: - with wp.ScopedDevice(PhysicsManager._device): + with wp.ScopedDevice(device): cls._simulate() # Debug convergence info @@ -239,6 +336,10 @@ def is_fabric_enabled(cls) -> bool: @classmethod def clear(cls): """Clear all Newton-specific state (callbacks cleared by super().close()).""" + if cls._cubric is not None and cls._cubric_adapter is not None: + cls._cubric.release_adapter(cls._cubric_adapter) + cls._cubric = None + cls._cubric_adapter = None cls._builder = None cls._model = None cls._solver = None @@ -252,8 +353,10 @@ def clear(cls): cls._newton_contact_sensors = {} cls._report_contacts = False cls._graph = None + cls._graph_capture_pending = False cls._newton_stage_path = None cls._usdrt_stage = None + cls._transforms_dirty = False cls._up_axis = "Z" cls._model_changes = set() cls._views = [] @@ -314,10 +417,15 @@ def start_simulation(cls) -> None: prim = cls._usdrt_stage.GetPrimAtPath(prim_path) prim.CreateAttribute(cls._newton_index_attr, usdrt.Sdf.ValueTypeNames.UInt, True) prim.GetAttribute(cls._newton_index_attr).Set(i) + # Tag with PhysicsRigidBodyAPI so cubric's eRigidBody mode + # applies Inverse propagation (preserves Newton's world + # transforms and derives local) instead of Forward. + prim.AddAppliedSchema("PhysicsRigidBodyAPI") xformable_prim = usdrt.Rt.Xformable(prim) if not xformable_prim.HasWorldXform(): xformable_prim.SetWorldXformFromUsd() + cls._mark_transforms_dirty() cls.sync_transforms_to_usd() @classmethod @@ -477,25 +585,169 @@ def initialize_solver(cls) -> None: # Initialize contacts and collision pipeline cls._initialize_contacts() + # Prepare cubric ctypes bindings (acquires IAdapter from carb framework). + # Adapter creation is deferred to the first sync_transforms_to_usd() call + # at render time to avoid any startup-ordering issues with the cubric + # plugin initialisation. + if cls._usdrt_stage is not None: + from isaaclab_newton.physics._cubric import CubricBindings + + cls._cubric = CubricBindings() + if cls._cubric.initialize(): + logger.info("cubric bindings ready (adapter deferred to first render)") + else: + logger.warning("cubric bindings init failed; falling back to update_world_xforms()") + cls._cubric = None + device = PhysicsManager._device - # Skip CUDA graph when syncing to USD/Fabric for RTX: capture conflicts with RTX/Replicator - # using the legacy stream (cudaErrorStreamCaptureImplicit). - use_cuda_graph = cfg.use_cuda_graph and (cls._usdrt_stage is None) # type: ignore[union-attr] + use_cuda_graph = cfg.use_cuda_graph and "cuda" in device # type: ignore[union-attr] with Timer(name="newton_cuda_graph", msg="CUDA graph took:"): - if use_cuda_graph and "cuda" in device: - with wp.ScopedCapture() as capture: - cls._simulate() - cls._graph = capture.graph + if use_cuda_graph: + if cls._usdrt_stage is None: + # No RTX active — use standard Warp capture (cudaStreamCaptureModeGlobal). + with wp.ScopedCapture() as capture: + cls._simulate() + cls._graph = capture.graph + logger.info("Newton CUDA graph captured (standard Warp mode)") + else: + # RTX is active during initialization — cudaImportExternalMemory and other + # non-capturable RTX ops run on background CUDA streams right now. + # Defer capture to the first step() call, after RTX is fully initialized + # and idle between render frames (clean capture window). + cls._graph = None + cls._graph_capture_pending = True + logger.info("Newton CUDA graph capture deferred until first step() (RTX active)") else: cls._graph = None @classmethod - def _simulate(cls) -> None: - """Run one simulation step with substeps.""" + def _capture_relaxed_graph(cls, device: str): + """Capture Newton physics (only) as a CUDA graph, RTX-compatible. + + Uses a hybrid approach to work around two conflicting requirements: + + 1. RTX background threads use CUDA's legacy stream (stream 0) for async operations + like ``cudaImportExternalMemory``. A standard ``wp.ScopedCapture()`` uses + ``cudaStreamCaptureModeThreadLocal`` on Warp's default stream (a blocking stream). + A blocking stream synchronises implicitly with legacy stream 0, so RTX ops inside + the capture window fail with error 906. + + 2. ``mujoco_warp`` calls ``wp.capture_while`` inside ``solver.solve()``. + ``wp.capture_while`` checks ``device.captures`` (populated by ``wp.capture_begin``) + to decide whether to insert a conditional graph node (graph-capture path) or to run + eagerly with ``wp.synchronize_stream`` (non-capture path). Without an entry in + ``device.captures``, it synchronises the capturing stream — which raises "Cannot + synchronize stream while graph capture is active". + + Solution: + + - Create a **non-blocking** stream (``cudaStreamNonBlocking = 0x01``): no implicit sync + with legacy stream 0, so RTX background threads are unaffected (avoids error 906). + - Start the capture externally via ``cudaStreamBeginCapture`` with + ``cudaStreamCaptureModeRelaxed`` so no other CUDA activity is disrupted. + - Call ``wp.capture_begin(external=True, stream=fresh_stream)``: + this registers the capture in Warp's ``device.captures`` *without* calling + ``cudaStreamBeginCapture`` (already done) and *without* changing device-wide memory + pool attributes (avoids error 900 in RTX's ``cudaMallocAsync``). + - Run ``_simulate_physics_only()`` inside ``ScopedStream(fresh_stream)``: + kernels dispatch to ``fresh_stream`` and are captured; ``wp.capture_while`` finds the + active capture and inserts a conditional graph node instead of synchronising. + - Call ``wp.capture_end(stream=fresh_stream)`` to finalise the Warp-level capture. + - Call ``cudaStreamEndCapture`` to close the CUDA stream capture and get the graph. + + Warmup run pre-allocates all MuJoCo-Warp scratch buffers so no ``cudaMalloc`` occurs during + capture. ``sync_transforms_to_usd`` (which calls ``wp.synchronize_device``) is + excluded from the capture and runs eagerly in ``step()`` after ``wp.capture_launch``. + + Returns a ``wp.Graph`` on success, or ``None`` on failure. + """ + if _cudart is None: + logger.warning("libcudart not available; cannot use relaxed graph capture") + return None + + # Warmup: pre-allocate all MuJoCo-Warp scratch buffers so the capture window has + # no new cudaMalloc calls (which are forbidden inside graph capture). + with wp.ScopedDevice(device): + cls._simulate_physics_only() + wp.synchronize_stream(wp.get_stream(device)) + + # Create a non-blocking stream (cudaStreamNonBlocking = 0x01). + raw_handle = ctypes.c_void_p() + ret = _cudart.cudaStreamCreateWithFlags(ctypes.byref(raw_handle), ctypes.c_uint(0x01)) + if ret != 0: + logger.warning("cudaStreamCreateWithFlags(NonBlocking) failed (code %d)", ret) + return None + fresh_handle = raw_handle.value + fresh_stream = wp.Stream(device, cuda_stream=fresh_handle, owner=False) + + # Start capture in relaxed mode BEFORE entering ScopedStream. + ret = _cudart.cudaStreamBeginCapture(ctypes.c_void_p(fresh_handle), ctypes.c_int(2)) + if ret != 0: + _cudart.cudaStreamDestroy(ctypes.c_void_p(fresh_handle)) + logger.warning("cudaStreamBeginCapture(relaxed) failed (code %d)", ret) + return None + + try: + wp.capture_begin(stream=fresh_stream, external=True) + except Exception as exc: + raw_graph = ctypes.c_void_p() + _cudart.cudaStreamEndCapture(ctypes.c_void_p(fresh_handle), ctypes.byref(raw_graph)) + if raw_graph.value: + _cudart.cudaGraphDestroy(raw_graph) + _cudart.cudaStreamDestroy(ctypes.c_void_p(fresh_handle)) + logger.warning("wp.capture_begin(external=True) failed: %s", exc) + return None + + err_during_capture = None + with wp.ScopedStream(fresh_stream, sync_enter=False): + try: + cls._simulate_physics_only() + except Exception as exc: + err_during_capture = exc + + if err_during_capture is None: + try: + graph = wp.capture_end(stream=fresh_stream) + except Exception as exc: + err_during_capture = exc + graph = None + else: + try: + wp.capture_end(stream=fresh_stream) + except Exception: + pass + graph = None + + raw_graph = ctypes.c_void_p() + end_ret = _cudart.cudaStreamEndCapture(ctypes.c_void_p(fresh_handle), ctypes.byref(raw_graph)) + _cudart.cudaStreamDestroy(ctypes.c_void_p(fresh_handle)) + + if err_during_capture is not None: + if raw_graph.value: + _cudart.cudaGraphDestroy(raw_graph) + logger.warning("Newton graph capture aborted during simulate: %s", err_during_capture) + return None + + if end_ret != 0 or not raw_graph.value: + logger.warning("cudaStreamEndCapture failed (code %d)", end_ret) + return None - # MJWarp can use its internal collision pipeline. + graph.graph = raw_graph + graph.graph_exec = None + return graph + + @classmethod + def _simulate_physics_only(cls) -> None: + """Run one physics step without Fabric/USD sync — safe for CUDA graph capture. + + Used by :meth:`_capture_relaxed_graph` to capture only the pure physics kernels. + ``sync_transforms_to_usd`` is excluded because it calls ``wp.synchronize_device`` + (forbidden inside graph capture) and ``wp.fabricarray`` (device-wide allocation). + The caller (``step()``) is responsible for calling ``sync_transforms_to_usd()`` + eagerly after ``wp.capture_launch``. + """ if cls._needs_collision_pipeline: cls._collision_pipeline.collide(cls._state_0, cls._contacts) contacts = cls._contacts @@ -512,7 +764,6 @@ def step_fn(state_0, state_1): else: cfg = PhysicsManager._cfg need_copy_on_last_substep = (cfg is not None and cfg.use_cuda_graph) and cls._num_substeps % 2 == 1 # type: ignore[union-attr] - for i in range(cls._num_substeps): step_fn(cls._state_0, cls._state_1) if need_copy_on_last_substep and i == cls._num_substeps - 1: @@ -521,18 +772,23 @@ def step_fn(state_0, state_1): cls._state_0, cls._state_1 = cls._state_1, cls._state_0 cls._state_0.clear_forces() - # Populate contacts for contact sensors if cls._report_contacts: - # For newton_contacts (unified pipeline): use locally computed contacts - # For mujoco_contacts: use class-level _contacts, solver populates it from MuJoCo data eval_contacts = contacts if contacts is not None else cls._contacts cls._solver.update_contacts(eval_contacts, cls._state_0) for sensor in cls._newton_contact_sensors.values(): sensor.update(cls._state_0, eval_contacts) - # Sync Newton state to USD/Fabric for RTX rendering (e.g., Newton Physics + RTX Renderer preset) + @classmethod + def _simulate(cls) -> None: + """Run one simulation step with substeps and USD sync. + + Delegates physics work to :meth:`_simulate_physics_only` and then + marks transforms dirty for the next render-cadence sync. + """ + cls._simulate_physics_only() + if cls._usdrt_stage is not None: - cls.sync_transforms_to_usd() + cls._mark_transforms_dirty() @classmethod def get_solver_convergence_steps(cls) -> dict[str, float | int]: