From c3194ba0ffb574f23a667de5626dc2e4526c18cc Mon Sep 17 00:00:00 2001
From: Nathan Cournia <ncournia@nvidia.com>
Date: Wed, 11 Mar 2026 14:34:40 -0700
Subject: [PATCH 1/8] Lazy update fabric xforms.

---
 .../isaaclab_newton/physics/newton_manager.py | 31 +++++++++++++++----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
index a8f7d1ba786c..324a16db627e 100644
--- a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
+++ b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
@@ -89,6 +89,7 @@ class NewtonManager(PhysicsManager):
     _usdrt_stage = None
     _newton_index_attr = "newton:index"
     _clone_physics_only = False
+    _transforms_dirty: bool = False
 
     # Model changes (callbacks use unified system from PhysicsManager)
     _model_changes: set[int] = set()
@@ -143,10 +144,14 @@ def forward(cls) -> None:
     def sync_transforms_to_usd(cls) -> None:
         """Write Newton body_q to USD Fabric world matrices for Kit viewport / RTX rendering.
 
-        No-op when ``_usdrt_stage`` is None (i.e. Kit visualizer is not active).
-        Called by :class:`~isaaclab.sim.scene_data_providers.NewtonSceneDataProvider` at render
-        cadence (Kit), and after each physics step when using Newton+RTX so the renderer sees
-        updated poses.
+        No-op when ``_usdrt_stage`` is None (i.e. Kit visualizer is not active)
+        or when transforms have not changed since the last sync.
+
+        This method is called at render cadence by
+        :class:`~isaaclab.sim.scene_data_providers.NewtonSceneDataProvider`.
+        Physics stepping marks transforms dirty via :meth:`_mark_transforms_dirty`
+        so that the expensive Fabric hierarchy update only runs once per render
+        frame rather than after every physics step.
 
         Uses ``wp.fabricarray`` directly (no ``isaacsim.physics.newton`` extension needed).
         The Warp kernel reads ``state_0.body_q[newton_index[i]]`` and writes the
@@ -154,6 +159,8 @@ def sync_transforms_to_usd(cls) -> None:
         """
         if cls._usdrt_stage is None or cls._model is None or cls._state_0 is None:
             return
+        if not cls._transforms_dirty:
+            return
         try:
             import usdrt
 
@@ -180,9 +187,19 @@ def sync_transforms_to_usd(cls) -> None:
                     cls._usdrt_stage.GetFabricId(), cls._usdrt_stage.GetStageIdAsStageId()
                 )
                 fabric_hierarchy.update_world_xforms()
+            cls._transforms_dirty = False
         except Exception as exc:
             logger.debug("[NewtonManager] sync_transforms_to_usd: %s", exc)
 
+    @classmethod
+    def _mark_transforms_dirty(cls) -> None:
+        """Flag that physics state has changed and Fabric needs re-sync.
+
+        Called by :meth:`_simulate` after stepping. The actual sync is deferred
+        to :meth:`sync_transforms_to_usd`, which runs at render cadence.
+        """
+        cls._transforms_dirty = True
+
     @classmethod
     def step(cls) -> None:
         """Step the physics simulation."""
@@ -254,6 +271,7 @@ def clear(cls):
         cls._graph = None
         cls._newton_stage_path = None
         cls._usdrt_stage = None
+        cls._transforms_dirty = False
         cls._up_axis = "Z"
         cls._model_changes = set()
         cls._views = []
@@ -318,6 +336,7 @@ def start_simulation(cls) -> None:
                 if not xformable_prim.HasWorldXform():
                     xformable_prim.SetWorldXformFromUsd()
 
+            cls._mark_transforms_dirty()
             cls.sync_transforms_to_usd()
 
     @classmethod
@@ -530,9 +549,9 @@ def step_fn(state_0, state_1):
             for sensor in cls._newton_contact_sensors.values():
                 sensor.update(cls._state_0, eval_contacts)
 
-        # Sync Newton state to USD/Fabric for RTX rendering (e.g., Newton Physics + RTX Renderer preset)
+        # Mark transforms dirty so the next render-cadence sync picks them up.
         if cls._usdrt_stage is not None:
-            cls.sync_transforms_to_usd()
+            cls._mark_transforms_dirty()
 
     @classmethod
     def get_solver_convergence_steps(cls) -> dict[str, float | int]:

From 181be179811195037b47f24d7822e053a3e330fa Mon Sep 17 00:00:00 2001
From: Nathan Cournia <ncournia@nvidia.com>
Date: Wed, 11 Mar 2026 16:10:56 -0700
Subject: [PATCH 2/8] Capture Newton CUDA graph in relaxed mode when RTX is active

---
 .../isaaclab_newton/physics/newton_manager.py | 215 +++++++++++++++++-
 1 file changed, 205 insertions(+), 10 deletions(-)

diff --git a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
index 324a16db627e..fecb8d125fd6 100644
--- a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
+++ b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
@@ -7,12 +7,24 @@
 
 from __future__ import annotations
 
+import ctypes
 import inspect
 import logging
 from typing import TYPE_CHECKING
 
 import numpy as np
 import warp as wp
+
+# Load CUDA runtime for relaxed-mode graph capture (RTX-compatible).
+# cudaStreamCaptureModeRelaxed (2) allows the RTX compositor's background
+# CUDA stream to keep running during capture without invalidating it.
+try:
+    _cudart = ctypes.CDLL("libcudart.so.12")
+except OSError:
+    try:
+        _cudart = ctypes.CDLL("libcudart.so")
+    except OSError:
+        _cudart = None
 from newton import Axis, CollisionPipeline, Contacts, Control, Model, ModelBuilder, State, eval_fk
 from newton._src.usd.schemas import SchemaResolverNewton, SchemaResolverPhysx
 from newton.sensors import SensorContact as NewtonContactSensor
@@ -83,6 +95,7 @@ class NewtonManager(PhysicsManager):
 
     # CUDA graphing
     _graph = None
+    _graph_capture_pending: bool = False
 
     # USD/Fabric sync
     _newton_stage_path = None
@@ -214,12 +227,28 @@ def step(cls) -> None:
                     cls._solver.notify_model_changed(change)
                 cls._model_changes = set()
 
-        # Step simulation (graphed or not; _graph is None when RTX/Fabric sync is active or on CPU)
+        # Lazy CUDA graph capture: deferred from initialize_solver() when RTX was active.
+        # By the time step() is first called, RTX has fully initialized (all cudaImportExternalMemory
+        # calls are done) and is idle between render frames — giving us a clean capture window.
         cfg = PhysicsManager._cfg
-        if cfg is not None and cfg.use_cuda_graph and cls._graph is not None and "cuda" in PhysicsManager._device:  # type: ignore[union-attr]
+        device = PhysicsManager._device
+        if cls._graph_capture_pending and cfg is not None and cfg.use_cuda_graph and "cuda" in device:  # type: ignore[union-attr]
+            cls._graph_capture_pending = False
+            cls._graph = cls._capture_relaxed_graph(device)
+            if cls._graph is not None:
+                logger.info("Newton CUDA graph captured (deferred relaxed mode, RTX-compatible)")
+            else:
+                logger.warning("Newton deferred CUDA graph capture failed; using eager execution")
+
+        # Step simulation (graphed or not; _graph is None when capture is disabled or failed)
+        if cfg is not None and cfg.use_cuda_graph and cls._graph is not None and "cuda" in device:  # type: ignore[union-attr]
             wp.capture_launch(cls._graph)
+            # Fabric sync is excluded from the graph (wp.synchronize_device is non-capturable),
+            # so run it eagerly here after the graph replay.
+            if cls._usdrt_stage is not None:
+                cls.sync_transforms_to_usd()
         else:
-            with wp.ScopedDevice(PhysicsManager._device):
+            with wp.ScopedDevice(device):
                 cls._simulate()
 
         # Debug convergence info
@@ -269,6 +298,7 @@ def clear(cls):
         cls._newton_contact_sensors = {}
         cls._report_contacts = False
         cls._graph = None
+        cls._graph_capture_pending = False
         cls._newton_stage_path = None
         cls._usdrt_stage = None
         cls._transforms_dirty = False
@@ -498,18 +528,183 @@ def initialize_solver(cls) -> None:
 
         device = PhysicsManager._device
 
-        # Skip CUDA graph when syncing to USD/Fabric for RTX: capture conflicts with RTX/Replicator
-        # using the legacy stream (cudaErrorStreamCaptureImplicit).
-        use_cuda_graph = cfg.use_cuda_graph and (cls._usdrt_stage is None)  # type: ignore[union-attr]
+        use_cuda_graph = cfg.use_cuda_graph and "cuda" in device  # type: ignore[union-attr]
 
         with Timer(name="newton_cuda_graph", msg="CUDA graph took:"):
-            if use_cuda_graph and "cuda" in device:
-                with wp.ScopedCapture() as capture:
-                    cls._simulate()
-                cls._graph = capture.graph
+            if use_cuda_graph:
+                if cls._usdrt_stage is None:
+                    # No RTX active — use standard Warp capture (cudaStreamCaptureModeGlobal).
+                    with wp.ScopedCapture() as capture:
+                        cls._simulate()
+                    cls._graph = capture.graph
+                    logger.info("Newton CUDA graph captured (standard Warp mode)")
+                else:
+                    # RTX is active during initialization — cudaImportExternalMemory and other
+                    # non-capturable RTX ops run on background CUDA streams right now.
+                    # Defer capture to the first step() call, after RTX is fully initialized
+                    # and idle between render frames (clean capture window).
+                    cls._graph = None
+                    cls._graph_capture_pending = True
+                    logger.info("Newton CUDA graph capture deferred until first step() (RTX active)")
             else:
                 cls._graph = None
 
+    @classmethod
+    def _capture_relaxed_graph(cls, device: str):
+        """Capture Newton physics (only) as a CUDA graph, RTX-compatible.
+
+        Uses a hybrid approach to work around two conflicting requirements:
+
+        1. RTX background threads use CUDA's legacy stream (stream 0) for async operations
+           like ``cudaImportExternalMemory``.  A standard ``wp.ScopedCapture()`` uses
+           ``cudaStreamCaptureModeThreadLocal`` on Warp's default stream (a blocking stream).
+           A blocking stream synchronises implicitly with legacy stream 0, so RTX ops inside
+           the capture window fail with error 906.
+
+        2. ``mujoco_warp`` calls ``wp.capture_while`` inside ``solver.solve()``.
+           ``wp.capture_while`` checks ``device.captures`` (populated by ``wp.capture_begin``)
+           to decide whether to insert a conditional graph node (graph-capture path) or to run
+           eagerly with ``wp.synchronize_stream`` (non-capture path).  Without an entry in
+           ``device.captures``, it synchronises the capturing stream — which raises "Cannot
+           synchronize stream while graph capture is active".
+
+        Solution:
+
+        - Create a **non-blocking** stream (``cudaStreamNonBlocking = 0x01``): no implicit sync
+          with legacy stream 0, so RTX background threads are unaffected (avoids error 906).
+        - Start the capture externally via ``cudaStreamBeginCapture`` with
+          ``cudaStreamCaptureModeRelaxed`` so no other CUDA activity is disrupted.
+        - Call ``wp.capture_begin(external=True, stream=fresh_stream)``:
+          this registers the capture in Warp's ``device.captures`` *without* calling
+          ``cudaStreamBeginCapture`` (already done) and *without* changing device-wide memory
+          pool attributes (avoids error 900 in RTX's ``cudaMallocAsync``).
+        - Run ``_simulate_physics_only()`` inside ``ScopedStream(fresh_stream)``:
+          kernels dispatch to ``fresh_stream`` and are captured; ``wp.capture_while`` finds the
+          active capture and inserts a conditional graph node instead of synchronising.
+        - Call ``wp.capture_end(stream=fresh_stream)`` to finalise the Warp-level capture.
+        - Call ``cudaStreamEndCapture`` to close the CUDA stream capture and get the graph.
+
+        Warmup run pre-allocates all MuJoCo-Warp scratch buffers so no ``cudaMalloc`` occurs during
+        capture.  ``sync_transforms_to_usd`` (which calls ``wp.synchronize_device``) is
+        excluded from the capture and runs eagerly in ``step()`` after ``wp.capture_launch``.
+
+        Returns a ``wp.Graph`` on success, or ``None`` on failure.
+        """
+        if _cudart is None:
+            logger.warning("libcudart not available; cannot use relaxed graph capture")
+            return None
+
+        # Warmup: pre-allocate all MuJoCo-Warp scratch buffers so the capture window has
+        # no new cudaMalloc calls (which are forbidden inside graph capture).
+        with wp.ScopedDevice(device):
+            cls._simulate_physics_only()
+        wp.synchronize_stream(wp.get_stream(device))
+
+        # Create a non-blocking stream (cudaStreamNonBlocking = 0x01).
+        raw_handle = ctypes.c_void_p()
+        ret = _cudart.cudaStreamCreateWithFlags(ctypes.byref(raw_handle), ctypes.c_uint(0x01))
+        if ret != 0:
+            logger.warning("cudaStreamCreateWithFlags(NonBlocking) failed (code %d)", ret)
+            return None
+        fresh_handle = raw_handle.value
+        fresh_stream = wp.Stream(device, cuda_stream=fresh_handle, owner=False)
+
+        # Start capture in relaxed mode BEFORE entering ScopedStream.
+        ret = _cudart.cudaStreamBeginCapture(ctypes.c_void_p(fresh_handle), ctypes.c_int(2))
+        if ret != 0:
+            _cudart.cudaStreamDestroy(ctypes.c_void_p(fresh_handle))
+            logger.warning("cudaStreamBeginCapture(relaxed) failed (code %d)", ret)
+            return None
+
+        try:
+            wp.capture_begin(stream=fresh_stream, external=True)
+        except Exception as exc:
+            raw_graph = ctypes.c_void_p()
+            _cudart.cudaStreamEndCapture(ctypes.c_void_p(fresh_handle), ctypes.byref(raw_graph))
+            if raw_graph.value:
+                _cudart.cudaGraphDestroy(raw_graph)
+            _cudart.cudaStreamDestroy(ctypes.c_void_p(fresh_handle))
+            logger.warning("wp.capture_begin(external=True) failed: %s", exc)
+            return None
+
+        err_during_capture = None
+        with wp.ScopedStream(fresh_stream, sync_enter=False):
+            try:
+                cls._simulate_physics_only()
+            except Exception as exc:
+                err_during_capture = exc
+
+        if err_during_capture is None:
+            try:
+                graph = wp.capture_end(stream=fresh_stream)
+            except Exception as exc:
+                err_during_capture = exc
+                graph = None
+        else:
+            try:
+                wp.capture_end(stream=fresh_stream)
+            except Exception:
+                pass
+            graph = None
+
+        raw_graph = ctypes.c_void_p()
+        end_ret = _cudart.cudaStreamEndCapture(ctypes.c_void_p(fresh_handle), ctypes.byref(raw_graph))
+        _cudart.cudaStreamDestroy(ctypes.c_void_p(fresh_handle))
+
+        if err_during_capture is not None:
+            if raw_graph.value:
+                _cudart.cudaGraphDestroy(raw_graph)
+            logger.warning("Newton graph capture aborted during simulate: %s", err_during_capture)
+            return None
+
+        if end_ret != 0 or not raw_graph.value:
+            logger.warning("cudaStreamEndCapture failed (code %d)", end_ret)
+            return None
+
+        graph.graph = raw_graph
+        graph.graph_exec = None
+        return graph
+
+    @classmethod
+    def _simulate_physics_only(cls) -> None:
+        """Run one physics step without Fabric/USD sync — safe for CUDA graph capture.
+
+        Used by :meth:`_capture_relaxed_graph` to capture only the pure physics kernels.
+        ``sync_transforms_to_usd`` is excluded because it calls ``wp.synchronize_device``
+        (forbidden inside graph capture) and ``wp.fabricarray`` (device-wide allocation).
+        The caller (``step()``) is responsible for calling ``sync_transforms_to_usd()``
+        eagerly after ``wp.capture_launch``.
+        """
+        if cls._needs_collision_pipeline:
+            cls._collision_pipeline.collide(cls._state_0, cls._contacts)
+            contacts = cls._contacts
+        else:
+            contacts = None
+
+        def step_fn(state_0, state_1):
+            cls._solver.step(state_0, state_1, cls._control, contacts, cls._solver_dt)
+
+        if cls._use_single_state:
+            for i in range(cls._num_substeps):
+                step_fn(cls._state_0, cls._state_0)
+                cls._state_0.clear_forces()
+        else:
+            cfg = PhysicsManager._cfg
+            need_copy_on_last_substep = (cfg is not None and cfg.use_cuda_graph) and cls._num_substeps % 2 == 1  # type: ignore[union-attr]
+            for i in range(cls._num_substeps):
+                step_fn(cls._state_0, cls._state_1)
+                if need_copy_on_last_substep and i == cls._num_substeps - 1:
+                    cls._state_0.assign(cls._state_1)
+                else:
+                    cls._state_0, cls._state_1 = cls._state_1, cls._state_0
+                cls._state_0.clear_forces()
+
+        if cls._report_contacts:
+            eval_contacts = contacts if contacts is not None else cls._contacts
+            cls._solver.update_contacts(eval_contacts, cls._state_0)
+            for sensor in cls._newton_contact_sensors.values():
+                sensor.update(cls._state_0, eval_contacts)
+
     @classmethod
     def _simulate(cls) -> None:
         """Run one simulation step with substeps."""

From a3d8e00a2365d39dde025c55f28629292773d046 Mon Sep 17 00:00:00 2001
From: Nathan Cournia <ncournia@nvidia.com>
Date: Wed, 11 Mar 2026 16:11:34 -0700
Subject: [PATCH 3/8] Dump tiled RTX render output to PNG for debugging

---
 .../renderers/isaac_rtx_renderer.py           | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py b/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py
index 22b07f13def0..ffba9463d86a 100644
--- a/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py
+++ b/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py
@@ -8,7 +8,9 @@
 from __future__ import annotations
 
 import json
+import logging
 import math
+import os
 import weakref
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
@@ -48,6 +50,12 @@ class IsaacRtxRenderData:
     sensor: SensorBase | None = None
 
 
+_logger = logging.getLogger(__name__)
+
+_TILED_DUMP_DIR = "/tmp/il-rtx"
+_tiled_frame_counter = 0
+
+
 class IsaacRtxRenderer(BaseRenderer):
     """Isaac RTX backend using Omniverse Replicator for tiled camera rendering.
 
@@ -248,6 +256,10 @@ def tiling_grid_shape():
             if data_type in SIMPLE_SHADING_MODES:
                 tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous()
 
+            # Dump the tiled image to disk (first visual data type per frame only)
+            if tiled_data_buffer.dtype == wp.uint8:
+                self._save_tiled_image(tiled_data_buffer)
+
             wp.launch(
                 kernel=reshape_tiled_image,
                 dim=(view_count, cfg.height, cfg.width),
@@ -280,6 +292,20 @@ def tiling_grid_shape():
                     0.0 if cfg.depth_clipping_behavior == "zero" else cfg.spawn.clipping_range[1]
                 )
 
+    def _save_tiled_image(self, tiled_data: wp.array):
+        """Save the raw tiled render product to disk as a PNG."""
+        global _tiled_frame_counter
+        os.makedirs(_TILED_DUMP_DIR, exist_ok=True)
+        path = os.path.join(_TILED_DUMP_DIR, f"frame_{_tiled_frame_counter:06d}.png")
+        _tiled_frame_counter += 1
+        try:
+            from PIL import Image
+
+            img_np = wp.to_torch(tiled_data).cpu().numpy()
+            Image.fromarray(img_np).save(path)
+        except Exception:
+            _logger.warning("Failed to save tiled image to %s", path, exc_info=True)
+
     def write_output(self, render_data: IsaacRtxRenderData, output_name: str, output_data: torch.Tensor):
         """No-op for Isaac RTX - all outputs written in render().
         See :meth:`~isaaclab.renderers.base_renderer.BaseRenderer.write_output`."""

From 5c0145aa50219af8f613b5df9afe73763f7d77fe Mon Sep 17 00:00:00 2001
From: Nathan Cournia <ncournia@nvidia.com>
Date: Thu, 12 Mar 2026 10:22:59 -0700
Subject: [PATCH 4/8] Add pre_render hook to flush deferred Fabric transforms

---
 source/isaaclab/isaaclab/physics/physics_manager.py | 11 +++++++++++
 source/isaaclab/isaaclab/sim/simulation_context.py  |  1 +
 .../isaaclab_newton/physics/newton_manager.py       | 13 ++++++++-----
 .../isaaclab_physx/renderers/isaac_rtx_renderer.py  |  4 ++--
 4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/source/isaaclab/isaaclab/physics/physics_manager.py b/source/isaaclab/isaaclab/physics/physics_manager.py
index 5b94ede481d5..5676fa8ef02b 100644
--- a/source/isaaclab/isaaclab/physics/physics_manager.py
+++ b/source/isaaclab/isaaclab/physics/physics_manager.py
@@ -265,6 +265,17 @@ def step(cls) -> None:
         """Step physics simulation by one timestep (physics only, no rendering)."""
         pass
 
+    @classmethod
+    def pre_render(cls) -> None:
+        """Sync deferred physics state to the rendering backend.
+
+        Called by :meth:`~isaaclab.sim.SimulationContext.render` before cameras
+        and visualizers read scene data. The default implementation is a no-op.
+        Backends that defer transform writes (e.g. Newton's dirty-flag pattern)
+        should override this to flush pending updates.
+        """
+        pass
+
     @classmethod
     def close(cls) -> None:
         """Clean up physics resources.
diff --git a/source/isaaclab/isaaclab/sim/simulation_context.py b/source/isaaclab/isaaclab/sim/simulation_context.py
index 1d8551137f93..bb1daf6b0e5d 100644
--- a/source/isaaclab/isaaclab/sim/simulation_context.py
+++ b/source/isaaclab/isaaclab/sim/simulation_context.py
@@ -620,6 +620,7 @@ def render(self, mode: int | None = None) -> None:
         every physics step). Camera sensors drive their configured renderer when
         fetching data, so this method remains backend-agnostic.
         """
+        self.physics_manager.pre_render()
         self.update_visualizers(self.get_rendering_dt())
 
         # Call render callbacks
diff --git a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
index fecb8d125fd6..cd9be10c49a1 100644
--- a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
+++ b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
@@ -153,6 +153,11 @@ def forward(cls) -> None:
         """Update articulation kinematics without stepping physics."""
         eval_fk(cls._model, cls._state_0.joint_q, cls._state_0.joint_qd, cls._state_0, None)
 
+    @classmethod
+    def pre_render(cls) -> None:
+        """Flush deferred Fabric writes before cameras/visualizers read the scene."""
+        cls.sync_transforms_to_usd()
+
     @classmethod
     def sync_transforms_to_usd(cls) -> None:
         """Write Newton body_q to USD Fabric world matrices for Kit viewport / RTX rendering.
@@ -160,8 +165,8 @@ def sync_transforms_to_usd(cls) -> None:
         No-op when ``_usdrt_stage`` is None (i.e. Kit visualizer is not active)
         or when transforms have not changed since the last sync.
 
-        This method is called at render cadence by
-        :class:`~isaaclab.sim.scene_data_providers.NewtonSceneDataProvider`.
+        Called at render cadence by :meth:`pre_render` (via
+        :meth:`~isaaclab.sim.SimulationContext.render`).
         Physics stepping marks transforms dirty via :meth:`_mark_transforms_dirty`
         so that the expensive Fabric hierarchy update only runs once per render
         frame rather than after every physics step.
@@ -243,10 +248,8 @@ def step(cls) -> None:
         # Step simulation (graphed or not; _graph is None when capture is disabled or failed)
         if cfg is not None and cfg.use_cuda_graph and cls._graph is not None and "cuda" in device:  # type: ignore[union-attr]
             wp.capture_launch(cls._graph)
-            # Fabric sync is excluded from the graph (wp.synchronize_device is non-capturable),
-            # so run it eagerly here after the graph replay.
             if cls._usdrt_stage is not None:
-                cls.sync_transforms_to_usd()
+                cls._mark_transforms_dirty()
         else:
             with wp.ScopedDevice(device):
                 cls._simulate()
diff --git a/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py b/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py
index ffba9463d86a..143ca6643e91 100644
--- a/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py
+++ b/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py
@@ -257,8 +257,8 @@ def tiling_grid_shape():
                 tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous()
 
             # Dump the tiled image to disk (first visual data type per frame only)
-            if tiled_data_buffer.dtype == wp.uint8:
-                self._save_tiled_image(tiled_data_buffer)
+            #if tiled_data_buffer.dtype == wp.uint8:
+            #    self._save_tiled_image(tiled_data_buffer)
 
             wp.launch(
                 kernel=reshape_tiled_image,

From c5e109d344019ae425b0a03c7511a90ab44c7203 Mon Sep 17 00:00:00 2001
From: Nathan Cournia <ncournia@nvidia.com>
Date: Thu, 12 Mar 2026 19:22:25 -0700
Subject: [PATCH 5/8] Cubric bindings

---
 .../isaaclab_newton/physics/_cubric.py        | 345 ++++++++++++++++++
 .../isaaclab_newton/physics/newton_manager.py | 114 ++++--
 2 files changed, 437 insertions(+), 22 deletions(-)
 create mode 100644 source/isaaclab_newton/isaaclab_newton/physics/_cubric.py

diff --git a/source/isaaclab_newton/isaaclab_newton/physics/_cubric.py b/source/isaaclab_newton/isaaclab_newton/physics/_cubric.py
new file mode 100644
index 000000000000..68453ee77a93
--- /dev/null
+++ b/source/isaaclab_newton/isaaclab_newton/physics/_cubric.py
@@ -0,0 +1,345 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Pure-Python ctypes bindings for the cubric GPU transform-hierarchy API.
+
+Acquires the ``omni::cubric::IAdapter`` carb interface directly from the
+Carbonite framework and wraps its function-pointer methods so that Newton
+can call cubric's GPU transform propagation without C++ pybind11 changes.
+
+The flow mirrors PhysX's ``DirectGpuHelper::updateXForms_GPU()``:
+
+1. ``IAdapter::create``     → allocate a cubric adapter ID
+2. ``IAdapter::bindToStage`` → bind to the current Fabric stage
+3. ``IAdapter::compute``     → GPU kernel: propagate world transforms
+4. ``IAdapter::release``     → free the adapter
+
+When cubric is unavailable (e.g. CPU-only machine, plugin not loaded), the
+caller falls back to the CPU ``update_world_xforms()`` path.
+"""
+
+from __future__ import annotations
+
+import ctypes
+import logging
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+#  Carb Framework struct layout (CARB_ABI function-pointer offsets, x86_64)
+# ---------------------------------------------------------------------------
+# Counting only CARB_ABI fields from the top of ``struct Framework``:
+#   0: loadPluginsEx
+#   8: unloadAllPlugins
+#  16: acquireInterfaceWithClient
+#  24: tryAcquireInterfaceWithClient  ← we use this one
+_FW_OFF_TRY_ACQUIRE = 24
+
+# ---------------------------------------------------------------------------
+#  IAdapter struct layout  (from omni/cubric/IAdapter.h)
+# ---------------------------------------------------------------------------
+#   0: getAttribute
+#   8: create(AdapterId*)
+#  16: refcount
+#  24: retain
+#  32: release(AdapterId)
+#  40: bindToStage(AdapterId, const FabricId&)
+#  48: unbind
+#  56: compute(AdapterId, options, dirtyMode, outFlags*)
+_IA_OFF_CREATE = 8
+_IA_OFF_RELEASE = 32
+_IA_OFF_BIND = 40
+_IA_OFF_COMPUTE = 56
+
+# AdapterId sentinel
+_INVALID_ADAPTER_ID = ctypes.c_uint64(~0).value
+
+# AdapterComputeOptions flags  (from IAdapter.h)
+_OPT_FORCE_UPDATE = 1 << 0              # Force update, ignoring invalidation status
+_OPT_FORCE_STATE_RECONSTRUCTION = 1 << 1  # Force full rebuild of internal accel structures
+_OPT_SKIP_ISOLATED = 1 << 2             # Skip prims with connectivity degree 0
+_OPT_RIGID_BODY = 1 << 3                # Use PhysicsRigidBodyAPI tag for inverse propagation
+
+# Newton prims get tagged with PhysicsRigidBodyAPI at init time so
+# cubric's eRigidBody mode can distinguish rigid-body buckets
+# (Inverse: preserve world matrix written by Newton, derive local)
+# from non-rigid-body buckets (Forward: propagate to children).
+# eForceUpdate is ORed in to bypass the change-listener check.
+_OPT_DEFAULT = _OPT_RIGID_BODY | _OPT_FORCE_UPDATE
+
+# AdapterDirtyMode
+_DIRTY_ALL = 0     # eAll — dirty all prims in the stage
+_DIRTY_COARSE = 1  # eCoarse — dirty all prims in visited buckets
+
+
+# ---------------------------------------------------------------------------
+#  ctypes struct mirrors
+# ---------------------------------------------------------------------------
+class _Version(ctypes.Structure):
+    _fields_ = [("major", ctypes.c_uint32), ("minor", ctypes.c_uint32)]
+
+
+class _InterfaceDesc(ctypes.Structure):
+    """``carb::InterfaceDesc`` — {const char* name, Version version}."""
+    _fields_ = [
+        ("name", ctypes.c_char_p),
+        ("version", _Version),
+    ]
+
+
+def _read_u64(addr: int) -> int:
+    return ctypes.c_uint64.from_address(addr).value
+
+
+def _dump_fn_ptrs(base: int, names: list[str], label: str) -> None:
+    """Log function pointer values at 8-byte intervals from *base*."""
+    for i, name in enumerate(names):
+        addr = _read_u64(base + i * 8)
+        tag = "OK" if addr else "NULL"
+        logger.info("  %s+%d (%s) = 0x%016x  [%s]", label, i * 8, name, addr, tag)
+
+
+# ---------------------------------------------------------------------------
+#  Public API
+# ---------------------------------------------------------------------------
+class CubricBindings:
+    """Typed wrappers around the cubric ``IAdapter`` API.
+
+    Call :meth:`initialize` once; if it returns ``True``, the four adapter
+    methods are available.
+    """
+
+    def __init__(self) -> None:
+        self._ia_ptr: int = 0
+        self._create_fn = None
+        self._release_fn = None
+        self._bind_fn = None
+        self._compute_fn = None
+        self._log_count = 0
+
+    # -- lifecycle -----------------------------------------------------------
+
+    def initialize(self) -> bool:
+        """Acquire the cubric ``IAdapter`` from the carb framework."""
+        # Ensure the omni.cubric extension (native carb plugin) is loaded.
+        try:
+            import omni.kit.app
+
+            ext_mgr = omni.kit.app.get_app().get_extension_manager()
+            cubric_enabled = ext_mgr.is_extension_enabled("omni.cubric")
+            logger.info("omni.cubric extension enabled: %s", cubric_enabled)
+            if not cubric_enabled:
+                logger.info("Enabling omni.cubric extension")
+                ext_mgr.set_extension_enabled_immediate("omni.cubric", True)
+                cubric_enabled = ext_mgr.is_extension_enabled("omni.cubric")
+                logger.info("omni.cubric after enable: %s", cubric_enabled)
+        except Exception as exc:
+            logger.warning("Cannot enable omni.cubric: %s", exc)
+            return False
+
+        # Get Framework* via libcarb.so acquireFramework (singleton).
+        try:
+            libcarb = ctypes.CDLL("libcarb.so")
+        except OSError:
+            logger.warning("Could not load libcarb.so")
+            return False
+
+        # Check which symbols libcarb exports for framework access
+        for sym_name in ("acquireFramework", "carbGetSdkVersion", "isFrameworkValid"):
+            try:
+                sym = getattr(libcarb, sym_name, None)
+                logger.info("libcarb.%s: %s", sym_name, "found" if sym else "missing")
+            except Exception:
+                logger.info("libcarb.%s: not accessible", sym_name)
+
+        # Verify framework is alive
+        try:
+            libcarb.isFrameworkValid.restype = ctypes.c_bool
+            libcarb.isFrameworkValid.argtypes = []
+            fw_valid = libcarb.isFrameworkValid()
+            logger.info("isFrameworkValid() = %s", fw_valid)
+        except Exception as exc:
+            logger.warning("isFrameworkValid() failed: %s", exc)
+
+        # Get SDK version for diagnostics
+        try:
+            libcarb.carbGetSdkVersion.restype = ctypes.c_char_p
+            libcarb.carbGetSdkVersion.argtypes = []
+            sdk_ver = libcarb.carbGetSdkVersion()
+            logger.info("carbGetSdkVersion() = %s", sdk_ver)
+        except Exception as exc:
+            logger.info("carbGetSdkVersion() failed: %s", exc)
+
+        libcarb.acquireFramework.restype = ctypes.c_void_p
+        libcarb.acquireFramework.argtypes = [ctypes.c_char_p, _Version]
+        fw_ptr = libcarb.acquireFramework(b"isaaclab.cubric", _Version(0, 0))
+        if not fw_ptr:
+            logger.warning("acquireFramework returned null")
+            return False
+        logger.info("carb Framework* = 0x%016x", fw_ptr)
+
+        # Dump first several framework function pointers for diagnosis
+        fw_fn_names = [
+            "loadPluginsEx",
+            "unloadAllPlugins",
+            "acquireInterfaceWithClient",
+            "tryAcquireInterfaceWithClient",
+        ]
+        _dump_fn_ptrs(fw_ptr, fw_fn_names, "Framework")
+
+        # Read tryAcquireInterfaceWithClient fn-ptr from Framework.
+        try_acquire_addr = _read_u64(fw_ptr + _FW_OFF_TRY_ACQUIRE)
+        if try_acquire_addr == 0:
+            logger.warning("tryAcquireInterfaceWithClient is null in Framework")
+            return False
+        logger.info("tryAcquireInterfaceWithClient addr = 0x%016x", try_acquire_addr)
+
+        try_acquire_fn = ctypes.CFUNCTYPE(
+            ctypes.c_void_p,   # return: void* (IAdapter*)
+            ctypes.c_char_p,   # clientName
+            _InterfaceDesc,    # desc (by value)
+            ctypes.c_char_p,   # pluginName
+        )(try_acquire_addr)
+
+        desc = _InterfaceDesc(
+            name=b"omni::cubric::IAdapter",
+            version=_Version(0, 1),
+        )
+        logger.info(
+            "Calling tryAcquireInterfaceWithClient("
+            "client=%r, iface=%r, ver=%d.%d, plugin=%r)",
+            b"carb.scripting-python.plugin",
+            desc.name,
+            desc.version.major,
+            desc.version.minor,
+            None,
+        )
+        ia_ptr = try_acquire_fn(b"carb.scripting-python.plugin", desc, None)
+        if not ia_ptr:
+            # Try without client name restriction
+            logger.info("First attempt returned null; retrying with client=None")
+            ia_ptr = try_acquire_fn(None, desc, None)
+        if not ia_ptr:
+            # Try acquireInterfaceWithClient (offset 16) which logs errors
+            logger.info("tryAcquire returned null; trying acquireInterfaceWithClient")
+            acquire_addr = _read_u64(fw_ptr + 16)
+            if acquire_addr:
+                acquire_fn = ctypes.CFUNCTYPE(
+                    ctypes.c_void_p,
+                    ctypes.c_char_p,
+                    _InterfaceDesc,
+                    ctypes.c_char_p,
+                )(acquire_addr)
+                ia_ptr = acquire_fn(b"isaaclab.cubric", desc, None)
+        if not ia_ptr:
+            logger.warning(
+                "All IAdapter acquisition attempts returned null — "
+                "cubric plugin may not be registered or interface mismatch"
+            )
+            return False
+        self._ia_ptr = ia_ptr
+        logger.info("IAdapter* = 0x%016x", ia_ptr)
+
+        # Dump all IAdapter function pointers
+        ia_fn_names = [
+            "getAttribute", "create", "refcount", "retain",
+            "release", "bindToStage", "unbind", "compute",
+        ]
+        _dump_fn_ptrs(ia_ptr, ia_fn_names, "IAdapter")
+
+        # Wrap the four IAdapter function pointers we need.
+        create_addr = _read_u64(ia_ptr + _IA_OFF_CREATE)
+        release_addr = _read_u64(ia_ptr + _IA_OFF_RELEASE)
+        bind_addr = _read_u64(ia_ptr + _IA_OFF_BIND)
+        compute_addr = _read_u64(ia_ptr + _IA_OFF_COMPUTE)
+
+        if not all([create_addr, release_addr, bind_addr, compute_addr]):
+            logger.warning("One or more IAdapter function pointers are null")
+            return False
+
+        # create(AdapterId* out) -> bool
+        self._create_fn = ctypes.CFUNCTYPE(
+            ctypes.c_bool, ctypes.POINTER(ctypes.c_uint64),
+        )(create_addr)
+
+        # release(AdapterId) -> bool
+        self._release_fn = ctypes.CFUNCTYPE(
+            ctypes.c_bool, ctypes.c_uint64,
+        )(release_addr)
+
+        # bindToStage(AdapterId, const FabricId&) -> bool
+        # FabricId is uint64, passed by const-ref → pointer on x86_64
+        self._bind_fn = ctypes.CFUNCTYPE(
+            ctypes.c_bool, ctypes.c_uint64, ctypes.POINTER(ctypes.c_uint64),
+        )(bind_addr)
+
+        # compute(AdapterId, options, dirtyMode, outAccountFlags*) -> bool
+        self._compute_fn = ctypes.CFUNCTYPE(
+            ctypes.c_bool,
+            ctypes.c_uint64,   # adapterId
+            ctypes.c_uint32,   # options  (AdapterComputeOptions)
+            ctypes.c_int32,    # dirtyMode (AdapterDirtyMode)
+            ctypes.c_void_p,   # outAccountFlags* (nullable)
+        )(compute_addr)
+
+        logger.info(
+            "cubric IAdapter bindings ready (opts=0x%x [eRigidBody|eForceUpdate], dirty=%d [eAll])",
+            _OPT_DEFAULT,
+            _DIRTY_ALL,
+        )
+        return True
+
+    @property
+    def available(self) -> bool:
+        return self._ia_ptr != 0 and self._compute_fn is not None  # wrappers are bound last in initialize()
+
+    # -- cubric adapter methods ----------------------------------------------
+
+    def create_adapter(self) -> int | None:
+        """Create a cubric adapter. Returns an adapter ID or ``None``."""
+        if not self._create_fn:
+            return None
+        adapter_id = ctypes.c_uint64(_INVALID_ADAPTER_ID)
+        ok = self._create_fn(ctypes.byref(adapter_id))
+        if not ok or adapter_id.value == _INVALID_ADAPTER_ID:
+            logger.warning("IAdapter::create failed")
+            return None
+        logger.info("cubric adapter created (id=%d)", adapter_id.value)
+        return adapter_id.value
+
+    def bind_to_stage(self, adapter_id: int, fabric_id: int) -> bool:
+        """Bind the adapter to a Fabric stage."""
+        if not self._bind_fn:
+            return False
+        fid = ctypes.c_uint64(fabric_id)
+        ok = self._bind_fn(adapter_id, ctypes.byref(fid))
+        if not ok:
+            logger.warning("IAdapter::bindToStage failed (adapter=%d, fabricId=%d)", adapter_id, fabric_id)
+        elif self._log_count < 3:
+            logger.info("IAdapter::bindToStage ok (adapter=%d, fabricId=%d)", adapter_id, fabric_id)
+        return ok
+
+    def compute(self, adapter_id: int) -> bool:
+        """Run the GPU transform-hierarchy compute pass.
+
+        Uses ``eRigidBody | eForceUpdate`` with ``eAll`` dirty mode.
+        ``eRigidBody`` makes cubric apply Inverse propagation on buckets
+        tagged with ``PhysicsRigidBodyAPI`` (keeps Newton's world transforms,
+        derives local) and Forward on everything else (propagates to children).
+        ``eForceUpdate`` bypasses the change-listener dirty check.
+        """
+        if not self._compute_fn:
+            return False
+        flags = ctypes.c_uint32(0)
+        ok = self._compute_fn(adapter_id, _OPT_DEFAULT, _DIRTY_ALL, ctypes.byref(flags))
+        if not ok:
+            logger.warning("IAdapter::compute returned false (flags=0x%x)", flags.value)
+        return ok
+
+    def release_adapter(self, adapter_id: int) -> None:
+        """Release an adapter; ID 0 is released too (the invalid sentinel is ~0, not 0)."""
+        if adapter_id is None or not self._release_fn:
+            return
+        self._release_fn(adapter_id)
diff --git a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
index cd9be10c49a1..50bf97e70e5a 100644
--- a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
+++ b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
@@ -104,6 +104,10 @@ class NewtonManager(PhysicsManager):
     _clone_physics_only = False
     _transforms_dirty: bool = False
 
+    # cubric GPU transform hierarchy (replaces CPU update_world_xforms)
+    _cubric = None
+    _cubric_adapter: int | None = None
+
     # Model changes (callbacks use unified system from PhysicsManager)
     _model_changes: set[int] = set()
 
@@ -174,6 +178,12 @@ def sync_transforms_to_usd(cls) -> None:
         Uses ``wp.fabricarray`` directly (no ``isaacsim.physics.newton`` extension needed).
         The Warp kernel reads ``state_0.body_q[newton_index[i]]`` and writes the
         corresponding ``mat44d`` to ``omni:fabric:worldMatrix`` for each prim.
+
+        When cubric is available the method mirrors PhysX's ``DirectGpuHelper``
+        pattern: pause Fabric change tracking, write transforms, call
+        ``IAdapter::compute`` on the GPU to propagate the hierarchy and notify
+        the Fabric Scene Delegate, then resume tracking.  Otherwise it falls
+        back to the CPU ``update_world_xforms()`` path.
         """
         if cls._usdrt_stage is None or cls._model is None or cls._state_0 is None:
             return
@@ -182,32 +192,70 @@ def sync_transforms_to_usd(cls) -> None:
         try:
             import usdrt
 
-            selection = cls._usdrt_stage.SelectPrims(
-                require_attrs=[
-                    (usdrt.Sdf.ValueTypeNames.Matrix4d, "omni:fabric:worldMatrix", usdrt.Usd.Access.ReadWrite),
-                    (usdrt.Sdf.ValueTypeNames.UInt, cls._newton_index_attr, usdrt.Usd.Access.Read),
-                ],
-                device=str(PhysicsManager._device),
-            )
-            if selection.GetCount() == 0:
-                return
-            fabric_transforms = wp.fabricarray(selection, "omni:fabric:worldMatrix")
-            newton_indices = wp.fabricarray(selection, cls._newton_index_attr)
-            wp.launch(
-                _set_fabric_transforms,
-                dim=newton_indices.shape[0],
-                inputs=[fabric_transforms, newton_indices, cls._state_0.body_q],
-                device=PhysicsManager._device,
-            )
-            wp.synchronize_device(PhysicsManager._device)
+            # Lazy adapter creation: deferred from initialize_solver() to avoid
+            # startup-ordering issues with the cubric plugin.
+            if cls._cubric is not None and cls._cubric.available and cls._cubric_adapter is None:
+                cls._cubric_adapter = cls._cubric.create_adapter()
+                if cls._cubric_adapter is not None:
+                    logger.info("cubric GPU transform hierarchy enabled")
+                else:
+                    logger.warning("cubric adapter creation failed; falling back to update_world_xforms()")
+                    cls._cubric = None
+
+            use_cubric = cls._cubric is not None and cls._cubric_adapter is not None
+
+            fabric_hierarchy = None
             if hasattr(usdrt, "hierarchy"):
                 fabric_hierarchy = usdrt.hierarchy.IFabricHierarchy().get_fabric_hierarchy(
                     cls._usdrt_stage.GetFabricId(), cls._usdrt_stage.GetStageIdAsStageId()
                 )
-                fabric_hierarchy.update_world_xforms()
-            cls._transforms_dirty = False
-        except Exception as exc:
-            logger.debug("[NewtonManager] sync_transforms_to_usd: %s", exc)
+
+            # Pause hierarchy change tracking BEFORE SelectPrims.
+            # SelectPrims with ReadWrite access calls getAttributeArrayGpu
+            # internally, which marks Fabric buffers dirty.  If tracking is
+            # still active at that point the hierarchy records the change and
+            # Kit's updateWorldXforms will do an expensive connectivity
+            # rebuild every frame.  PhysX avoids this via ScopedUSDRT which
+            # pauses tracking before any Fabric writes.
+            if use_cubric and fabric_hierarchy is not None:
+                fabric_hierarchy.track_world_xform_changes(False)
+                fabric_hierarchy.track_local_xform_changes(False)
+
+            try:
+                selection = cls._usdrt_stage.SelectPrims(
+                    require_attrs=[
+                        (usdrt.Sdf.ValueTypeNames.Matrix4d, "omni:fabric:worldMatrix", usdrt.Usd.Access.ReadWrite),
+                        (usdrt.Sdf.ValueTypeNames.UInt, cls._newton_index_attr, usdrt.Usd.Access.Read),
+                    ],
+                    device=str(PhysicsManager._device),
+                )
+                if selection.GetCount() == 0:
+                    return
+
+                fabric_transforms = wp.fabricarray(selection, "omni:fabric:worldMatrix")
+                newton_indices = wp.fabricarray(selection, cls._newton_index_attr)
+                wp.launch(
+                    _set_fabric_transforms,
+                    dim=newton_indices.shape[0],
+                    inputs=[fabric_transforms, newton_indices, cls._state_0.body_q],
+                    device=PhysicsManager._device,
+                )
+                wp.synchronize_device(PhysicsManager._device)
+
+                cls._transforms_dirty = False
+
+                if use_cubric and fabric_hierarchy is not None:
+                    fabric_id = cls._usdrt_stage.GetFabricId().id
+                    cls._cubric.bind_to_stage(cls._cubric_adapter, fabric_id)
+                    cls._cubric.compute(cls._cubric_adapter)
+                elif fabric_hierarchy is not None:
+                    fabric_hierarchy.update_world_xforms()
+            finally:
+                if use_cubric and fabric_hierarchy is not None:
+                    fabric_hierarchy.track_world_xform_changes(True)
+                    fabric_hierarchy.track_local_xform_changes(True)
+        except Exception:
+            logger.exception("[NewtonManager] sync_transforms_to_usd FAILED")
 
     @classmethod
     def _mark_transforms_dirty(cls) -> None:
@@ -288,6 +336,10 @@ def is_fabric_enabled(cls) -> bool:
     @classmethod
     def clear(cls):
         """Clear all Newton-specific state (callbacks cleared by super().close())."""
+        if cls._cubric is not None and cls._cubric_adapter is not None:
+            cls._cubric.release_adapter(cls._cubric_adapter)
+        cls._cubric = None
+        cls._cubric_adapter = None
         cls._builder = None
         cls._model = None
         cls._solver = None
@@ -365,6 +417,10 @@ def start_simulation(cls) -> None:
                 prim = cls._usdrt_stage.GetPrimAtPath(prim_path)
                 prim.CreateAttribute(cls._newton_index_attr, usdrt.Sdf.ValueTypeNames.UInt, True)
                 prim.GetAttribute(cls._newton_index_attr).Set(i)
+                # Tag with PhysicsRigidBodyAPI so cubric's eRigidBody mode
+                # applies Inverse propagation (preserves Newton's world
+                # transforms and derives local) instead of Forward.
+                prim.AddAppliedSchema("PhysicsRigidBodyAPI")
                 xformable_prim = usdrt.Rt.Xformable(prim)
                 if not xformable_prim.HasWorldXform():
                     xformable_prim.SetWorldXformFromUsd()
@@ -529,6 +585,20 @@ def initialize_solver(cls) -> None:
             # Initialize contacts and collision pipeline
             cls._initialize_contacts()
 
+        # Prepare cubric ctypes bindings (acquires IAdapter from carb framework).
+        # Adapter creation is deferred to the first sync_transforms_to_usd() call
+        # at render time to avoid any startup-ordering issues with the cubric
+        # plugin initialisation.
+        if cls._usdrt_stage is not None:
+            from isaaclab_newton.physics._cubric import CubricBindings
+
+            cls._cubric = CubricBindings()
+            if cls._cubric.initialize():
+                logger.info("cubric bindings ready (adapter deferred to first render)")
+            else:
+                logger.warning("cubric bindings init failed; falling back to update_world_xforms()")
+                cls._cubric = None
+
         device = PhysicsManager._device
 
         use_cuda_graph = cfg.use_cuda_graph and "cuda" in device  # type: ignore[union-attr]

From c383820deb63a149020784a60eef974cb7e87dd6 Mon Sep 17 00:00:00 2001
From: Nathan Cournia <ncournia@nvidia.com>
Date: Fri, 13 Mar 2026 09:17:49 -0700
Subject: [PATCH 6/8] Remove debugging code.

---
 .../isaaclab_newton/physics/_cubric.py        | 105 +++---------------
 1 file changed, 14 insertions(+), 91 deletions(-)

diff --git a/source/isaaclab_newton/isaaclab_newton/physics/_cubric.py b/source/isaaclab_newton/isaaclab_newton/physics/_cubric.py
index 68453ee77a93..f9c4d0da4454 100644
--- a/source/isaaclab_newton/isaaclab_newton/physics/_cubric.py
+++ b/source/isaaclab_newton/isaaclab_newton/physics/_cubric.py
@@ -93,14 +93,6 @@ def _read_u64(addr: int) -> int:
     return ctypes.c_uint64.from_address(addr).value
 
 
-def _dump_fn_ptrs(base: int, names: list[str], label: str) -> None:
-    """Log function pointer values at 8-byte intervals from *base*."""
-    for i, name in enumerate(names):
-        addr = _read_u64(base + i * 8)
-        tag = "OK" if addr else "NULL"
-        logger.info("  %s+%d (%s) = 0x%016x  [%s]", label, i * 8, name, addr, tag)
-
-
 # ---------------------------------------------------------------------------
 #  Public API
 # ---------------------------------------------------------------------------
@@ -117,7 +109,6 @@ def __init__(self) -> None:
         self._release_fn = None
         self._bind_fn = None
         self._compute_fn = None
-        self._log_count = 0
 
     # -- lifecycle -----------------------------------------------------------
 
@@ -128,13 +119,11 @@ def initialize(self) -> bool:
             import omni.kit.app
 
             ext_mgr = omni.kit.app.get_app().get_extension_manager()
-            cubric_enabled = ext_mgr.is_extension_enabled("omni.cubric")
-            logger.info("omni.cubric extension enabled: %s", cubric_enabled)
-            if not cubric_enabled:
-                logger.info("Enabling omni.cubric extension")
+            if not ext_mgr.is_extension_enabled("omni.cubric"):
                 ext_mgr.set_extension_enabled_immediate("omni.cubric", True)
-                cubric_enabled = ext_mgr.is_extension_enabled("omni.cubric")
-                logger.info("omni.cubric after enable: %s", cubric_enabled)
+            if not ext_mgr.is_extension_enabled("omni.cubric"):
+                logger.warning("Failed to enable omni.cubric extension")
+                return False
         except Exception as exc:
             logger.warning("Cannot enable omni.cubric: %s", exc)
             return False
@@ -146,55 +135,18 @@ def initialize(self) -> bool:
             logger.warning("Could not load libcarb.so")
             return False
 
-        # Check which symbols libcarb exports for framework access
-        for sym_name in ("acquireFramework", "carbGetSdkVersion", "isFrameworkValid"):
-            try:
-                sym = getattr(libcarb, sym_name, None)
-                logger.info("libcarb.%s: %s", sym_name, "found" if sym else "missing")
-            except Exception:
-                logger.info("libcarb.%s: not accessible", sym_name)
-
-        # Verify framework is alive
-        try:
-            libcarb.isFrameworkValid.restype = ctypes.c_bool
-            libcarb.isFrameworkValid.argtypes = []
-            fw_valid = libcarb.isFrameworkValid()
-            logger.info("isFrameworkValid() = %s", fw_valid)
-        except Exception as exc:
-            logger.warning("isFrameworkValid() failed: %s", exc)
-
-        # Get SDK version for diagnostics
-        try:
-            libcarb.carbGetSdkVersion.restype = ctypes.c_char_p
-            libcarb.carbGetSdkVersion.argtypes = []
-            sdk_ver = libcarb.carbGetSdkVersion()
-            logger.info("carbGetSdkVersion() = %s", sdk_ver)
-        except Exception as exc:
-            logger.info("carbGetSdkVersion() failed: %s", exc)
-
         libcarb.acquireFramework.restype = ctypes.c_void_p
         libcarb.acquireFramework.argtypes = [ctypes.c_char_p, _Version]
         fw_ptr = libcarb.acquireFramework(b"isaaclab.cubric", _Version(0, 0))
         if not fw_ptr:
             logger.warning("acquireFramework returned null")
             return False
-        logger.info("carb Framework* = 0x%016x", fw_ptr)
-
-        # Dump first several framework function pointers for diagnosis
-        fw_fn_names = [
-            "loadPluginsEx",
-            "unloadAllPlugins",
-            "acquireInterfaceWithClient",
-            "tryAcquireInterfaceWithClient",
-        ]
-        _dump_fn_ptrs(fw_ptr, fw_fn_names, "Framework")
-
-        # Read tryAcquireInterfaceWithClient fn-ptr from Framework.
+
+        # Read tryAcquireInterfaceWithClient fn-ptr from the Framework struct.
         try_acquire_addr = _read_u64(fw_ptr + _FW_OFF_TRY_ACQUIRE)
         if try_acquire_addr == 0:
             logger.warning("tryAcquireInterfaceWithClient is null in Framework")
             return False
-        logger.info("tryAcquireInterfaceWithClient addr = 0x%016x", try_acquire_addr)
 
         try_acquire_fn = ctypes.CFUNCTYPE(
             ctypes.c_void_p,   # return: void* (IAdapter*)
@@ -207,24 +159,14 @@ def initialize(self) -> bool:
             name=b"omni::cubric::IAdapter",
             version=_Version(0, 1),
         )
-        logger.info(
-            "Calling tryAcquireInterfaceWithClient("
-            "client=%r, iface=%r, ver=%d.%d, plugin=%r)",
-            b"carb.scripting-python.plugin",
-            desc.name,
-            desc.version.major,
-            desc.version.minor,
-            None,
-        )
+
+        # Try several acquisition strategies — the required client name
+        # varies across Kit configurations.
         ia_ptr = try_acquire_fn(b"carb.scripting-python.plugin", desc, None)
         if not ia_ptr:
-            # Try without client name restriction
-            logger.info("First attempt returned null; retrying with client=None")
             ia_ptr = try_acquire_fn(None, desc, None)
         if not ia_ptr:
-            # Try acquireInterfaceWithClient (offset 16) which logs errors
-            logger.info("tryAcquire returned null; trying acquireInterfaceWithClient")
-            acquire_addr = _read_u64(fw_ptr + 16)
+            acquire_addr = _read_u64(fw_ptr + 16)  # acquireInterfaceWithClient
             if acquire_addr:
                 acquire_fn = ctypes.CFUNCTYPE(
                     ctypes.c_void_p,
@@ -235,19 +177,11 @@ def initialize(self) -> bool:
                 ia_ptr = acquire_fn(b"isaaclab.cubric", desc, None)
         if not ia_ptr:
             logger.warning(
-                "All IAdapter acquisition attempts returned null — "
-                "cubric plugin may not be registered or interface mismatch"
+                "Could not acquire omni::cubric::IAdapter — "
+                "cubric plugin may not be registered or interface version mismatch"
             )
             return False
         self._ia_ptr = ia_ptr
-        logger.info("IAdapter* = 0x%016x", ia_ptr)
-
-        # Dump all IAdapter function pointers
-        ia_fn_names = [
-            "getAttribute", "create", "refcount", "retain",
-            "release", "bindToStage", "unbind", "compute",
-        ]
-        _dump_fn_ptrs(ia_ptr, ia_fn_names, "IAdapter")
 
         # Wrap the four IAdapter function pointers we need.
         create_addr = _read_u64(ia_ptr + _IA_OFF_CREATE)
@@ -259,23 +193,19 @@ def initialize(self) -> bool:
             logger.warning("One or more IAdapter function pointers are null")
             return False
 
-        # create(AdapterId* out) -> bool
         self._create_fn = ctypes.CFUNCTYPE(
             ctypes.c_bool, ctypes.POINTER(ctypes.c_uint64),
         )(create_addr)
 
-        # release(AdapterId) -> bool
         self._release_fn = ctypes.CFUNCTYPE(
             ctypes.c_bool, ctypes.c_uint64,
         )(release_addr)
 
-        # bindToStage(AdapterId, const FabricId&) -> bool
-        # FabricId is uint64, passed by const-ref → pointer on x86_64
+        # FabricId is uint64, passed by const-ref -> pointer on x86_64
         self._bind_fn = ctypes.CFUNCTYPE(
             ctypes.c_bool, ctypes.c_uint64, ctypes.POINTER(ctypes.c_uint64),
         )(bind_addr)
 
-        # compute(AdapterId, options, dirtyMode, outAccountFlags*) -> bool
         self._compute_fn = ctypes.CFUNCTYPE(
             ctypes.c_bool,
             ctypes.c_uint64,   # adapterId
@@ -284,11 +214,7 @@ def initialize(self) -> bool:
             ctypes.c_void_p,   # outAccountFlags* (nullable)
         )(compute_addr)
 
-        logger.info(
-            "cubric IAdapter bindings ready (opts=0x%x [eRigidBody|eForceUpdate], dirty=%d [eAll])",
-            _OPT_DEFAULT,
-            _DIRTY_ALL,
-        )
+        logger.info("cubric IAdapter bindings ready")
         return True
 
     @property
@@ -306,7 +232,6 @@ def create_adapter(self) -> int | None:
         if not ok or adapter_id.value == _INVALID_ADAPTER_ID:
             logger.warning("IAdapter::create failed")
             return None
-        logger.info("cubric adapter created (id=%d)", adapter_id.value)
         return adapter_id.value
 
     def bind_to_stage(self, adapter_id: int, fabric_id: int) -> bool:
@@ -317,8 +242,6 @@ def bind_to_stage(self, adapter_id: int, fabric_id: int) -> bool:
         ok = self._bind_fn(adapter_id, ctypes.byref(fid))
         if not ok:
             logger.warning("IAdapter::bindToStage failed (adapter=%d, fabricId=%d)", adapter_id, fabric_id)
-        elif self._log_count < 3:
-            logger.info("IAdapter::bindToStage ok (adapter=%d, fabricId=%d)", adapter_id, fabric_id)
         return ok
 
     def compute(self, adapter_id: int) -> bool:

From b9870dbca1d298902153c079277d1093e8882791 Mon Sep 17 00:00:00 2001
From: Nathan Cournia <ncournia@nvidia.com>
Date: Fri, 13 Mar 2026 09:22:22 -0700
Subject: [PATCH 7/8] Revert "write images"

This reverts commit 46830e6a6e3150da5f0fb378528ab78253e0dea0.
---
 .../renderers/isaac_rtx_renderer.py           | 26 -------------------
 1 file changed, 26 deletions(-)

diff --git a/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py b/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py
index 143ca6643e91..22b07f13def0 100644
--- a/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py
+++ b/source/isaaclab_physx/isaaclab_physx/renderers/isaac_rtx_renderer.py
@@ -8,9 +8,7 @@
 from __future__ import annotations
 
 import json
-import logging
 import math
-import os
 import weakref
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
@@ -50,12 +48,6 @@ class IsaacRtxRenderData:
     sensor: SensorBase | None = None
 
 
-_logger = logging.getLogger(__name__)
-
-_TILED_DUMP_DIR = "/tmp/il-rtx"
-_tiled_frame_counter = 0
-
-
 class IsaacRtxRenderer(BaseRenderer):
     """Isaac RTX backend using Omniverse Replicator for tiled camera rendering.
 
@@ -256,10 +248,6 @@ def tiling_grid_shape():
             if data_type in SIMPLE_SHADING_MODES:
                 tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous()
 
-            # Dump the tiled image to disk (first visual data type per frame only)
-            #if tiled_data_buffer.dtype == wp.uint8:
-            #    self._save_tiled_image(tiled_data_buffer)
-
             wp.launch(
                 kernel=reshape_tiled_image,
                 dim=(view_count, cfg.height, cfg.width),
@@ -292,20 +280,6 @@ def tiling_grid_shape():
                     0.0 if cfg.depth_clipping_behavior == "zero" else cfg.spawn.clipping_range[1]
                 )
 
-    def _save_tiled_image(self, tiled_data: wp.array):
-        """Save the raw tiled render product to disk as a PNG."""
-        global _tiled_frame_counter
-        os.makedirs(_TILED_DUMP_DIR, exist_ok=True)
-        path = os.path.join(_TILED_DUMP_DIR, f"frame_{_tiled_frame_counter:06d}.png")
-        _tiled_frame_counter += 1
-        try:
-            from PIL import Image
-
-            img_np = wp.to_torch(tiled_data).cpu().numpy()
-            Image.fromarray(img_np).save(path)
-        except Exception:
-            _logger.warning("Failed to save tiled image to %s", path, exc_info=True)
-
     def write_output(self, render_data: IsaacRtxRenderData, output_name: str, output_data: torch.Tensor):
         """No-op for Isaac RTX - all outputs written in render().
         See :meth:`~isaaclab.renderers.base_renderer.BaseRenderer.write_output`."""

From a0248bc61e941b4401c5fe97f98f056766868cec Mon Sep 17 00:00:00 2001
From: Nathan Cournia <ncournia@nvidia.com>
Date: Fri, 13 Mar 2026 09:59:25 -0700
Subject: [PATCH 8/8] Simplify cuda graph code.

---
 .../isaaclab_newton/physics/newton_manager.py | 41 +++----------------
 1 file changed, 5 insertions(+), 36 deletions(-)

diff --git a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
index 50bf97e70e5a..ca0c8b52eac1 100644
--- a/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
+++ b/source/isaaclab_newton/isaaclab_newton/physics/newton_manager.py
@@ -780,44 +780,13 @@ def step_fn(state_0, state_1):
 
     @classmethod
     def _simulate(cls) -> None:
-        """Run one simulation step with substeps."""
+        """Run one simulation step with substeps and flag transforms for USD sync.
 
-        # MJWarp can use its internal collision pipeline.
-        if cls._needs_collision_pipeline:
-            cls._collision_pipeline.collide(cls._state_0, cls._contacts)
-            contacts = cls._contacts
-        else:
-            contacts = None
-
-        def step_fn(state_0, state_1):
-            cls._solver.step(state_0, state_1, cls._control, contacts, cls._solver_dt)
-
-        if cls._use_single_state:
-            for i in range(cls._num_substeps):
-                step_fn(cls._state_0, cls._state_0)
-                cls._state_0.clear_forces()
-        else:
-            cfg = PhysicsManager._cfg
-            need_copy_on_last_substep = (cfg is not None and cfg.use_cuda_graph) and cls._num_substeps % 2 == 1  # type: ignore[union-attr]
-
-            for i in range(cls._num_substeps):
-                step_fn(cls._state_0, cls._state_1)
-                if need_copy_on_last_substep and i == cls._num_substeps - 1:
-                    cls._state_0.assign(cls._state_1)
-                else:
-                    cls._state_0, cls._state_1 = cls._state_1, cls._state_0
-                cls._state_0.clear_forces()
-
-        # Populate contacts for contact sensors
-        if cls._report_contacts:
-            # For newton_contacts (unified pipeline): use locally computed contacts
-            # For mujoco_contacts: use class-level _contacts, solver populates it from MuJoCo data
-            eval_contacts = contacts if contacts is not None else cls._contacts
-            cls._solver.update_contacts(eval_contacts, cls._state_0)
-            for sensor in cls._newton_contact_sensors.values():
-                sensor.update(cls._state_0, eval_contacts)
+        Delegates physics work to :meth:`_simulate_physics_only` and then
+        marks transforms dirty for the next render-cadence sync.
+        """
+        cls._simulate_physics_only()
 
-        # Mark transforms dirty so the next render-cadence sync picks them up.
         if cls._usdrt_stage is not None:
             cls._mark_transforms_dirty()