Skip to content

Commit ccb4db5

Browse files
committed
Merge branch 'GH-688' into 'main'
Fix 2D tile_load fast path conditions. See merge request omniverse/warp!1276
2 parents f579744 + db2b587 commit ccb4db5

File tree

3 files changed

+33
-3
lines changed

3 files changed

+33
-3
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
- Fix `OpenGLRenderer.update_shape_instance()` not having color buffers created for the shape instances.
7373
- Allow recovering from out-of-memory errors during Volume allocation ([GH-611](https://github.com/NVIDIA/warp/issues/611)).
7474
- Address `wp.tile_atomic_add()` compiler errors.
75+
- Fix 2D tile load when source array and tile have incompatible strides ([GH-688](https://github.com/NVIDIA/warp/issues/688)).
7576

7677
## [1.7.0] - 2025-03-30
7778

warp/native/tile.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,14 +1160,15 @@ struct tile_shared_t
11601160
const bool contiguous_dest = dest.data.strides[lastdim] == sizeof(T);
11611161
const int elements = min(Layout::Shape::dim(1), (dest.data.shape[lastdim] - dest.offset[lastdim]));
11621162
const bool aligned_size = (elements*sizeof(T))%sizeof(float4) == 0;
1163-
1163+
const bool aligned_stride = (dest.data.strides[0]/sizeof(T))%Layout::Stride::dim(0) == 0;
1164+
11641165
float4* dest128 = (float4*)&dest.data.data[dest.index_from_coord(tile_coord(0,0))];
11651166
const bool aligned_dst = (uint64_t)(dest128)%sizeof(float4) == 0;
11661167

11671168
constexpr int M = Layout::Shape::dim(0);
11681169
constexpr int N = (Layout::Shape::dim(1)*sizeof(T))/sizeof(float4);
11691170

1170-
if (contiguous_dest && contiguous_src && aligned_size && aligned_dst && N)
1171+
if (contiguous_dest && contiguous_src && aligned_size && aligned_dst && aligned_stride && N)
11711172
{
11721173
// alias of shared tile with 128bit type
11731174
using SrcLayout = tile_layout_strided_t<tile_shape_t<M, N>>;
@@ -1249,14 +1250,15 @@ struct tile_shared_t
12491250
const bool contiguous_src = src.data.strides[lastdim] == sizeof(T);
12501251
const int elements = min(Layout::Shape::dim(1), (src.data.shape[lastdim] - src.offset[lastdim]));
12511252
const bool aligned_size = (elements*sizeof(T))%sizeof(float4) == 0;
1253+
const bool aligned_stride = (src.data.strides[0]/sizeof(T))%Layout::Stride::dim(0) == 0;
12521254

12531255
float4* src128 = (float4*)&src.data.data[src.index_from_coord(tile_coord(0,0))];
12541256
const bool aligned_src = (uint64_t)(src128)%sizeof(float4) == 0;
12551257

12561258
constexpr int M = Layout::Shape::dim(0);
12571259
constexpr int N = (Layout::Shape::dim(1)*sizeof(T))/sizeof(float4);
12581260

1259-
if (contiguous_dest && contiguous_src && aligned_size && aligned_src && N)
1261+
if (contiguous_dest && contiguous_src && aligned_size && aligned_src && aligned_stride && N)
12601262
{
12611263
// alias of shared tile with 128bit type
12621264
using DestLayout = tile_layout_strided_t<tile_shape_t<M, N>>;

warp/tests/tile/test_tile_load.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,32 @@ def test_tile_load_aligned_offset_unaligned_size(test, device):
274274
assert_np_equal(output_array.numpy()[TILE_WIDTH:, :], np.zeros((remaining_height, TILE_M)))
275275

276276

277+
@wp.kernel
278+
def test_tile_load_stride_unaligned_kernel(input: wp.array2d(dtype=wp.float32), output: wp.array2d(dtype=wp.float32)):
279+
tile = wp.tile_load(input, shape=(4, 4))
280+
wp.tile_store(output, tile)
281+
282+
283+
# Regression test (GH-688): a float4-aligned 4x4 tile loaded from a source array whose row stride (5 elements) is not a multiple of the tile's row stride must not take the vectorized fast path
284+
def test_tile_load_stride_unaligned(test, device):
285+
DIM = 5
286+
input_np = np.eye(DIM) * 2.0
287+
input_array = wp.array(input_np, dtype=wp.float32, device=device)
288+
output_array = wp.zeros_like(input_array)
289+
290+
wp.launch_tiled(
291+
test_tile_load_stride_unaligned_kernel,
292+
dim=(1, 1),
293+
inputs=[input_array],
294+
outputs=[output_array],
295+
block_dim=TILE_DIM,
296+
device=device,
297+
)
298+
299+
input_np[DIM - 1, DIM - 1] = 0.0
300+
assert_np_equal(output_array.numpy(), input_np)
301+
302+
277303
# ----------------------------------------------------------------------------------------
278304

279305
TILE_SIZE = 4
@@ -485,6 +511,7 @@ class TestTileLoad(unittest.TestCase):
485511
test_tile_load_aligned_offset_unaligned_size,
486512
devices=devices,
487513
)
514+
add_function_test(TestTileLoad, "test_tile_load_stride_unaligned", test_tile_load_stride_unaligned, devices=devices)
488515

489516
add_function_test(TestTileLoad, "test_tile_extract_1d", test_tile_extract(tile_extract_1d_kernel, 1), devices=devices)
490517
add_function_test(TestTileLoad, "test_tile_extract_2d", test_tile_extract(tile_extract_2d_kernel, 2), devices=devices)

0 commit comments

Comments
 (0)