
Commit 4e838fd

add more fused problems (#36)
* add problems
* fix
1 parent c0194df commit 4e838fd

File tree: 12 files changed (+118, -169 lines changed)


problems/box-blur/problem.md

Lines changed: 0 additions & 3 deletions

@@ -52,6 +52,3 @@ This creates a blurring effect by smoothing out pixel values. The larger the ker
 ## Notes:
 - The input tensor is a single-channel grayscale image
 - Handle edge cases by only averaging available pixels (no padding)
-- For pixels near the border, use a smaller effective kernel
-- The kernel size should be odd (3x3, 5x5, 7x7, etc.)
-- This is a fundamental operation in computer graphics and image processing
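For reference, a minimal PyTorch sketch of the box blur these notes describe, averaging only the pixels that fall inside the image (names are illustrative, not this repo's API):

import torch

def box_blur_reference(image: torch.Tensor, kernel_size: int = 3) -> torch.Tensor:
    """Average each pixel over its kernel_size x kernel_size neighborhood,
    clipping the window at the borders (no padding)."""
    h, w = image.shape
    r = kernel_size // 2
    out = torch.empty_like(image)
    for i in range(h):
        for j in range(w):
            i0, i1 = max(0, i - r), min(h, i + r + 1)
            j0, j1 = max(0, j - r), min(w, j + r + 1)
            out[i, j] = image[i0:i1, j0:j1].mean()
    return out

# Example: blur a 4x4 ramp image with a 3x3 window
img = torch.arange(16, dtype=torch.float32).reshape(4, 4)
print(box_blur_reference(img, kernel_size=3))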
Lines changed: 2 additions & 18 deletions

@@ -25,33 +25,20 @@ def reference_solution(self, input_image: torch.Tensor, kernel: torch.Tensor) ->
             Result of conv2d -> ReLU -> HardSwish fusion
         """
         with torch.no_grad(), torch.autocast("cuda", enabled=False, dtype=torch.float32):
-            # Ensure kernel sizes are odd
-            assert kernel.size(0) % 2 == 1, "Kernel height must be odd"
-            assert kernel.size(1) % 2 == 1, "Kernel width must be odd"
-
-            # Perform 2D convolution using PyTorch's built-in function
-            # Convert to shape expected by conv2d: [batch, channels, height, width]
             input_reshaped = input_image.view(1, 1, input_image.size(0), input_image.size(1))
             kernel_reshaped = kernel.view(1, 1, kernel.size(0), kernel.size(1))

-            # Calculate padding size to maintain the same output size
             padding_h = kernel.size(0) // 2
             padding_w = kernel.size(1) // 2

-            # Perform convolution
             conv_result = torch.nn.functional.conv2d(
                 input_reshaped,
                 kernel_reshaped,
                 padding=(padding_h, padding_w)
             )

-            # Reshape back to original dimensions
             conv_result = conv_result.view(input_image.size(0), input_image.size(1))
-
-            # Apply ReLU activation
-            relu_result = torch.nn.functional.relu(conv_result)
-
-            # Apply HardSwish activation: x * ReLU6(x + 3) / 6
+            relu_result = torch.nn.functional.relu(conv_result)
             hardswish_result = relu_result * torch.nn.functional.relu6(relu_result + 3) / 6

             return hardswish_result

@@ -94,7 +81,7 @@ def generate_sample(self, dtype: torch.dtype = torch.float32) -> Dict[str, Any]:
         Returns:
             A test case dictionary
         """
-        h, w, kh, kw = (4, 4, 3, 3)  # Sample configuration (kernel dims must be odd)
+        h, w, kh, kw = (4, 4, 3, 3)
         return {
             "name": f"H={h}, W={w}, Kh={kh}, Kw={kw}",
             "height": h,

@@ -190,9 +177,6 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         Returns:
             Number of floating point operations
         """
-        # Conv2D FLOPS = 2 * H * W * Kh * Kw
-        # ReLU FLOPS = 0 (comparison operation only)
-        # HardSwish FLOPS = 5 * H * W (add, relu6, mul, div, mul)
         H = test_case["height"]
         W = test_case["width"]
         Kh = test_case["kernel_height"]
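For context, the FLOP estimate that the deleted comments spelled out (2*H*W*Kh*Kw for the convolution, 0 for ReLU, 5*H*W for HardSwish) works out as follows for the 4x4 sample with a 3x3 kernel; this sketch only reproduces those formulas and is not necessarily what get_flops now computes:

# Hypothetical reproduction of the FLOP estimate from the removed comments.
H, W, Kh, Kw = 4, 4, 3, 3           # the generate_sample configuration
conv_flops = 2 * H * W * Kh * Kw    # one multiply + one add per kernel tap
relu_flops = 0                      # comparison only
hardswish_flops = 5 * H * W         # add, relu6, mul, div, mul
total = conv_flops + relu_flops + hardswish_flops
print(total)                        # 288 + 0 + 80 = 368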
Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+---
+slug: "conv2d-relu-hardswish"
+title: "2D Convolution with ReLU and HardSwish"
+difficulty: "MEDIUM"
+author: "sarthak"
+tags: ["convolution", "activation-function", "fused"]
+parameters:
+  - name: "image"
+    type: "[VAR]"
+    pointer: "true"
+    const: "true"
+
+  - name: "kernel"
+    type: "[VAR]"
+    pointer: "true"
+    const: "true"
+
+  - name: "output"
+    type: "[VAR]"
+    pointer: "true"
+    const: "false"
+
+  - name: "H"
+    type: "size_t"
+    pointer: "false"
+    constant: "false"
+
+  - name: "W"
+    type: "size_t"
+    pointer: "false"
+    constant: "false"
+
+  - name: "Kh"
+    type: "size_t"
+    pointer: "false"
+    constant: "false"
+
+  - name: "Kw"
+    type: "size_t"
+    pointer: "false"
+    constant: "false"
+---
+
+Perform a 2D convolution followed by ReLU activation followed by HardSwish activation:
+
+1. **2D Convolution**:
+$$C[i][j] = \sum_{u=0}^{K_h-1} \sum_{v=0}^{K_w-1} I\left[i+u-\frac{K_h-1}{2}\right]\left[j+v-\frac{K_w-1}{2}\right] \cdot K[u][v]$$
+
+2. **ReLU Activation**:
+$$R[i][j] = \max(0, C[i][j])$$
+
+3. **HardSwish Activation**:
+$$O[i][j] = R[i][j] \cdot \frac{\text{ReLU6}(R[i][j] + 3)}{6}$$
+
+where $\text{ReLU6}(x) = \min(6, \max(0, x))$.
+
+## Input
+- `image` of size $H \times W$
+- `kernel` of size $K_h \times K_w$ (both dimensions must be odd)
+
+## Output
+- `output` of size $H \times W$
+
+## Notes:
+- Use zero padding at the boundaries where the kernel extends beyond the input image
+- Both kernel height $K_h$ and kernel width $K_w$ will be odd integers
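For reference, a direct, unoptimized evaluation of the three equations above can be written as follows (a minimal Python/PyTorch sketch for checking small cases; the repository's reference implementation uses torch.nn.functional.conv2d as shown in the preceding diff):

import torch

def conv2d_relu_hardswish_naive(image: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
    """Evaluate the three equations literally, with zero padding at the borders."""
    H, W = image.shape
    Kh, Kw = kernel.shape
    out = torch.zeros(H, W)
    for i in range(H):
        for j in range(W):
            c = 0.0
            for u in range(Kh):
                for v in range(Kw):
                    ii = i + u - (Kh - 1) // 2
                    jj = j + v - (Kw - 1) // 2
                    if 0 <= ii < H and 0 <= jj < W:  # zero padding outside the image
                        c += (image[ii, jj] * kernel[u, v]).item()
            r = max(c, 0.0)                                    # ReLU
            out[i, j] = r * min(max(r + 3.0, 0.0), 6.0) / 6.0  # HardSwish via ReLU6
    return out

print(conv2d_relu_hardswish_naive(torch.randn(4, 4), torch.randn(3, 3)))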

problems/edge-detect/problem.md

Lines changed: 0 additions & 2 deletions

@@ -55,5 +55,3 @@ This algorithm computes horizontal and vertical gradients, then combines them to
 - The input tensor is a single-channel grayscale image
 - Only compute gradients for interior pixels (ignore 1-pixel border)
 - Border pixels remain zero in the output
-- The output shows edge strength - higher values indicate stronger edges
-- This is a fundamental operation in computer vision and image analysis
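The gradient operator itself is not shown in this hunk; purely as an illustration, a central-difference gradient magnitude that leaves the 1-pixel border at zero (consistent with the notes above, but not necessarily the problem's exact definition) could look like:

import torch

def gradient_magnitude(image: torch.Tensor) -> torch.Tensor:
    """Central-difference gradients for interior pixels; border stays zero.
    Illustrative only -- the actual operator is defined in the problem statement."""
    out = torch.zeros_like(image)
    gx = image[1:-1, 2:] - image[1:-1, :-2]   # horizontal gradient
    gy = image[2:, 1:-1] - image[:-2, 1:-1]   # vertical gradient
    out[1:-1, 1:-1] = torch.sqrt(gx * gx + gy * gy)
    return out

print(gradient_magnitude(torch.rand(5, 5)))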
Lines changed: 3 additions & 17 deletions

@@ -16,24 +16,13 @@ def __init__(self):
     def reference_solution(self, A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, alpha: float) -> torch.Tensor:
         """
         PyTorch implementation of GEMM followed by element-wise multiplication followed by LeakyReLU.
-
-        Args:
-            A: First input matrix
-            B: Second input matrix
-            C: Element-wise multiplication matrix
-            alpha: LeakyReLU slope parameter
-
+
         Returns:
             Result of LeakyReLU(GEMM(A, B) * C)
         """
         with torch.no_grad(), torch.autocast("cuda", enabled=False, dtype=torch.float32):
-            # GEMM operation: A @ B
             gemm_result = torch.matmul(A, B)
-
-            # Element-wise multiplication with C
             multiply_result = gemm_result * C
-
-            # LeakyReLU activation
             leaky_relu_result = torch.nn.functional.leaky_relu(multiply_result, alpha)

             return leaky_relu_result

@@ -45,8 +34,6 @@ def generate_test_cases(self, dtype: torch.dtype) -> List[Dict[str, Any]]:
         Returns:
             List of test case dictionaries with varying matrix dimensions
         """
-        # Matrix dimensions: (M, K) × (K, N) = (M, N)
-        # dims represents (M, N, K)
         test_matrices = [
             {
                 "name": "512x512 x 512x512",

@@ -92,7 +79,7 @@ def generate_sample(self, dtype: torch.dtype = torch.float32) -> Dict[str, Any]:
         Returns:
             Dictionary containing the sample test case.
         """
-        m_dims = (4, 4, 4)  # M, N, K dimensions
+        m_dims = (4, 4, 4)
        alpha = 0.01
         return {
             "name": "4x4_square",

@@ -218,5 +205,4 @@ def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
             List containing the alpha value and dimensions M, N, K
         """
         M, N, K = test_case["dims"]
-        alpha = test_case["alpha"]
-        return [alpha, M, N, K]
+        return [M, N, K]
Lines changed: 13 additions & 16 deletions

@@ -1,21 +1,21 @@
 ---
 slug: "gemm-multiply-leakyrelu"
-title: "GEMM + Element-wise Multiply + LeakyReLU Fusion"
-difficulty: "HARD"
+title: "GEMM with Element-wise Multiply and LeakyReLU"
+difficulty: "MEDIUM"
 author: "sarthak"
-tags: ["gemm", "multiply", "leakyrelu", "fusion"]
+tags: ["matmul", "activation-function", "fused"]
 parameters:
-  - name: "matrix_a"
+  - name: "A"
     type: "[VAR]"
     pointer: "true"
     const: "true"

-  - name: "matrix_b"
+  - name: "B"
     type: "[VAR]"
     pointer: "true"
     const: "true"

-  - name: "matrix_c"
+  - name: "C"
     type: "[VAR]"
     pointer: "true"
     const: "true"

@@ -25,28 +25,28 @@ parameters:
     pointer: "false"
     constant: "false"

-  - name: "output_matrix"
+  - name: "output"
     type: "[VAR]"
     pointer: "true"
     const: "false"

-  - name: "m"
+  - name: "M"
     type: "size_t"
     pointer: "false"
     constant: "false"

-  - name: "n"
+  - name: "N"
     type: "size_t"
     pointer: "false"
     constant: "false"

-  - name: "k"
+  - name: "K"
     type: "size_t"
     pointer: "false"
     constant: "false"
 ---

-Perform fused GEMM (General Matrix Multiplication) followed by element-wise multiplication followed by LeakyReLU activation:
+Perform a GEMM (General Matrix Multiplication) followed by element-wise multiplication followed by LeakyReLU activation:

 $$
 O[i][j] = \text{LeakyReLU}\left(\left(\sum_{k=0}^{K-1} A[i][k] \cdot B[k][j]\right) \cdot C[i][j], \alpha\right)

@@ -63,13 +63,10 @@ This operation consists of three steps:
 - Matrix $A$ of size $M \times K$
 - Matrix $B$ of size $K \times N$
 - Matrix $C$ of size $M \times N$ (for element-wise multiplication)
-- Slope parameter $\alpha$ for LeakyReLU
+- $\alpha$ for LeakyReLU

 ## Output
 - Matrix $O$ of size $M \times N$

 ## Notes:
-- All matrices $A$, $B$, $C$, and $O$ are stored in row-major order
-- LeakyReLU allows small negative values to pass through, preventing dying neurons
-- The fusion of these operations can significantly reduce memory bandwidth requirements
-- Consider optimizing for different values of $\alpha$ (typically small positive values like 0.01)
+- All matrices $A$, $B$, $C$, and $O$ are stored in row-major order
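The formula maps directly onto the reference_solution shown earlier in this commit; a minimal PyTorch sketch of the computation (shapes follow the Input/Output sections, values are random):

import torch
import torch.nn.functional as F

M, N, K = 4, 4, 4
alpha = 0.01
A = torch.randn(M, K)
B = torch.randn(K, N)
C = torch.randn(M, N)

# GEMM, then element-wise multiply by C, then LeakyReLU with slope alpha
O = F.leaky_relu(torch.matmul(A, B) * C, negative_slope=alpha)
print(O.shape)  # torch.Size([4, 4])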

problems/histogram/problem.md

Lines changed: 1 addition & 3 deletions

@@ -52,6 +52,4 @@ This creates a frequency distribution showing how many pixels have each intensit
 ## Notes:
 - The input tensor contains integer pixel values in range [0, num_bins-1]
 - Each histogram bin counts pixels with that specific intensity value
-- The sum of all histogram bins equals the total number of pixels
-- Histograms are fundamental for image analysis, enhancement, and segmentation
-- Handle potential race conditions when multiple threads access the same bin
+- The sum of all histogram bins equals the total number of pixels
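For reference, a minimal PyTorch sketch of the counting these notes describe, assuming an integer-valued input and a num_bins parameter (names are illustrative, not this repo's API):

import torch

def histogram_reference(pixels: torch.Tensor, num_bins: int) -> torch.Tensor:
    """Count how many pixels fall into each intensity bin [0, num_bins-1]."""
    return torch.bincount(pixels.flatten().to(torch.int64), minlength=num_bins)

img = torch.randint(0, 8, (4, 4))
h = histogram_reference(img, num_bins=8)
print(h, h.sum().item() == img.numel())  # bins sum to the pixel count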
Lines changed: 3 additions & 5 deletions

@@ -6,7 +6,7 @@


 class matmul_sigmoid_sum(Problem):
-    """Matrix multiplication followed by sigmoid activation followed by summation fusion problem."""
+    """Matrix multiplication followed by sigmoid activation followed by summation problem."""

     def __init__(self):
         super().__init__(

@@ -31,13 +31,11 @@ def reference_solution(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:

     def generate_test_cases(self, dtype: torch.dtype) -> List[Dict[str, Any]]:
         """
-        Generate test cases for matmul-sigmoid-sum fusion.
+        Generate test cases for matmul-sigmoid-sum.

         Returns:
             List of test case dictionaries with varying matrix dimensions
         """
-        # Matrix dimensions: (M, K) × (K, N) = (M, N)
-        # dims represents (M, N, K)
         test_matrices = [
             {
                 "name": "512x512 x 512x512",

@@ -76,7 +74,7 @@ def generate_sample(self, dtype: torch.dtype = torch.float32) -> Dict[str, Any]:
         Returns:
             Dictionary containing the sample test case.
         """
-        m_dims = (4, 4, 4)  # M, N, K dimensions
+        m_dims = (4, 4, 4)
         return {
             "name": "4x4_square",
             "dims": m_dims,
Lines changed: 12 additions & 14 deletions

@@ -1,42 +1,42 @@
 ---
 slug: "matmul-sigmoid-sum"
-title: "Matrix Multiplication + Sigmoid + Sum Fusion"
-difficulty: "HARD"
+title: "Matrix Multiplication with Sigmoid and Sum"
+difficulty: "MEDIUM"
 author: "sarthak"
-tags: ["matmul", "sigmoid", "sum", "fusion"]
+tags: ["matmul", "reduction", "fused"]
 parameters:
-  - name: "input_a"
+  - name: "A"
     type: "[VAR]"
     pointer: "true"
     const: "true"

-  - name: "input_b"
+  - name: "B"
     type: "[VAR]"
     pointer: "true"
     const: "true"

-  - name: "output_result"
+  - name: "output"
     type: "[VAR]"
     pointer: "true"
     const: "false"

-  - name: "m"
+  - name: "M"
     type: "size_t"
     pointer: "false"
     constant: "false"

-  - name: "n"
+  - name: "N"
     type: "size_t"
     pointer: "false"
     constant: "false"

-  - name: "k"
+  - name: "K"
     type: "size_t"
     pointer: "false"
     constant: "false"
 ---

-Perform fused matrix multiplication followed by sigmoid activation followed by summation:
+Perform a matrix multiplication followed by sigmoid activation followed by summation:

 $$
 \text{result} = \sum_{i=0}^{M-1} \sum_{j=0}^{N-1} \sigma\left(\sum_{k=0}^{K-1} A[i][k] \cdot B[k][j]\right)

@@ -54,9 +54,7 @@ This operation consists of three steps:
 - Matrix $B$ of size $K \times N$

 ## Output
-- Scalar value representing the sum of sigmoid(A * B)
+- Scalar value `output` representing the sum of $\sigma(AB)$

 ## Notes:
-- All matrices $A$ and $B$ are stored in row-major order
-- The fusion of these operations can provide significant performance benefits by reducing memory bandwidth
-- Consider optimizing memory access patterns and reducing intermediate results
+- The matrices $A$ and $B$ are stored in row-major order
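For reference, the whole operation reduces to one line of PyTorch, mirroring the formula above (a minimal sketch with random inputs):

import torch

M, N, K = 4, 4, 4
A = torch.randn(M, K)
B = torch.randn(K, N)

# matmul, then element-wise sigmoid, then reduce to a single scalar
result = torch.sigmoid(A @ B).sum()
print(result.item())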
Lines changed: 2 additions & 3 deletions

@@ -164,8 +164,8 @@ def get_function_signature(self) -> Dict[str, Any]:
             "argtypes": [
                 ctypes.POINTER(ctypes.c_float),  # matrix_a
                 ctypes.POINTER(ctypes.c_float),  # matrix_b
-                ctypes.POINTER(ctypes.c_float),  # output_matrix
                 ctypes.c_float,  # scale factor
+                ctypes.POINTER(ctypes.c_float),  # output_matrix
                 ctypes.c_size_t,  # M (rows in A)
                 ctypes.c_size_t,  # N (columns in B)
                 ctypes.c_size_t  # K (columns in A, rows in B)

@@ -203,5 +203,4 @@ def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
             List containing the scale factor and dimensions M, N, K
         """
         M, N, K = test_case["dims"]
-        scale = test_case["scale"]
-        return [scale, M, N, K]
+        return [M, N, K]
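The argtypes reordering above matters because ctypes passes arguments strictly by position, so the Python-side argtypes list must match the compiled kernel's parameter order exactly. A minimal sketch, using a hypothetical libsolution.so and entry point solution (not this repository's actual binding code):

import ctypes

# Hypothetical shared library and function name, for illustration only.
lib = ctypes.CDLL("./libsolution.so")
lib.solution.argtypes = [
    ctypes.POINTER(ctypes.c_float),  # matrix_a
    ctypes.POINTER(ctypes.c_float),  # matrix_b
    ctypes.c_float,                  # scale factor (now passed before the output pointer)
    ctypes.POINTER(ctypes.c_float),  # output_matrix
    ctypes.c_size_t,                 # M
    ctypes.c_size_t,                 # N
    ctypes.c_size_t,                 # K
]
lib.solution.restype = None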
