
Commit 4e838fd

add more fused problems (#36)
* add problems
* fix
1 parent c0194df commit 4e838fd

File tree: 12 files changed (+118, -169 lines changed)


problems/box-blur/problem.md

Lines changed: 0 additions & 3 deletions

@@ -52,6 +52,3 @@ This creates a blurring effect by smoothing out pixel values. The larger the ker
 ## Notes:
 - The input tensor is a single-channel grayscale image
 - Handle edge cases by only averaging available pixels (no padding)
-- For pixels near the border, use a smaller effective kernel
-- The kernel size should be odd (3x3, 5x5, 7x7, etc.)
-- This is a fundamental operation in computer graphics and image processing
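For reference, a minimal PyTorch sketch of the box blur these notes describe, averaging only the pixels that fall inside the image (names are illustrative, not this repo's API):

import torch

def box_blur_reference(image: torch.Tensor, kernel_size: int = 3) -> torch.Tensor:
    """Average each pixel over its kernel_size x kernel_size neighborhood,
    clipping the window at the borders (no padding)."""
    h, w = image.shape
    r = kernel_size // 2
    out = torch.empty_like(image)
    for i in range(h):
        for j in range(w):
            i0, i1 = max(0, i - r), min(h, i + r + 1)
            j0, j1 = max(0, j - r), min(w, j + r + 1)
            out[i, j] = image[i0:i1, j0:j1].mean()
    return out

# Example: blur a 4x4 ramp image with a 3x3 window
img = torch.arange(16, dtype=torch.float32).reshape(4, 4)
print(box_blur_reference(img, kernel_size=3))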
Lines changed: 2 additions & 18 deletions

@@ -25,33 +25,20 @@ def reference_solution(self, input_image: torch.Tensor, kernel: torch.Tensor) ->
             Result of conv2d -> ReLU -> HardSwish fusion
         """
         with torch.no_grad(), torch.autocast("cuda", enabled=False, dtype=torch.float32):
-            # Ensure kernel sizes are odd
-            assert kernel.size(0) % 2 == 1, "Kernel height must be odd"
-            assert kernel.size(1) % 2 == 1, "Kernel width must be odd"
-
-            # Perform 2D convolution using PyTorch's built-in function
-            # Convert to shape expected by conv2d: [batch, channels, height, width]
             input_reshaped = input_image.view(1, 1, input_image.size(0), input_image.size(1))
             kernel_reshaped = kernel.view(1, 1, kernel.size(0), kernel.size(1))

-            # Calculate padding size to maintain the same output size
             padding_h = kernel.size(0) // 2
             padding_w = kernel.size(1) // 2

-            # Perform convolution
             conv_result = torch.nn.functional.conv2d(
                 input_reshaped,
                 kernel_reshaped,
                 padding=(padding_h, padding_w)
             )

-            # Reshape back to original dimensions
             conv_result = conv_result.view(input_image.size(0), input_image.size(1))
-
-            # Apply ReLU activation
-            relu_result = torch.nn.functional.relu(conv_result)
-
-            # Apply HardSwish activation: x * ReLU6(x + 3) / 6
+            relu_result = torch.nn.functional.relu(conv_result)
             hardswish_result = relu_result * torch.nn.functional.relu6(relu_result + 3) / 6

             return hardswish_result

@@ -94,7 +81,7 @@ def generate_sample(self, dtype: torch.dtype = torch.float32) -> Dict[str, Any]:
         Returns:
             A test case dictionary
         """
-        h, w, kh, kw = (4, 4, 3, 3)  # Sample configuration (kernel dims must be odd)
+        h, w, kh, kw = (4, 4, 3, 3)
         return {
             "name": f"H={h}, W={w}, Kh={kh}, Kw={kw}",
             "height": h,

@@ -190,9 +177,6 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         Returns:
             Number of floating point operations
         """
-        # Conv2D FLOPS = 2 * H * W * Kh * Kw
-        # ReLU FLOPS = 0 (comparison operation only)
-        # HardSwish FLOPS = 5 * H * W (add, relu6, mul, div, mul)
         H = test_case["height"]
         W = test_case["width"]
         Kh = test_case["kernel_height"]
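For context, the FLOP estimate that the deleted comments spelled out (2*H*W*Kh*Kw for the convolution, 0 for ReLU, 5*H*W for HardSwish) works out as follows for the 4x4 sample with a 3x3 kernel; this sketch only reproduces those formulas and is not necessarily what get_flops now computes:

# Hypothetical reproduction of the FLOP estimate from the removed comments.
H, W, Kh, Kw = 4, 4, 3, 3           # the generate_sample configuration
conv_flops = 2 * H * W * Kh * Kw    # one multiply + one add per kernel tap
relu_flops = 0                      # comparison only
hardswish_flops = 5 * H * W         # add, relu6, mul, div, mul
total = conv_flops + relu_flops + hardswish_flops
print(total)                        # 288 + 0 + 80 = 368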
Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+---
+slug: "conv2d-relu-hardswish"
+title: "2D Convolution with ReLU and HardSwish"
+difficulty: "MEDIUM"
+author: "sarthak"
+tags: ["convolution", "activation-function", "fused"]
+parameters:
+  - name: "image"
+    type: "[VAR]"
+    pointer: "true"
+    const: "true"
+
+  - name: "kernel"
+    type: "[VAR]"
+    pointer: "true"
+    const: "true"
+
+  - name: "output"
+    type: "[VAR]"
+    pointer: "true"
+    const: "false"
+
+  - name: "H"
+    type: "size_t"
+    pointer: "false"
+    constant: "false"
+
+  - name: "W"
+    type: "size_t"
+    pointer: "false"
+    constant: "false"
+
+  - name: "Kh"
+    type: "size_t"
+    pointer: "false"
+    constant: "false"
+
+  - name: "Kw"
+    type: "size_t"
+    pointer: "false"
+    constant: "false"
+---
+
+Perform a 2D convolution followed by ReLU activation followed by HardSwish activation:
+
+1. **2D Convolution**:
+$$C[i][j] = \sum_{u=0}^{K_h-1} \sum_{v=0}^{K_w-1} I\left[i+u-\frac{K_h-1}{2}\right]\left[j+v-\frac{K_w-1}{2}\right] \cdot K[u][v]$$
+
+2. **ReLU Activation**:
+$$R[i][j] = \max(0, C[i][j])$$
+
+3. **HardSwish Activation**:
+$$O[i][j] = R[i][j] \cdot \frac{\text{ReLU6}(R[i][j] + 3)}{6}$$
+
+where $\text{ReLU6}(x) = \min(6, \max(0, x))$.
+
+## Input
+- `image` of size $H \times W$
+- `kernel` of size $K_h \times K_w$ (both dimensions must be odd)
+
+## Output
+- `output` of size $H \times W$
+
+## Notes:
+- Use zero padding at the boundaries where the kernel extends beyond the input image
+- Both kernel height $K_h$ and kernel width $K_w$ will be odd integers
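For reference, a direct, unoptimized evaluation of the three equations above can be written as follows (a minimal Python/PyTorch sketch for checking small cases; the repository's reference implementation uses torch.nn.functional.conv2d as shown in the preceding diff):

import torch

def conv2d_relu_hardswish_naive(image: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
    """Evaluate the three equations literally, with zero padding at the borders."""
    H, W = image.shape
    Kh, Kw = kernel.shape
    out = torch.zeros(H, W)
    for i in range(H):
        for j in range(W):
            c = 0.0
            for u in range(Kh):
                for v in range(Kw):
                    ii = i + u - (Kh - 1) // 2
                    jj = j + v - (Kw - 1) // 2
                    if 0 <= ii < H and 0 <= jj < W:  # zero padding outside the image
                        c += (image[ii, jj] * kernel[u, v]).item()
            r = max(c, 0.0)                                    # ReLU
            out[i, j] = r * min(max(r + 3.0, 0.0), 6.0) / 6.0  # HardSwish via ReLU6
    return out

print(conv2d_relu_hardswish_naive(torch.randn(4, 4), torch.randn(3, 3)))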

problems/edge-detect/problem.md

Lines changed: 0 additions & 2 deletions

@@ -55,5 +55,3 @@ This algorithm computes horizontal and vertical gradients, then combines them to
 - The input tensor is a single-channel grayscale image
 - Only compute gradients for interior pixels (ignore 1-pixel border)
 - Border pixels remain zero in the output
-- The output shows edge strength - higher values indicate stronger edges
-- This is a fundamental operation in computer vision and image analysis
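The gradient operator itself is not shown in this hunk; purely as an illustration, a central-difference gradient magnitude that leaves the 1-pixel border at zero (consistent with the notes above, but not necessarily the problem's exact definition) could look like:

import torch

def gradient_magnitude(image: torch.Tensor) -> torch.Tensor:
    """Central-difference gradients for interior pixels; border stays zero.
    Illustrative only -- the actual operator is defined in the problem statement."""
    out = torch.zeros_like(image)
    gx = image[1:-1, 2:] - image[1:-1, :-2]   # horizontal gradient
    gy = image[2:, 1:-1] - image[:-2, 1:-1]   # vertical gradient
    out[1:-1, 1:-1] = torch.sqrt(gx * gx + gy * gy)
    return out

print(gradient_magnitude(torch.rand(5, 5)))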
Lines changed: 3 additions & 17 deletions

@@ -16,24 +16,13 @@ def __init__(self):
     def reference_solution(self, A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, alpha: float) -> torch.Tensor:
         """
         PyTorch implementation of GEMM followed by element-wise multiplication followed by LeakyReLU.
-
-        Args:
-            A: First input matrix
-            B: Second input matrix
-            C: Element-wise multiplication matrix
-            alpha: LeakyReLU slope parameter
-
+
         Returns:
             Result of LeakyReLU(GEMM(A, B) * C)
         """
         with torch.no_grad(), torch.autocast("cuda", enabled=False, dtype=torch.float32):
-            # GEMM operation: A @ B
             gemm_result = torch.matmul(A, B)
-
-            # Element-wise multiplication with C
             multiply_result = gemm_result * C
-
-            # LeakyReLU activation
             leaky_relu_result = torch.nn.functional.leaky_relu(multiply_result, alpha)

             return leaky_relu_result

@@ -45,8 +34,6 @@ def generate_test_cases(self, dtype: torch.dtype) -> List[Dict[str, Any]]:
         Returns:
             List of test case dictionaries with varying matrix dimensions
         """
-        # Matrix dimensions: (M, K) × (K, N) = (M, N)
-        # dims represents (M, N, K)
         test_matrices = [
             {
                 "name": "512x512 x 512x512",

@@ -92,7 +79,7 @@ def generate_sample(self, dtype: torch.dtype = torch.float32) -> Dict[str, Any]:
         Returns:
             Dictionary containing the sample test case.
         """
-        m_dims = (4, 4, 4)  # M, N, K dimensions
+        m_dims = (4, 4, 4)
        alpha = 0.01
         return {
             "name": "4x4_square",

@@ -218,5 +205,4 @@ def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
             List containing the alpha value and dimensions M, N, K
         """
         M, N, K = test_case["dims"]
-        alpha = test_case["alpha"]
-        return [alpha, M, N, K]
+        return [M, N, K]
Lines changed: 13 additions & 16 deletions

@@ -1,21 +1,21 @@
 ---
 slug: "gemm-multiply-leakyrelu"
-title: "GEMM + Element-wise Multiply + LeakyReLU Fusion"
-difficulty: "HARD"
+title: "GEMM with Element-wise Multiply and LeakyReLU"
+difficulty: "MEDIUM"
 author: "sarthak"
-tags: ["gemm", "multiply", "leakyrelu", "fusion"]
+tags: ["matmul", "activation-function", "fused"]
 parameters:
-  - name: "matrix_a"
+  - name: "A"
     type: "[VAR]"
     pointer: "true"
     const: "true"

-  - name: "matrix_b"
+  - name: "B"
     type: "[VAR]"
     pointer: "true"
     const: "true"

-  - name: "matrix_c"
+  - name: "C"
     type: "[VAR]"
     pointer: "true"
     const: "true"

@@ -25,28 +25,28 @@ parameters:
     pointer: "false"
     constant: "false"

-  - name: "output_matrix"
+  - name: "output"
     type: "[VAR]"
     pointer: "true"
     const: "false"

-  - name: "m"
+  - name: "M"
     type: "size_t"
     pointer: "false"
     constant: "false"

-  - name: "n"
+  - name: "N"
     type: "size_t"
     pointer: "false"
     constant: "false"

-  - name: "k"
+  - name: "K"
     type: "size_t"
     pointer: "false"
     constant: "false"
 ---

-Perform fused GEMM (General Matrix Multiplication) followed by element-wise multiplication followed by LeakyReLU activation:
+Perform a GEMM (General Matrix Multiplication) followed by element-wise multiplication followed by LeakyReLU activation:

 $$
 O[i][j] = \text{LeakyReLU}\left(\left(\sum_{k=0}^{K-1} A[i][k] \cdot B[k][j]\right) \cdot C[i][j], \alpha\right)

@@ -63,13 +63,10 @@ This operation consists of three steps:
 - Matrix $A$ of size $M \times K$
 - Matrix $B$ of size $K \times N$
 - Matrix $C$ of size $M \times N$ (for element-wise multiplication)
-- Slope parameter $\alpha$ for LeakyReLU
+- $\alpha$ for LeakyReLU

 ## Output
 - Matrix $O$ of size $M \times N$

 ## Notes:
-- All matrices $A$, $B$, $C$, and $O$ are stored in row-major order
-- LeakyReLU allows small negative values to pass through, preventing dying neurons
-- The fusion of these operations can significantly reduce memory bandwidth requirements
-- Consider optimizing for different values of $\alpha$ (typically small positive values like 0.01)
+- All matrices $A$, $B$, $C$, and $O$ are stored in row-major order
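The formula maps directly onto the reference_solution shown earlier in this commit; a minimal PyTorch sketch of the computation (shapes follow the Input/Output sections, values are random):

import torch
import torch.nn.functional as F

M, N, K = 4, 4, 4
alpha = 0.01
A = torch.randn(M, K)
B = torch.randn(K, N)
C = torch.randn(M, N)

# GEMM, then element-wise multiply by C, then LeakyReLU with slope alpha
O = F.leaky_relu(torch.matmul(A, B) * C, negative_slope=alpha)
print(O.shape)  # torch.Size([4, 4])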

problems/histogram/problem.md

Lines changed: 1 addition & 3 deletions

@@ -52,6 +52,4 @@ This creates a frequency distribution showing how many pixels have each intensit
 ## Notes:
 - The input tensor contains integer pixel values in range [0, num_bins-1]
 - Each histogram bin counts pixels with that specific intensity value
-- The sum of all histogram bins equals the total number of pixels
-- Histograms are fundamental for image analysis, enhancement, and segmentation
-- Handle potential race conditions when multiple threads access the same bin
+- The sum of all histogram bins equals the total number of pixels
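For reference, a minimal PyTorch sketch of the counting these notes describe, assuming an integer-valued input and a num_bins parameter (names are illustrative, not this repo's API):

import torch

def histogram_reference(pixels: torch.Tensor, num_bins: int) -> torch.Tensor:
    """Count how many pixels fall into each intensity bin [0, num_bins-1]."""
    return torch.bincount(pixels.flatten().to(torch.int64), minlength=num_bins)

img = torch.randint(0, 8, (4, 4))
h = histogram_reference(img, num_bins=8)
print(h, h.sum().item() == img.numel())  # bins sum to the pixel count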
Lines changed: 3 additions & 5 deletions

@@ -6,7 +6,7 @@


 class matmul_sigmoid_sum(Problem):
-    """Matrix multiplication followed by sigmoid activation followed by summation fusion problem."""
+    """Matrix multiplication followed by sigmoid activation followed by summation problem."""

     def __init__(self):
         super().__init__(

@@ -31,13 +31,11 @@ def reference_solution(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:

     def generate_test_cases(self, dtype: torch.dtype) -> List[Dict[str, Any]]:
         """
-        Generate test cases for matmul-sigmoid-sum fusion.
+        Generate test cases for matmul-sigmoid-sum.

         Returns:
             List of test case dictionaries with varying matrix dimensions
         """
-        # Matrix dimensions: (M, K) × (K, N) = (M, N)
-        # dims represents (M, N, K)
         test_matrices = [
             {
                 "name": "512x512 x 512x512",

@@ -76,7 +74,7 @@ def generate_sample(self, dtype: torch.dtype = torch.float32) -> Dict[str, Any]:
         Returns:
             Dictionary containing the sample test case.
         """
-        m_dims = (4, 4, 4)  # M, N, K dimensions
+        m_dims = (4, 4, 4)
         return {
             "name": "4x4_square",
             "dims": m_dims,
Lines changed: 12 additions & 14 deletions

@@ -1,42 +1,42 @@
 ---
 slug: "matmul-sigmoid-sum"
-title: "Matrix Multiplication + Sigmoid + Sum Fusion"
-difficulty: "HARD"
+title: "Matrix Multiplication with Sigmoid and Sum"
+difficulty: "MEDIUM"
 author: "sarthak"
-tags: ["matmul", "sigmoid", "sum", "fusion"]
+tags: ["matmul", "reduction", "fused"]
 parameters:
-  - name: "input_a"
+  - name: "A"
     type: "[VAR]"
     pointer: "true"
     const: "true"

-  - name: "input_b"
+  - name: "B"
     type: "[VAR]"
     pointer: "true"
     const: "true"

-  - name: "output_result"
+  - name: "output"
     type: "[VAR]"
     pointer: "true"
     const: "false"

-  - name: "m"
+  - name: "M"
     type: "size_t"
     pointer: "false"
     constant: "false"

-  - name: "n"
+  - name: "N"
     type: "size_t"
     pointer: "false"
     constant: "false"

-  - name: "k"
+  - name: "K"
     type: "size_t"
     pointer: "false"
     constant: "false"
 ---

-Perform fused matrix multiplication followed by sigmoid activation followed by summation:
+Perform a matrix multiplication followed by sigmoid activation followed by summation:

 $$
 \text{result} = \sum_{i=0}^{M-1} \sum_{j=0}^{N-1} \sigma\left(\sum_{k=0}^{K-1} A[i][k] \cdot B[k][j]\right)

@@ -54,9 +54,7 @@ This operation consists of three steps:
 - Matrix $B$ of size $K \times N$

 ## Output
-- Scalar value representing the sum of sigmoid(A * B)
+- Scalar value `output` representing the sum of $\sigma(AB)$

 ## Notes:
-- All matrices $A$ and $B$ are stored in row-major order
-- The fusion of these operations can provide significant performance benefits by reducing memory bandwidth
-- Consider optimizing memory access patterns and reducing intermediate results
+- The matrices $A$ and $B$ are stored in row-major order
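For reference, the whole operation reduces to one line of PyTorch, mirroring the formula above (a minimal sketch with random inputs):

import torch

M, N, K = 4, 4, 4
A = torch.randn(M, K)
B = torch.randn(K, N)

# matmul, then element-wise sigmoid, then reduce to a single scalar
result = torch.sigmoid(A @ B).sum()
print(result.item())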
Lines changed: 2 additions & 3 deletions

@@ -164,8 +164,8 @@ def get_function_signature(self) -> Dict[str, Any]:
             "argtypes": [
                 ctypes.POINTER(ctypes.c_float),  # matrix_a
                 ctypes.POINTER(ctypes.c_float),  # matrix_b
-                ctypes.POINTER(ctypes.c_float),  # output_matrix
                 ctypes.c_float,  # scale factor
+                ctypes.POINTER(ctypes.c_float),  # output_matrix
                 ctypes.c_size_t,  # M (rows in A)
                 ctypes.c_size_t,  # N (columns in B)
                 ctypes.c_size_t  # K (columns in A, rows in B)

@@ -203,5 +203,4 @@ def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
             List containing the scale factor and dimensions M, N, K
         """
         M, N, K = test_case["dims"]
-        scale = test_case["scale"]
-        return [scale, M, N, K]
+        return [M, N, K]
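The argtypes reordering above matters because ctypes passes arguments strictly by position, so the Python-side argtypes list must match the compiled kernel's parameter order exactly. A minimal sketch, using a hypothetical libsolution.so and entry point solution (not this repository's actual binding code):

import ctypes

# Hypothetical shared library and function name, for illustration only.
lib = ctypes.CDLL("./libsolution.so")
lib.solution.argtypes = [
    ctypes.POINTER(ctypes.c_float),  # matrix_a
    ctypes.POINTER(ctypes.c_float),  # matrix_b
    ctypes.c_float,                  # scale factor (now passed before the output pointer)
    ctypes.POINTER(ctypes.c_float),  # output_matrix
    ctypes.c_size_t,                 # M
    ctypes.c_size_t,                 # N
    ctypes.c_size_t,                 # K
]
lib.solution.restype = None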
