
Commit 8d30ca2

FlashTP Benchmark, Documentation Updates, and Release Prep (#153)
* Added FlashTP benchmarking.
* Linted.
* Updated documentation with FlashTP benchmarking instructions.
* Minor changes to ensure that docs build.
* Updated changelog for release prep.
* Linted.
1 parent 4df7dd6 commit 8d30ca2

File tree

9 files changed: +153 −42 lines changed

CHANGELOG.md

Lines changed: 33 additions & 0 deletions

@@ -1,5 +1,38 @@
 ## Latest Changes
 
+### v0.4.0 (2025-08-14)
+This release adds a benchmark against
+FlashTP, exposes weight reordering functions
+for e3nn compatibility, adds input validation,
+and provides rudimentary support for PyTorch
+automatic mixed precision (AMP). Our fused,
+JIT-compiled kernels exhibit up to 2x speedup
+over FlashTP!
+
+**Added**:
+1. Both `TensorProduct` and `TensorProductConv`
+now have the methods `reorder_weights_from_e3nn`
+and `reorder_weights_to_e3nn`. These convert
+the buffer of trainable weights from / to e3nn's
+canonical ordering. See the API page for usage
+details.
+2. If you have FlashTP installed, see our
+documentation ("Tests and Benchmarks" page)
+to benchmark FlashTP against OpenEquivariance.
+3. Tensor product inputs with incorrect sizes or
+datatypes now trigger clear errors in advance of
+execution.
+4. OpenEquivariance now has some support for
+automatic mixed precision (AMP), but only if
+`TensorProduct` / `TensorProductConv` objects
+are constructed with `float32` precision for
+both `irrep_dtype` and `weight_dtype`.
+
+**Fixed / Enhanced**:
+1. Added additional fake functions to remove
+warnings from TorchBind.
+2. Removed bloat from benchmarking code.
+
 ### v0.3.0 (2025-06-22)
 This release includes bugfixes and new opaque operations that
 compose with `torch.compile`

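The new reordering methods lend themselves to a short illustration. A minimal sketch, assuming a constructed `TensorProduct` named `tp` and an e3nn-ordered weight tensor `w_e3nn` (both placeholders; only the two method names come from the changelog entry above, and whether they mutate the internal buffer or return a new tensor is documented on the API page, not here):

    # Hypothetical sketch: `tp` and `w_e3nn` are placeholders, not
    # confirmed API usage from this commit.
    w_oeq = tp.reorder_weights_from_e3nn(w_e3nn)   # e3nn canonical order -> OEQ order
    w_back = tp.reorder_weights_to_e3nn(w_oeq)     # OEQ order -> e3nn canonical order
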
docs/supported_ops.rst

Lines changed: 4 additions & 1 deletion

@@ -117,4 +117,7 @@ toplevel. You can use our implementation by running
 
 .. code-block::
 
-from openequivariance.implementations.symmetric_contraction import SymmetricContraction as OEQSymmetricContraction
+   from openequivariance.implementations.symmetric_contraction import SymmetricContraction as OEQSymmetricContraction
+
+Some GitHub users report weak performance for the
+symmetric contraction backward pass; your mileage may vary.

docs/tests_and_benchmarks.rst

Lines changed: 10 additions & 0 deletions

@@ -67,6 +67,16 @@ For GPUs besides the NVIDIA A100, the roofline slope / peak will be incorrect.
 The plots for the convolution fusion experiments also require a GPU
 with a minimum of 40GB of memory.
 
+We recently added a benchmark against
+`FlashTP <https://github.com/SNU-ARC/flashTP>`_. To replicate it
+on your system, install FlashTP via ``pip`` and run
+
+.. code-block:: bash
+
+   python tests/benchmark.py -o outputs/conv conv --plot --data data/molecular_structures -i cue_unfused oeq_scattersum flashtp cue_fused oeq_det oeq_atomic
+
+OpenEquivariance exhibits up to 2x speedup over FlashTP's fused kernels.
+
 List of GPUs Tested
 --------------------------------
 OpenEquivariance has been tested successfully on the following GPUs. Submit a pull
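The docs hunk above leaves the exact ``pip`` invocation to the reader. One plausible form, assuming FlashTP installs straight from the linked repository (an assumption; defer to FlashTP's own README for the supported install path):

    # Assumed install command; not stated anywhere in this commit.
    pip install git+https://github.com/SNU-ARC/flashTP.git
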

openequivariance/benchmark/plotting/plot_convolution.py

Lines changed: 3 additions & 8 deletions

@@ -19,13 +19,8 @@ def plot_convolution(data_folder):
     data_folder = pathlib.Path(data_folder)
     benchmarks, metadata = load_benchmarks(data_folder)
 
-    implementations = [
-        "CUEConvolution",
-        "CUEConvolutionFused",
-        "LoopUnrollConvScatterSum",
-        "LoopUnrollConvAtomic",
-        "LoopUnrollConvDeterministic",
-    ]
+    implementations = metadata["implementations"]
+    assert "CUEConvolution" in implementations
 
     graphs = ["1drf_radius6.0", "covid_spike_radius3.0", "carbon_lattice_radius6.0"]
     graph_lmap = {
@@ -81,7 +76,7 @@ def plot_convolution(data_folder):
         rotate_xlabels=True,
         colormap=colormap,
         hatchmap=hatchmap,
-        group_spacing=6.0,
+        group_spacing=7.0,
     )
 
     axes[i][j].set_xlabel(dtype_labelmap[dtype])

openequivariance/benchmark/plotting/plotting_utils.py

Lines changed: 2 additions & 1 deletion

@@ -392,8 +392,9 @@ def set_size(w, h, ax=None):
     "CUEConvolutionFused": "cuE-fused",
     "LoopUnrollConvDeterministic": "fast-fused-det",
     "LoopUnrollConvAtomic": "fast-fused-atomic",
+    "FlashTPConv": "flashtp",
 }
-colormap = {"e3nn": "lightblue", "cuE": "orange", "ours": "g"}
+colormap = {"e3nn": "lightblue", "cuE": "orange", "ours": "g", "flashtp": "purple"}
 
 for key in ["fast-scattersum", "fast-fused-det", "fast-fused-atomic"]:
     colormap[key] = colormap["ours"]
openequivariance/implementations/convolution/FlashTPConv.py

Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
+__all__ = [
+    "FlashTPConv",
+]
+
+import torch
+import numpy as np
+from openequivariance.implementations.convolution.ConvolutionBase import ConvolutionBase
+from openequivariance.implementations.utils import oeq_to_torch_dtype
+
+
+class FlashTPConv(ConvolutionBase):
+    def __init__(self, config, *, idx_dtype=np.int64, torch_op=True):
+        super().__init__(config, idx_dtype=idx_dtype, torch_op=torch_op)
+        from flashTP_e3nn import uvu_TP
+
+        instructions = [
+            (
+                inst.i_in1,
+                inst.i_in2,
+                inst.i_out,
+                inst.connection_mode,
+                inst.has_weight,
+                inst.path_weight,
+            )
+            for inst in config.instructions
+        ]
+
+        self.internal = uvu_TP(
+            config.irreps_in1,
+            config.irreps_in2,
+            config.irreps_out,
+            instructions,
+            device="cuda",
+            dtype=oeq_to_torch_dtype(config.irrep_dtype),
+        )
+
+    def forward(self, L1_in, L2_in, weights, rows, cols, transpose_perm=None):
+        return self.internal(
+            L1_in, L2_in, weights, rows.to(torch.int), cols.to(torch.int)
+        )
+
+    @staticmethod
+    def name():
+        return "FlashTPConv"

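The list comprehension in `__init__` flattens e3nn-style `Instruction` objects into the plain tuples FlashTP's `uvu_TP` consumes. A standalone sketch of the same conversion using e3nn directly (assumes e3nn is installed; the irreps strings are arbitrary examples, not taken from this commit):

    # Illustration only: e3nn's o3.TensorProduct is not part of this commit.
    from e3nn import o3

    tp = o3.TensorProduct(
        "2x0e",                    # irreps_in1
        "1x0e",                    # irreps_in2
        "2x0e",                    # irreps_out
        [(0, 0, 0, "uvu", True)],  # one channelwise ("uvu") path
    )
    tuples = [
        (i.i_in1, i.i_in2, i.i_out, i.connection_mode, i.has_weight, i.path_weight)
        for i in tp.instructions
    ]
    # Each element is a tuple like (0, 0, 0, 'uvu', True, <path weight>).
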
openequivariance/implementations/dtype_enum.py

Lines changed: 22 additions & 25 deletions

@@ -1,5 +1,4 @@
 from enum import IntEnum
-from typing import Mapping
 from types import MappingProxyType
 import numpy as np
 import torch
@@ -13,33 +12,31 @@ class DTypeEnum(IntEnum):
     UINT8 = 5
 
 
-dtype_to_enum: Mapping[torch.dtype | type[np.generic] | np.dtype, DTypeEnum] = (
-    MappingProxyType(
-        {
-            torch.float32: DTypeEnum.FLOAT32,
-            torch.float64: DTypeEnum.FLOAT64,
-            torch.int32: DTypeEnum.INT32,
-            torch.int64: DTypeEnum.INT64,
-            torch.uint8: DTypeEnum.UINT8,
-            # torch
-            np.float32: DTypeEnum.FLOAT32,
-            np.float64: DTypeEnum.FLOAT64,
-            np.int32: DTypeEnum.INT32,
-            np.int64: DTypeEnum.INT64,
-            np.uint8: DTypeEnum.UINT8,
-            # numpy generic
-            np.dtype(np.float32): DTypeEnum.FLOAT32,
-            np.dtype(np.float64): DTypeEnum.FLOAT64,
-            np.dtype(np.int32): DTypeEnum.INT32,
-            np.dtype(np.int64): DTypeEnum.INT64,
-            np.dtype(np.uint8): DTypeEnum.UINT8,
-            # numpy dtype
-        }
-    )
+dtype_to_enum = MappingProxyType(
+    {
+        torch.float32: DTypeEnum.FLOAT32,
+        torch.float64: DTypeEnum.FLOAT64,
+        torch.int32: DTypeEnum.INT32,
+        torch.int64: DTypeEnum.INT64,
+        torch.uint8: DTypeEnum.UINT8,
+        # torch
+        np.float32: DTypeEnum.FLOAT32,
+        np.float64: DTypeEnum.FLOAT64,
+        np.int32: DTypeEnum.INT32,
+        np.int64: DTypeEnum.INT64,
+        np.uint8: DTypeEnum.UINT8,
+        # numpy generic
+        np.dtype(np.float32): DTypeEnum.FLOAT32,
+        np.dtype(np.float64): DTypeEnum.FLOAT64,
+        np.dtype(np.int32): DTypeEnum.INT32,
+        np.dtype(np.int64): DTypeEnum.INT64,
+        np.dtype(np.uint8): DTypeEnum.UINT8,
+        # numpy dtype
+    }
 )
 
 
-enum_to_torch_dtype: Mapping[DTypeEnum, torch.dtype] = MappingProxyType(
+enum_to_torch_dtype = MappingProxyType(
     {
         DTypeEnum.FLOAT32: torch.float32,
         DTypeEnum.FLOAT64: torch.float64,

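As the entries above show, `dtype_to_enum` accepts torch dtypes, NumPy scalar types, and NumPy dtype objects interchangeably. A quick check, grounded in the mapping's own keys:

    import numpy as np
    import torch
    from openequivariance.implementations.dtype_enum import DTypeEnum, dtype_to_enum

    # All three spellings of float32 resolve to the same enum member.
    assert dtype_to_enum[torch.float32] is DTypeEnum.FLOAT32
    assert dtype_to_enum[np.float32] is DTypeEnum.FLOAT32
    assert dtype_to_enum[np.dtype(np.float32)] is DTypeEnum.FLOAT32
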
openequivariance/implementations/utils.py

Lines changed: 12 additions & 0 deletions

@@ -106,6 +106,18 @@ def torch_to_oeq_dtype(torch_dtype) -> type[np.generic]:
     raise ValueError("Unsupported torch dtype!")
 
 
+def oeq_to_torch_dtype(oeq_dtype: type[np.generic]):
+    global torch
+    import torch
+
+    if oeq_dtype == np.float32:
+        return torch.float32
+    elif oeq_dtype == np.float64:
+        return torch.float64
+    else:
+        raise ValueError("Unsupported numpy dtype!")
+
+
 def benchmark(func, num_warmup, num_iter, mode="gpu_time", kernel_names=[]):
     """
     mode=gpu_time may include PyTorch overhead

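The new `oeq_to_torch_dtype` is the inverse of the existing `torch_to_oeq_dtype` for the two supported float types (torch is imported lazily inside the function, so callers need not import it themselves):

    import numpy as np
    from openequivariance.implementations.utils import oeq_to_torch_dtype

    # Maps the NumPy scalar types OEQ uses internally to torch dtypes.
    print(oeq_to_torch_dtype(np.float32))  # torch.float32
    print(oeq_to_torch_dtype(np.float64))  # torch.float64
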
tests/benchmark.py

Lines changed: 23 additions & 7 deletions

@@ -38,6 +38,7 @@
 )
 
 from openequivariance.implementations.convolution.CUEConv import CUEConv, CUEConvFused
+from openequivariance.implementations.convolution.FlashTPConv import FlashTPConv
 from openequivariance.benchmark.ConvBenchmarkSuite import ConvBenchmarkSuite, load_graph
 
 from openequivariance.benchmark.problems import (
@@ -54,13 +55,22 @@
 CTPP = ChannelwiseTPP
 FCTPP = FullyConnectedTPProblem
 
-implementation_map = {
+implementation_map_tp = {
     "e3nn": E3NNTensorProductCompiledMaxAutotuneCUDAGraphs,
     "e3nn_uncompiled": E3NNTensorProduct,
     "cue": CUETensorProduct,
     "oeq": TensorProduct,
 }
 
+implementation_map_conv = {
+    "cue_unfused": CUEConv,
+    "oeq_scattersum": TensorProductConvScatterSum,
+    "flashtp": FlashTPConv,
+    "cue_fused": CUEConvFused,
+    "oeq_det": TensorProductConvDeterministic,
+    "oeq_atomic": TensorProductConvAtomic,
+}
+
 datatype_map = {"float32": np.float32, "float64": np.float64}
 
 roofline_configs = [
@@ -87,7 +97,7 @@ def benchmark_uvu(params):
         problem.weight_dtype = np.float64
     problems = mace_problems() + nequip_problems() + float64_problems
 
-    implementations = [implementation_map[impl] for impl in params.implementations]
+    implementations = [implementation_map_tp[impl] for impl in params.implementations]
     directions = params.directions
 
     tests = [
@@ -289,11 +299,7 @@ def benchmark_convolution(params):
     bench = ConvBenchmarkSuite(configs, test_name="convolution")
 
     implementations = [
-        TensorProductConvScatterSum,
-        CUEConv,
-        CUEConvFused,
-        TensorProductConvDeterministic,
-        TensorProductConvAtomic,
+        implementation_map_conv[impl] for impl in params.implementations
    ]
 
     if params.limited_memory:
@@ -496,6 +502,16 @@ def plot(params):
         help="Disable tests requiring large amounts of memory.",
     )
     parser_conv.add_argument("--plot", action="store_true", help="Plot the results.")
+    parser_conv.add_argument(
+        "--implementations",
+        "-i",
+        type=str,
+        nargs="+",
+        default=["cue_unfused", "oeq_scattersum", "cue_fused", "oeq_atomic", "oeq_det"],
+        help="Implementations to benchmark",
+        choices=list(implementation_map_conv.keys()),
+    )
+
     parser_conv.set_defaults(func=benchmark_convolution)
 
     parser_uvw = subparsers.add_parser(
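With the new `--implementations` / `-i` flag, the convolution benchmark can target any subset of backends. For example, the following restricts the command documented in docs/tests_and_benchmarks.rst to two implementations (flag choices come from `implementation_map_conv`):

    python tests/benchmark.py -o outputs/conv conv --plot --data data/molecular_structures -i oeq_scattersum flashtp

Omitting `-i` falls back to the default list, which leaves out `flashtp`, presumably so the stock benchmark runs without FlashTP installed.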
