tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for a better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1, which means no limit).
use_distributed_mode_trace (bool): Use aot_autograd to trace the graph. This should be enabled when DTensors or distributed tensors are present in the distributed model.
cpu_memory_budget (int): The maximum amount of CPU memory to use for compilation. If compilation requires more memory than this budget, it will fail. If set to -1, compilation will use all available CPU memory.
**kwargs: Any,
Returns:
torch.fx.GraphModule: Compiled FX module; when run, it will execute via TensorRT
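The new cpu_memory_budget option sits alongside the existing tiling and distributed-trace settings. Below is a minimal sketch of how these settings might be passed through the dynamo path; the toy model, inputs, and the availability of each keyword in a given Torch-TensorRT release are assumptions, so treat this as illustrative rather than the canonical API.

```python
import torch
import torch_tensorrt

# Toy model and inputs, used only for illustration.
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU()).cuda().eval()
inputs = [torch.randn(8, 64).cuda()]

# Hypothetical call: keyword names mirror the documented settings above, but
# support for each one depends on the installed Torch-TensorRT version.
trt_module = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=inputs,
    tiling_optimization_level="moderate",  # "none" | "fast" | "moderate" | "full"
    l2_limit_for_tiling=-1,                # -1 means no L2 cache usage limit
    use_distributed_mode_trace=False,      # enable when DTensors are present
    cpu_memory_budget=-1,                  # -1 means use all available CPU memory
)

# The returned torch.fx.GraphModule executes via TensorRT when called.
out = trt_module(*inputs)
```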
f"Subgraph size {sum([sizefor_, sizeinsizes])} is too large to break. Size budget: {size_budget}"
f"CPU memory budget or available memory is too small to compile the model. CPU memory budget: {self.cpu_memory_budget// (1024*1024) ifself.cpu_memory_budget!=-1else"All available memory"} MB, Model size: {sum([sizefor_, sizeinsizes]) // (1024*1024)} MB. "
+"Consider setting cpu_memory_budget to a larger value or disable offload_module_to_cpu to save more CPU memory."