diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
index d851cda07e..0e1dd7e4fb 100644
--- a/core/runtime/TRTEngine.h
+++ b/core/runtime/TRTEngine.h
@@ -181,7 +181,6 @@ struct TRTEngine : torch::CustomClassHolder {
   int64_t get_streamable_device_memory_budget();
   int64_t get_automatic_device_memory_budget();
   std::vector<std::string> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
-  void set_pre_allocated_outputs(bool enable);
   void set_output_tensors_as_unowned(bool enable);
   bool are_output_tensors_unowned();
   TorchTRTRuntimeStates runtime_states;
diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp
index e9ceff2a3e..abdfc213cb 100644
--- a/core/runtime/register_jit_hooks.cpp
+++ b/core/runtime/register_jit_hooks.cpp
@@ -101,6 +101,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
                 : TRTEngine::ResourceAllocationStrategy::kStatic);
       })
       .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
+      .def_readwrite("pre_allocated_outputs", &TRTEngine::pre_allocated_outputs)
       .def_readwrite("use_output_allocator_outputs", &TRTEngine::use_output_allocator_outputs)
       .def_property(
           "device_memory_budget",
diff --git a/core/runtime/runtime.h b/core/runtime/runtime.h
index e3a675cb05..f7c6aee18f 100644
--- a/core/runtime/runtime.h
+++ b/core/runtime/runtime.h
@@ -43,6 +43,8 @@ typedef enum {
   SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO
 } SerializedInfoIndex;
 
+// When adding new serialized info indices, add them above and update /dynamo/runtime/_serialized_engine_layout.py
+
 std::string base64_encode(const std::string& in);
 std::string base64_decode(const std::string& in);
 std::string serialize_bindings(const std::vector<std::string>& bindings);
diff --git a/docsrc/contributors/complex_number_support.rst b/docsrc/contributors/complex_number_support.rst
index f4fbe96f70..84abbc74de 100644
--- a/docsrc/contributors/complex_number_support.rst
+++ b/docsrc/contributors/complex_number_support.rst
@@ -128,9 +128,8 @@ runtime modules handle the conversion:
 * ``prepare_inputs`` (``dynamo/utils.py``) — builds the ``Input`` spec with the
   ``view_as_real`` shape/dtype but retains the original complex tensor in
   ``inp.torch_tensor`` for tracing.
-* ``_PythonTorchTensorRTModule.forward`` — applies ``torch.view_as_real(i).contiguous()``
-  for each complex input before feeding it to the engine.
-* ``_TorchTensorRTModule.forward`` — same ``view_as_real`` conversion.
+* ``TorchTensorRTModule.forward`` — applies ``torch.view_as_real(i).contiguous()``
+  for each complex input before feeding tensors to ``execute_engine`` / ``execute_engine_python``.
 
 Key Implementation Invariants
 -------------------------------
diff --git a/docsrc/contributors/cuda_graphs.rst b/docsrc/contributors/cuda_graphs.rst
index 08940fd8e2..6c2369c748 100644
--- a/docsrc/contributors/cuda_graphs.rst
+++ b/docsrc/contributors/cuda_graphs.rst
@@ -93,8 +93,8 @@ Subsequent inference launches the instantiated graph instead of calling
 Graph Storage
 ^^^^^^^^^^^^^
 
-Each runtime module (both C++ ``TorchTensorRTModule`` and Python
-``PythonTorchTensorRTModule``) stores a ``cudaGraphExec_t`` instance. When
+``TorchTensorRTModule`` (C++ or Python execution path) may record a CUDA graph for
+engine execution when CUDA graphs are enabled at runtime. When
 ``use_cuda_graph=True`` is set at compile time the runtime records one graph per
 engine for the first input shape encountered.
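+
+A minimal sketch of exercising graph recording from user code (``trt_module`` is assumed
+to be a module already compiled by Torch-TensorRT and ``x`` a CUDA input tensor; the
+``torch_tensorrt.runtime.enable_cudagraphs`` context manager is the public entry point
+for this path):
+
+.. code-block:: python
+
+    import torch_tensorrt
+
+    with torch_tensorrt.runtime.enable_cudagraphs(trt_module) as cudagraphs_module:
+        out = cudagraphs_module(x)  # first call records the graph for this input shape
+        out = cudagraphs_module(x)  # subsequent same-shape calls replay the recorded graph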
diff --git a/docsrc/contributors/runtime.rst b/docsrc/contributors/runtime.rst
index 0684d8811f..ddcbc8c746 100644
--- a/docsrc/contributors/runtime.rst
+++ b/docsrc/contributors/runtime.rst
@@ -10,14 +10,8 @@ infrastructure for inference.
 Dynamo Runtime (Primary Path)
 -------------------------------
 
-Two runtime backends are available. The backend is selected via the
-``use_python_runtime`` compilation setting.
-
-C++ Runtime (default)
-^^^^^^^^^^^^^^^^^^^^^^^
-
-The C++ runtime is more performant, fully serializable, and supports advanced features
-like CUDAGraphs and multi-device safety.
+The Dynamo runtime is fully serializable and supports advanced features like
+CUDAGraphs and multi-device safety.
 
 TensorRT engines are stored as ``torch.classes.tensorrt.Engine`` — a C++ TorchBind
 class that holds the serialized engine bytes plus metadata:
@@ -41,14 +35,6 @@ This op pops inputs and the engine off the PyTorch dispatcher stack, runs the tensors
 through TensorRT, and pushes output tensors back. The compiled ``torch.fx.Graph``
 stores engine objects as attributes, making the whole module portable.
 
-Python Runtime
-^^^^^^^^^^^^^^^
-
-The Python runtime uses TensorRT's Python API directly for inference. It is useful when
-a C++ build is not available (e.g. in some CI environments) and is simpler to instrument
-for debugging. It does not support serialization to ``ExportedProgram``; the compiled
-graph is Python-only.
-
 Serialization Options
 ---------------------
 
 The default serialization path for the Dynamo AOT workflow. The compiled
 ``torch.fx.GraphModule`` is wrapped in a
 `torch.export.ExportedProgram <https://pytorch.org/docs/stable/export.html>`_ container.
 TensorRT engines are stored as tensor attributes in the package; PyTrees
-capture input/output structure. Requires the C++ runtime and supports Python execution.
+capture input/output structure.
 
 .. code-block:: python
diff --git a/docsrc/debugging/troubleshooting.rst b/docsrc/debugging/troubleshooting.rst
index aff1e52e86..68ba01fa2f 100644
--- a/docsrc/debugging/troubleshooting.rst
+++ b/docsrc/debugging/troubleshooting.rst
@@ -126,8 +126,6 @@ Runtime Errors
     the engine. Upgrade TRT or rebuild with ``version_compatible=True``.
   * The GPU compute capability is lower than on the build machine. Rebuild with
     ``hardware_compatible=True`` (requires Ampere or newer).
-  * The ``.ep`` file was generated with ``use_python_runtime=True`` which is not
-    serializable. Rebuild with the default C++ runtime.
 
 **Shape mismatch at runtime / "Invalid input shape"**
 
@@ -153,9 +151,9 @@ Runtime Errors
   The model contains data-dependent-shape ops (``nonzero``, ``unique``,
   ``masked_select``, etc.) which require TRT's output allocator.
 
-  * Use ``PythonTorchTensorRTModule`` (``use_python_runtime=True``) — it
-    activates the dynamic output allocator automatically via
-    ``requires_output_allocator=True``.
+  * Use :class:`~torch_tensorrt.runtime.TorchTensorRTModule` (or a compiled graph that wraps it)
+    with ``requires_output_allocator=True`` so the runtime can use TRT's output allocator
+    when the engine needs dynamic output allocation.
   * See :ref:`cuda_graphs` for ``DynamicOutputAllocator`` details.
 
----
diff --git a/docsrc/py_api/runtime.rst b/docsrc/py_api/runtime.rst
index 719d8f6555..f8c020262d 100644
--- a/docsrc/py_api/runtime.rst
+++ b/docsrc/py_api/runtime.rst
@@ -27,13 +27,20 @@ Functions
 
 .. autofunction:: enable_output_allocator
 
+Runtime backend
+---------------
+
+Execution uses the C++ runtime engine when it is installed in the build; otherwise the
+Python runtime engine is used. There is no separate process-wide backend switch
+in ``torch_tensorrt.runtime``.
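+
+A quick way to check which path a given installation will take (``ENABLED_FEATURES`` is the
+internal feature record this dispatch relies on elsewhere in the codebase; treat it as an
+illustrative probe, not a stable public API):
+
+.. code-block:: python
+
+    from torch_tensorrt._features import ENABLED_FEATURES
+
+    # True when the C++ runtime extension is present; False in a Python-only build
+    print(ENABLED_FEATURES.torch_tensorrt_runtime)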
+
 Classes
 ---------
 
 .. autoclass:: TorchTensorRTModule
    :members:
    :special-members: __init__
+   :show-inheritance:
+
-.. autoclass:: PythonTorchTensorRTModule
-   :members:
-   :special-members: __init__
+   Single runtime module for TensorRT engines. Dispatches to the C++ or Python execution
+   implementation depending on whether the C++ extension is available. See :ref:`python_runtime`.
diff --git a/docsrc/tutorials/deployment/cross_compile_windows.rst b/docsrc/tutorials/deployment/cross_compile_windows.rst
index d5ac891b7c..90cea60579 100644
--- a/docsrc/tutorials/deployment/cross_compile_windows.rst
+++ b/docsrc/tutorials/deployment/cross_compile_windows.rst
@@ -26,7 +26,6 @@ Requirements
 The following features are **disabled** during cross-compilation (they are not
 available in the Windows TRT runtime or require OS-specific binaries):
 
-* Python runtime (``use_python_runtime`` is forced to ``False``)
 * Lazy engine initialization (``lazy_engine_init`` is forced to ``False``)
 * Engine caching (``cache_built_engines`` / ``reuse_cached_engines`` disabled)
 
diff --git a/docsrc/tutorials/deployment/distributed_inference.rst b/docsrc/tutorials/deployment/distributed_inference.rst
index f3d89c2d32..54d35272c0 100644
--- a/docsrc/tutorials/deployment/distributed_inference.rst
+++ b/docsrc/tutorials/deployment/distributed_inference.rst
@@ -108,7 +108,6 @@ inference in ``distributed_context`` for safe NCCL lifecycle management:
         dynamic=True,
         options={
             "use_distributed_mode_trace": True,
-            "use_python_runtime": False,
             "min_block_size": 1,
         },
     )
@@ -630,7 +629,6 @@ Compilation Settings for Distributed Workloads
       **Auto-enabled** for ``torch.compile`` when ``dist.is_initialized()`` and
       ``world_size > 1`` — no explicit flag needed. Must be set manually when using
       ``torch_tensorrt.dynamo.compile()`` directly (e.g. AOT export workflows).
-   * - ``use_python_runtime``
     - ``None`` (auto)
     - ``False`` (C++ runtime) is recommended for production. The C++ runtime handles
       NCCL via TRT's native ``DistCollective`` layers. The Python runtime uses
diff --git a/docsrc/tutorials/runtime_opt/index.rst b/docsrc/tutorials/runtime_opt/index.rst
index b4b3c9af8a..007bd1f645 100644
--- a/docsrc/tutorials/runtime_opt/index.rst
+++ b/docsrc/tutorials/runtime_opt/index.rst
@@ -2,7 +2,7 @@ Runtime Optimization
 =====================
 
 Optimize inference throughput and latency: CUDA Graphs for kernel-replay,
-pre-allocated output buffers, and the Python runtime module.
+pre-allocated output buffers, and choosing the Python vs C++ TRT execution path.
 
 .. toctree::
    :maxdepth: 1
@@ -10,4 +10,4 @@ pre-allocated output buffers, and the Python runtime module.
    cuda_graphs
    Example: Torch Export with Cudagraphs <../_rendered_examples/dynamo/torch_export_cudagraphs>
    Example: Pre-allocated output buffer <../_rendered_examples/dynamo/pre_allocated_output_example>
-   python_runtime
+   Python vs C++ runtime <python_runtime>
diff --git a/docsrc/tutorials/runtime_opt/python_runtime.rst b/docsrc/tutorials/runtime_opt/python_runtime.rst
index 2c97b941d4..3f78bb05e5 100644
--- a/docsrc/tutorials/runtime_opt/python_runtime.rst
+++ b/docsrc/tutorials/runtime_opt/python_runtime.rst
@@ -1,23 +1,29 @@
 ..
_python_runtime:
 
-Python Runtime
-==============
+Python vs C++ runtime
+=====================
 
-Torch-TensorRT provides two runtime backends for executing compiled TRT engines
-inside a PyTorch graph:
+Torch-TensorRT uses a single module type, :class:`~torch_tensorrt.runtime.TorchTensorRTModule`,
+to run TensorRT engines inside PyTorch. The **execution path** (which code actually drives
+TensorRT execution) is selected automatically:
 
-* **C++ runtime** (default) — ``TorchTensorRTModule`` backed by a C++ TorchBind class.
-  Fully serializable, supports CUDAGraphs, multi-device safe.
-* **Python runtime** — ``PythonTorchTensorRTModule`` backed entirely by the TRT Python
-  API. Simpler to instrument for debugging but **not serializable** to
-  ``ExportedProgram``.
+* **C++ path** — ``torch.classes.tensorrt.Engine`` and ``torch.ops.tensorrt.execute_engine``.
+  Used when the Torch-TensorRT C++ extension (``libtorchtrt`` / runtime ``.so``) is loaded:
+  TorchScript-friendly, and integrates with the full C++ runtime stack.
+* **Python path** — Internal ``TRTEngine`` (``torch_tensorrt.dynamo.runtime._TRTEngine``)
+  plus ``tensorrt::execute_engine`` registered from Python when the C++ runtime is not
+  available (i.e. Torch-TensorRT was built with ``PYTHON_ONLY=1``). Useful for minimal
+  installs and for Python-level debugging.
+
+Both the C++ and Python paths are invoked through the same ``TorchTensorRTModule`` class,
+which dispatches to the appropriate runtime engine based on the build of Torch-TensorRT
+(full build or ``PYTHON_ONLY=1`` build).
 
 ----
 
-When to Use the Python Runtime
--------------------------------
+When the Python runtime is used
+-------------------------------
 
-Use ``use_python_runtime=True`` when:
+The Python engine implementation is chosen automatically when the C++ Torch-TensorRT library
+is not installed (i.e. the package was built with ``PYTHON_ONLY=1``). You may still prefer
+that setup when:
 
 * You need to run on a machine where the C++ Torch-TensorRT library is not
   installed (e.g., a minimal CI container with only the Python wheel).
@@ -27,74 +33,77 @@ Use ``use_python_runtime=True`` when:
 
 Use the default C++ runtime in all other cases, especially:
 
-* When saving a compiled module to disk (``torch_tensorrt.save()``).
 * When using CUDAGraphs for low-latency inference.
 * In production deployments.
-
 ----
 
-Enabling the Python Runtime
------------------------------
+Compile and run
+-----------------
 
-.. code-block:: python
+Use ``torch_tensorrt.dynamo.compile``, ``torch.compile(..., backend="tensorrt", ...)``, or
+construct :class:`~torch_tensorrt.runtime.TorchTensorRTModule` directly. The module picks C++
+vs Python execution based on the build of Torch-TensorRT (full build or Python-only build).
 
-   import torch_tensorrt
+----
 
-   trt_gm = torch_tensorrt.dynamo.compile(
-       exported_program,
-       arg_inputs=inputs,
-       use_python_runtime=True,
-   )
+Serialization
+---------------
 
-Or via ``torch.compile``:
+``TorchTensorRTModule`` is serializable in both the C++ and Python paths.
+
+.. code-block:: python
+
+   torch_tensorrt.save(trt_module, trt_ep_path, retrace=True)
+   trt_module = torch_tensorrt.load(trt_ep_path).module()
-.. code-block:: python
+
+Cross-serialization (Python and C++)
+-------------------------------------
 
-   trt_model = torch.compile(
-       model,
-       backend="tensorrt",
-       options={"use_python_runtime": True},
-   )
+One of the key features of ``TorchTensorRTModule`` is seamless cross-serialization:
+**you can serialize an engine using the Python runtime and load it using the C++ runtime, or vice versa**.
+The engine file format and all core metadata are fully compatible across runtimes and platforms,
+ensuring flexibility for production and development workflows.
 
-----
+For example, you can:
 
-Limitations
------------
+- **Build and serialize in Python**, then deploy by loading the module in a C++-enabled
+  environment (e.g. in TorchScript or when the C++ extension is present):
+
+  .. code-block:: python
 
-* **Not serializable**: ``PythonTorchTensorRTModule`` cannot be saved via
-  ``torch_tensorrt.save()`` as an ``ExportedProgram`` or loaded back. The module is
-  Python-only in-process.
+     # In an environment with only Python runtime (PYTHON_ONLY=1)
+     torch_tensorrt.save(trt_module, "trt_module.ep")
 
-  .. code-block:: python
+     # --- Later, or on a different machine with C++ runtime enabled ---
+     trt_module = torch_tensorrt.load("trt_module.ep").module()
+     output = trt_module(input)
 
-     # This will raise an error with use_python_runtime=True:
-     torch_tensorrt.save(trt_gm, "model.ep", arg_inputs=inputs)
+- **Build in a C++ runtime environment**, save the engine, and then load it in a Python-only
+  deployment or debugging context, with no changes needed.
 
-* **No C++ deployment**: The compiled module cannot be exported to AOTInductor or used
-  in a C++ application without re-compiling with the C++ runtime.
+This interoperability allows you to train, compile, and debug using the Python path,
+but deploy for maximum performance using the C++ runtime, or test and profile using
+Python tools with modules built from C++.
+**No extra conversion is required and the serialization format is shared across both backends.**
+
+----
+
+Limitations
+-----------
 
 * **CUDAGraphs**: Whole-graph CUDAGraphs work with the Python runtime, but the
   per-submodule CUDAGraph recording in ``CudaGraphsTorchTensorRTModule`` is only
   available with the C++ runtime.
-
----
 
-``PythonTorchTensorRTModule`` Direct Instantiation
----------------------------------------------------
+``TorchTensorRTModule`` from raw engine bytes
+---------------------------------------------
 
-You can instantiate ``PythonTorchTensorRTModule`` directly from raw engine bytes,
-for example when integrating a TRT engine built outside of Torch-TensorRT:
+You can build a module directly from a serialized TensorRT engine (for example, an engine
+produced outside Torch-TensorRT):
.. code-block:: python
 
-   from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule
+   from torch_tensorrt.dynamo.runtime import TorchTensorRTModule
    from torch_tensorrt.dynamo._settings import CompilationSettings
 
-   # Load raw engine bytes (e.g., from trtexec output or torch_tensorrt.dynamo.convert_*)
    with open("model.engine", "rb") as f:
        engine_bytes = f.read()
 
-   module = PythonTorchTensorRTModule(
+   module = TorchTensorRTModule(
        serialized_engine=engine_bytes,
        input_binding_names=["x"],
        output_binding_names=["output"],
@@ -104,23 +113,22 @@ for example when integrating a TRT engine built outside of Torch-TensorRT:
 
    output = module(torch.randn(1, 3, 224, 224).cuda())
 
-**Constructor arguments:**
+**Constructor arguments** (see class docstring for full detail):
 
 ``serialized_engine`` (``bytes``)
-   The raw serialized TRT engine bytes.
+   Raw serialized TRT engine.
 
 ``input_binding_names`` (``List[str]``)
    TRT input binding names in the order they are passed to ``forward()``.
 
 ``output_binding_names`` (``List[str]``)
-   TRT output binding names in the order they should be returned.
+   TRT output binding names in the order they are returned from ``forward()``.
 
 ``name`` (``str``, optional)
-   Human-readable name for the module (used in logging).
+   Name for logging and serialization.
 
-``settings`` (``CompilationSettings``, optional)
-   The compilation settings used to build the engine. Used to determine device
-   placement and other runtime behaviors.
+``settings`` (:class:`~torch_tensorrt.dynamo._settings.CompilationSettings`, optional)
+   Device and runtime options (must match how the engine was built).
 
 ``weight_name_map`` (``dict``, optional)
    Mapping of TRT weight names to PyTorch state dict names. Required for refit
@@ -132,9 +140,10 @@ for example when integrating a TRT engine built outside of Torch-TensorRT:
 
 ----
 
-Runtime Selection Logic
-------------------------
+Runtime selection summary
+-------------------------
 
-When ``use_python_runtime`` is ``None`` (auto-select), Torch-TensorRT tries to import
-the C++ TorchBind class. If the C++ extension is not available it silently falls back to
-the Python runtime. Pass ``True`` or ``False`` to force a specific runtime.
+* ``TorchTensorRTModule`` uses the C++ engine path when the Torch-TensorRT extension is loaded;
+  otherwise it uses the Python ``TRTEngine`` path.
+* If the C++ extension is **not** built, only the Python path is available.
+* To use the Python runtime, set ``PYTHON_ONLY=1`` when building Torch-TensorRT.
diff --git a/docsrc/user_guide/compilation/compilation_settings.rst b/docsrc/user_guide/compilation/compilation_settings.rst
index 41085d3235..c048c447a0 100644
--- a/docsrc/user_guide/compilation/compilation_settings.rst
+++ b/docsrc/user_guide/compilation/compilation_settings.rst
@@ -51,12 +51,6 @@ Core Parameters
    * - ``device``
      - current CUDA device
      - :class:`torch_tensorrt.Device` specifying the GPU to compile for.
-   * - ``use_python_runtime``
-     - ``False`` (auto)
-     - ``False`` uses the C++ runtime (recommended — serializable, CUDAGraphs,
-       multi-device safe). ``True`` forces the Python runtime (simpler to instrument
-       for debugging but not serializable to ``ExportedProgram``). ``None`` selects C++
-       if available.
    * - ``pass_through_build_failures``
      - ``False``
      - When ``True``, TRT engine build errors raise exceptions rather than fall back to PyTorch.
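+
+A minimal sketch of passing these core parameters to ``torch_tensorrt.dynamo.compile``
+(``exported_program`` and ``inputs`` are assumed to already exist; the values shown are
+illustrative):
+
+.. code-block:: python
+
+    import torch_tensorrt
+
+    trt_gm = torch_tensorrt.dynamo.compile(
+        exported_program,
+        arg_inputs=inputs,
+        device=torch_tensorrt.Device("cuda:0"),
+        pass_through_build_failures=True,  # raise on TRT build errors instead of falling back
+    )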
diff --git a/docsrc/user_guide/compilation/torch_compile.rst b/docsrc/user_guide/compilation/torch_compile.rst index 77b20ed6ab..2de867a5a7 100644 --- a/docsrc/user_guide/compilation/torch_compile.rst +++ b/docsrc/user_guide/compilation/torch_compile.rst @@ -42,8 +42,7 @@ Custom Setting Usage "debug": True, "min_block_size": 2, "torch_executed_ops": {"torch.ops.aten.sub.Tensor"}, - "optimization_level": 4, - "use_python_runtime": False,}) + "optimization_level": 4,}) .. note:: Supported precisions include FP32, FP16, BF16, INT8, FP8, and FP4. INT8/FP8 quantization requires the `ModelOpt `_ library and a quantized model. FP4 requires TensorRT >= 10.8.0. diff --git a/docsrc/user_guide/runtime_performance/mutable_module.rst b/docsrc/user_guide/runtime_performance/mutable_module.rst index beacc0c717..3b9ac360e1 100644 --- a/docsrc/user_guide/runtime_performance/mutable_module.rst +++ b/docsrc/user_guide/runtime_performance/mutable_module.rst @@ -34,7 +34,7 @@ stays identical: # The only extra line you need pipe.unet = torch_tensorrt.MutableTorchTensorRTModule( pipe.unet, - use_python_runtime=True, + use_explicit_typing=True, # pipeline already loaded in float16 via torch_dtype ) The pipeline's ``unet`` is now backed by a TRT engine. The first call to ``pipe(...)`` @@ -162,14 +162,9 @@ because it carries extra state — dynamic-shape descriptors, refit state, etc.: .. code-block:: python - # Requires use_python_runtime=False (the default) torch_tensorrt.MutableTorchTensorRTModule.save(mutable_module, "module.pkl") mutable_module = torch_tensorrt.MutableTorchTensorRTModule.load("module.pkl") -``use_python_runtime=True`` (used in the diffusers examples for pipeline compatibility) -does **not** support save/load. Switch to the default C++ runtime if serialization is -required. 
- ---- How the Refit / Recompile Decision Works diff --git a/examples/apps/flux_demo.py b/examples/apps/flux_demo.py index 404f620966..1106f24054 100644 --- a/examples/apps/flux_demo.py +++ b/examples/apps/flux_demo.py @@ -119,7 +119,6 @@ def forward_loop(mod): "prefer_deferred_runtime_asserts_over_guards": True, "truncate_double": True, "min_block_size": 1, - "use_python_runtime": True, "immutable_weights": False, "offload_module_to_cpu": args.low_vram_mode, } diff --git a/examples/distributed_inference/data_parallel_stable_diffusion.py b/examples/distributed_inference/data_parallel_stable_diffusion.py index 023d7e8e63..b99589e4d9 100644 --- a/examples/distributed_inference/data_parallel_stable_diffusion.py +++ b/examples/distributed_inference/data_parallel_stable_diffusion.py @@ -40,7 +40,6 @@ options={ "truncate_long_and_double": True, "precision": torch.float16, - "use_python_runtime": True, }, dynamic=False, ) diff --git a/examples/dynamo/autocast_example.py b/examples/dynamo/autocast_example.py index 8ba8a07e33..54f45826d7 100644 --- a/examples/dynamo/autocast_example.py +++ b/examples/dynamo/autocast_example.py @@ -73,7 +73,6 @@ def forward(self, x): ep.module(), arg_inputs=inputs, min_block_size=1, - use_python_runtime=True, enable_autocast=True, autocast_low_precision_type=torch.bfloat16, autocast_excluded_nodes={"^conv1$", "relu"}, diff --git a/examples/dynamo/custom_kernel_plugins.py b/examples/dynamo/custom_kernel_plugins.py index f5d62a47ec..2514f5354b 100644 --- a/examples/dynamo/custom_kernel_plugins.py +++ b/examples/dynamo/custom_kernel_plugins.py @@ -277,7 +277,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Node: torch.ops.torchtrt_ex.triton_circular_pad.default, with layer location: __/triton_circular_pad # Note: Some of the above nodes may be supported, but were not included in a TRT graph by the partitioner # -# Compiled with: CompilationSettings(workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=True, hardware_compatible=False) +# Compiled with: CompilationSettings(workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=True, hardware_compatible=False) # # Graph Structure: # @@ -580,7 +580,7 @@ def circular_padding_converter( # # The graph consists of 2 Total Operators, of which 2 operators are supported, 100.0% coverage # -# Compiled with: CompilationSettings(workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, 
use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False) +# Compiled with: CompilationSettings(workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False) # # Graph Structure: # diff --git a/examples/dynamo/debugger_example.py b/examples/dynamo/debugger_example.py index b88a4f22d5..16ee4cdd18 100644 --- a/examples/dynamo/debugger_example.py +++ b/examples/dynamo/debugger_example.py @@ -35,7 +35,6 @@ exp_program = torch.export.export(model, tuple(inputs)) workspace_size = 20 << 30 min_block_size = 0 -use_python_runtime = False torch_executed_ops = {} with torch_trt.dynamo.Debugger( @@ -52,7 +51,6 @@ trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, torch_executed_ops=torch_executed_ops, immutable_weights=False, diff --git a/examples/dynamo/dynamic_memory_allocation.py b/examples/dynamo/dynamic_memory_allocation.py index 093a539dab..14648fbc55 100644 --- a/examples/dynamo/dynamic_memory_allocation.py +++ b/examples/dynamo/dynamic_memory_allocation.py @@ -46,7 +46,6 @@ settings = { "ir": "dynamo", - "use_python_runtime": False, "immutable_weights": False, "lazy_engine_init": True, "dynamically_allocate_resources": True, diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py index ae057ea5e0..5856bcb9c1 100644 --- a/examples/dynamo/engine_caching_bert_example.py +++ b/examples/dynamo/engine_caching_bert_example.py @@ -47,7 +47,6 @@ def compile_bert(iterations=3): start.record() compilation_kwargs = { - "use_python_runtime": False, "truncate_double": True, "min_block_size": 1, "immutable_weights": False, diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index 8ebaac1981..f981ea26f5 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -39,7 +39,6 @@ model = models.resnet18(pretrained=True).to("cuda").eval() min_block_size = 1 -use_python_runtime = False def remove_timing_cache(path=TIMING_CACHE_PATH): @@ -91,7 +90,6 @@ def torch_compile(iterations=3): model, backend="tensorrt", options={ - "use_python_runtime": True, "min_block_size": min_block_size, "immutable_weights": False, "cache_built_engines": cache_built_engines, @@ -150,7 +148,6 @@ def dynamo_compile(iterations=3): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, cache_built_engines=cache_built_engines, @@ -259,7 +256,6 @@ def torch_compile_my_cache(iterations=3): model, backend="tensorrt", options={ - "use_python_runtime": True, "min_block_size": min_block_size, "immutable_weights": 
False, "cache_built_engines": cache_built_engines, diff --git a/examples/dynamo/low_cpu_memory_compilation.py b/examples/dynamo/low_cpu_memory_compilation.py index 82ba47ee84..8d9e00f3e1 100644 --- a/examples/dynamo/low_cpu_memory_compilation.py +++ b/examples/dynamo/low_cpu_memory_compilation.py @@ -54,10 +54,8 @@ def forward(self, x): model.to("cuda") inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")] -use_python_runtime = False compilation_options = { - "use_python_runtime": use_python_runtime, "min_block_size": 1, "immutable_weights": True, "reuse_cached_engines": False, diff --git a/examples/dynamo/mutable_torchtrt_module_example.py b/examples/dynamo/mutable_torchtrt_module_example.py index 7df34b31ef..0ef1ddaa9a 100644 --- a/examples/dynamo/mutable_torchtrt_module_example.py +++ b/examples/dynamo/mutable_torchtrt_module_example.py @@ -32,7 +32,6 @@ # Initialize the Mutable Torch TensorRT Module with settings. # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ settings = { - "use_python_runtime": False, "immutable_weights": False, } @@ -66,7 +65,6 @@ # Saving Mutable Torch TensorRT Module # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# Currently, saving is only enabled when "use_python_runtime" = False in settings torch_trt.MutableTorchTensorRTModule.save(mutable_module, "mutable_module.pkl") reload = torch_trt.MutableTorchTensorRTModule.load("mutable_module.pkl") @@ -77,7 +75,6 @@ with torch.no_grad(): settings = { - "use_python_runtime": True, "immutable_weights": False, } @@ -209,7 +206,6 @@ def forward(self, a, b, c={}): example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) model = torch_trt.MutableTorchTensorRTModule( model, - use_python_runtime=True, min_block_size=1, immutable_weights=False, cache_built_engines=True, diff --git a/examples/dynamo/refit_engine_example.py b/examples/dynamo/refit_engine_example.py index 844bc19242..00bdbd0029 100644 --- a/examples/dynamo/refit_engine_example.py +++ b/examples/dynamo/refit_engine_example.py @@ -57,12 +57,10 @@ exp_program = torch.export.export(model, tuple(inputs)) workspace_size = 20 << 30 min_block_size = 0 -use_python_runtime = False torch_executed_ops = {} trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, torch_executed_ops=torch_executed_ops, immutable_weights=False, diff --git a/examples/dynamo/torch_compile_advanced_usage.py b/examples/dynamo/torch_compile_advanced_usage.py index f5901cfe40..1ef3240210 100644 --- a/examples/dynamo/torch_compile_advanced_usage.py +++ b/examples/dynamo/torch_compile_advanced_usage.py @@ -76,7 +76,6 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): "min_block_size": 2, "torch_executed_ops": {"torch.ops.aten.sub.Tensor"}, "optimization_level": 4, - "use_python_runtime": False, } # Run the model on an input to cause compilation, as so: diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index c3ce6cc261..097ef3e5e4 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -534,7 +534,7 @@ def convert_method_to_trt_engine( module, torchtrt_arg_inputs, kwarg_inputs=torchtrt_kwarg_inputs, **kwargs ) - return dynamo_convert_exported_program_to_serialized_trt_engine( + return dynamo_convert_exported_program_to_serialized_trt_engine( # type: ignore[no-any-return] exp_program, arg_inputs=tuple(arg_inputs), kwarg_inputs=torchtrt_kwarg_inputs, @@ -582,35 +582,42 @@ def load( Raises: ValueError: If there is no file or the file is not either a TorchScript file or ExportedProgram file """ + # 
Ensure Python TRT engine ops are registered so torch.export.load can
+    # resolve tensorrt::execute_engine when the C++ runtime is absent.
+    if not ENABLED_FEATURES.torch_tensorrt_runtime:
+        import torch_tensorrt.dynamo.runtime._TRTEngine  # noqa: F401
 
     try:
-        logger.debug(f"Loading the provided file {file_path} using torch.jit.load()")
-        ts_module = function_overload_with_kwargs(
+        logger.debug(f"Loading the provided file {file_path} using torch.export.load()")
+        exp_program = function_overload_with_kwargs(
             torch.export.load,
             file_path,
             extra_files=extra_files,
             **kwargs,
         )
-        return ts_module
+        return exp_program
+
     except Exception:
         logger.info(
             f"Loading the provided file {file_path} via torch.export.load() failed with the following error",
             exc_info=True,
         )
-        pass
 
     try:
-        logger.debug(f"Loading the provided file {file_path} using torch.export.load()")
-        exp_program = function_overload_with_kwargs(
+        logger.debug(f"Loading the provided file {file_path} using torch.jit.load()")
+        ts_module = function_overload_with_kwargs(
             torch.jit.load,
             file_path,
             _extra_files=extra_files,
             **kwargs,
         )
-        return exp_program
-    except Exception:
+        return ts_module
+    except Exception as e:
         logger.info(
-            f"Loading the provided file {file_path} via torch.jit.load() (after failing to load with torch.export.load()) failed with the following error",
+            f"Loading the provided file {file_path} via torch.jit.load() (after failing to load with torch.export.load()) failed with the following error: {e}",
             exc_info=True,
         )
         raise ValueError(
@@ -793,8 +800,8 @@ def _all_are_input_objects(obj: Any) -> bool:
             f"Inferred dynamic_shapes from torch_tensorrt.Input objects with min/opt/max specifications: {dynamic_shapes}"
         )
 
-        arg_tensors = tuple(get_torch_inputs(arg_inputs, default_device()))  # type: ignore
-        kwarg_tensors = get_torch_inputs(kwarg_inputs, default_device())  # type: ignore
+        arg_tensors = tuple(get_torch_inputs(arg_inputs, default_device()))  # type: ignore[arg-type]
+        kwarg_tensors = get_torch_inputs(kwarg_inputs, default_device())  # type: ignore[assignment]
 
     else:
         # Mixed case: some inputs are Tensors, some are Input objects
@@ -876,6 +883,7 @@ def _extract_tensor(obj: Any) -> Any:
             "Provided model is a torch.export.ExportedProgram, inputs or arg_inputs is not necessary during save, it uses the inputs or arg_inputs provided during export and compile"
         )
         if output_format == "exported_program":
+            _normalize_engine_constants_to_python(module)
             function_overload_with_kwargs(
                 torch.export.save,
                 module,
@@ -933,6 +941,7 @@ def _extract_tensor(obj: Any) -> Any:
                 use_legacy_exporter=_use_legacy,
             )
             if output_format == "exported_program":
+                _normalize_engine_constants_to_python(exp_program)
                 function_overload_with_kwargs(
                     torch.export.save,
                     exp_program,
@@ -1012,6 +1021,7 @@ def _extract_tensor(obj: Any) -> Any:
             )
 
             if output_format == "exported_program":
+                _normalize_engine_constants_to_python(exp_program)
                 function_overload_with_kwargs(
                     torch.export.save,
                     exp_program,
@@ -1036,6 +1046,42 @@ def _extract_tensor(obj: Any) -> Any:
     )
 
 
+def _normalize_engine_constants_to_python(exp_program: "ExportedProgram") -> None:
+    pass
+
+
+# TODO: Uncomment this when cross serialization is enabled
+# """Convert C++ ``torch.classes.tensorrt.Engine`` constants to Python ``TRTEngine``.
+
+# The C++ runtime stores engine constants as ``torch._C.ScriptObject``
+# (``torch.classes.tensorrt.Engine``). Python ``TRTEngine`` is registered as
+# an opaque type so ``torch.export`` can serialise it with ``pickle``. By
+# converting before save the artifact is portable across both runtimes.
+# """
+# import base64
+
+# from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ENGINE_IDX
+# from torch_tensorrt.dynamo.runtime._TRTEngine import (
+#     EngineSerializer,
+#     TRTEngine,
+# )
+
+# for fqn, constant in list(exp_program.constants.items()):
+#     if isinstance(constant, (torch._C.ScriptObject, TRTEngine)):
+
+#         state = constant.__getstate__()
+#         if len(state) == 2 and (
+#             state[1] == "TRTEngine"
+#             or state[1] == "__torch__.torch.classes.tensorrt.Engine"
+#         ):
+#             serialized_info = list(state[0])
+#             serialized_info[ENGINE_IDX] = base64.b64decode(
+#                 serialized_info[ENGINE_IDX]
+#             )
+#             exp_program.constants[fqn] = EngineSerializer(serialized_info)
+
+
 def function_overload_with_kwargs(
     fn: Callable[..., Any], *args: Any, **kwargs: Any
 ) -> Any:
diff --git a/py/torch_tensorrt/_enums.py b/py/torch_tensorrt/_enums.py
index 637843eaeb..26746ddfff 100644
--- a/py/torch_tensorrt/_enums.py
+++ b/py/torch_tensorrt/_enums.py
@@ -1402,7 +1402,8 @@ def current_platform(cls) -> Platform:
         return Platform.UNKNOWN
 
     def __str__(self) -> str:
-        return str(self.name)
+        # Make it compatible with C++ runtime
+        return self.name.lower()
 
     @needs_torch_tensorrt_runtime  # type: ignore
     def _to_serialized_rt_platform(self) -> str:
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
index 4f9f61ca5f..29c2ed076a 100644
--- a/py/torch_tensorrt/dynamo/_compiler.py
+++ b/py/torch_tensorrt/dynamo/_compiler.py
@@ -12,7 +12,7 @@
 from torch.fx.node import Target
 from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import EngineCapability, dtype
-from torch_tensorrt._features import needs_cross_compile
+from torch_tensorrt._features import ENABLED_FEATURES, needs_cross_compile
 from torch_tensorrt._Input import Input
 from torch_tensorrt.dynamo import _defaults, partitioning
 from torch_tensorrt.dynamo._DryRunTracker import (
@@ -83,7 +83,7 @@ def cross_compile_for_windows(
     max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS,
     version_compatible: bool = _defaults.VERSION_COMPATIBLE,
     optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL,
-    use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME,
+    use_python_runtime: bool = False,
     use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER,
     enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
     dryrun: bool = _defaults.DRYRUN,
@@ -163,7 +163,7 @@ def cross_compile_for_windows(
         max_aux_stream (Optional[int]): Maximum streams in the engine
         version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines)
         optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level.
-        use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization
+        use_python_runtime: (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). 
The default is ``False``, which uses the C++ runtime when available and falls back to the Python runtime automatically when the C++ runtime is unavailable. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -302,7 +302,6 @@ def cross_compile_for_windows( "max_aux_streams": max_aux_streams, "version_compatible": version_compatible, "optimization_level": optimization_level, - "use_python_runtime": False, "truncate_double": truncate_double, "use_fast_partitioner": use_fast_partitioner, "num_avg_timing_iters": num_avg_timing_iters, @@ -335,11 +334,11 @@ def cross_compile_for_windows( "dynamically_allocate_resources": dynamically_allocate_resources, "decompose_attention": decompose_attention, "attn_bias_is_causal": attn_bias_is_causal, + "use_python_runtime": use_python_runtime, } # disable the following settings is not supported for cross compilation for windows feature unsupported_settings = ( - "use_python_runtime", "lazy_engine_init", "cache_built_engines", "reuse_cached_engines", @@ -425,7 +424,7 @@ def compile( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME, + use_python_runtime: bool = False, use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -520,7 +519,7 @@ def compile( max_aux_streams (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization + use_python_runtime: (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). The default is ``False``, which uses the C++ runtime when available and falls back to the Python runtime automatically when the C++ runtime is unavailable. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. 
These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -691,7 +690,6 @@ def compile( "max_aux_streams": max_aux_streams, "version_compatible": version_compatible, "optimization_level": optimization_level, - "use_python_runtime": use_python_runtime, "truncate_double": truncate_double, "use_fast_partitioner": use_fast_partitioner, "num_avg_timing_iters": num_avg_timing_iters, @@ -733,6 +731,7 @@ def compile( "dynamically_allocate_resources": dynamically_allocate_resources, "decompose_attention": decompose_attention, "attn_bias_is_causal": attn_bias_is_causal, + "use_python_runtime": use_python_runtime, } logger.debug(f"CPU memory usage before lowering: {get_cpu_memory_usage()} MB") settings = CompilationSettings(**compilation_options) @@ -791,7 +790,7 @@ def _insert_complex_io_adapters( Outputs: insert view_as_complex before the output node for each originally-complex output that comes from a TRT block. - Leverages metadata that was captued when the complex rewriter pass was run + Leverages metadata that was captured when the complex rewriter pass was run """ complex_input_names = gm.meta.get("complex_input_names", []) complex_input_dtypes = gm.meta.get("complex_input_dtypes", {}) @@ -1127,7 +1126,7 @@ def preserve_module_specs( if _debugger_config: if _debugger_config.save_engine_profile: - if settings.use_python_runtime: + if not ENABLED_FEATURES.torch_tensorrt_runtime: if _debugger_config.profile_format != "cudagraph": raise ValueError( "Profiling with TREX can only be enabled when using the C++ runtime. Python runtime profiling only support cudagraph visualization." @@ -1219,7 +1218,7 @@ def convert_exported_program_to_serialized_trt_engine( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME, + use_python_runtime: bool = False, use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -1295,7 +1294,7 @@ def convert_exported_program_to_serialized_trt_engine( max_aux_streams (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization + use_python_runtime: (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). 
The default is ``False``, which uses the C++ runtime when available and falls back to the Python runtime automatically when the C++ runtime is unavailable. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -1443,7 +1442,6 @@ def convert_exported_program_to_serialized_trt_engine( "max_aux_streams": max_aux_streams, "version_compatible": version_compatible, "optimization_level": optimization_level, - "use_python_runtime": use_python_runtime, "truncate_double": truncate_double, "use_fast_partitioner": use_fast_partitioner, "num_avg_timing_iters": num_avg_timing_iters, @@ -1475,6 +1473,7 @@ def convert_exported_program_to_serialized_trt_engine( "use_distributed_mode_trace": use_distributed_mode_trace, "decompose_attention": decompose_attention, "attn_bias_is_causal": attn_bias_is_causal, + "use_python_runtime": use_python_runtime, } if "runtime_cache_path" in compilation_options: compilation_options.pop("runtime_cache_path") diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 007b07db31..784066cc75 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -22,7 +22,6 @@ OPTIMIZATION_LEVEL = None SPARSE_WEIGHTS = False TRUNCATE_DOUBLE = False -USE_PYTHON_RUNTIME = False USE_FAST_PARTITIONER = True ENABLE_EXPERIMENTAL_DECOMPOSITIONS = False REQUIRE_FULL_COMPILATION = False @@ -71,6 +70,7 @@ DECOMPOSE_ATTENTION = False ATTN_BIAS_IS_CAUSAL = True DYNAMIC_SHAPES_KERNEL_SPECIALIZATION_STRATEGY = "lazy" +USE_PYTHON_RUNTIME = False if platform.system() == "Linux": import pwd diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py index b482a8c839..c92fc77341 100644 --- a/py/torch_tensorrt/dynamo/_exporter.py +++ b/py/torch_tensorrt/dynamo/_exporter.py @@ -19,6 +19,7 @@ OutputSpec, TensorArgument, ) +from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ENGINE_IDX, NAME_IDX @@ -471,6 +472,8 @@ def inline_trt_modules( continue # Get the TRT submodule trt_module = getattr(gm, name) + if trt_module._use_python_runtime: + raise ValueError("Python runtime is not supported for serialization") # Ensure the trt module node in the main graph (gm) has inputs trt_module_node = [node for node in gm.graph.nodes if node.name == name] @@ -489,16 +492,12 @@ def inline_trt_modules( engine_info = trt_module._pack_engine_info() engine_bytes = engine_info[ENGINE_IDX] engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8") - # insert the no_placeholder node in the graph which should be replaced to the actual execute_engine node while load in the windows trt_node = gm.graph.call_function( torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default, (trt_module_node.args, *engine_info), ) else: - # for the normal workflow: use the execute_engine node engine_name = f"{name}_engine" - # TODO: THROWS SOME WARNING ABOUT A LACK OF UNDERLYING REFERENCE TO THE OWNING GRAPH MODULE - # SAYS THERES 3 
OPTIONS, SUBMODULE, PARAMETER, OR BUFFER, BUFFER SEEMS THE BEST BUT I THINK ITS KEYED TO TENSORS
         setattr(gm, engine_name, trt_module.engine)
 
         engine_node = gm.graph.get_attr(engine_name)
@@ -506,13 +505,9 @@
             torch.ops.tensorrt.execute_engine.default,
             (trt_module_node.args, engine_node),
         )
-        # meta["val"] should be a lighter version of a tensor. For eg: it should be a FakeTensor (with output shape and dtype properties)
-        # Lighter version of a custom_obj is not defined clearly. meta["val"] does not have any type expectations but
-        # for custom object nodes, it should be CustomObjArgument
         engine_node.meta["val"] = CustomObjArgument(
             name=engine_node.name, class_fqn=""
         )
 
-        # set trt_node.meta with trt_module_node.meta
         assert num_outputs > 0
         trt_node.meta["val"] = trt_module_node.meta["val"]
@@ -557,7 +552,12 @@ def replace_execute_engine_no_op_node(
             packed_engine_info[ENGINE_IDX] = base64.b64decode(
                 engine_bytes.encode("utf-8")
             )
-            trt_engine = torch.classes.tensorrt.Engine(tuple(packed_engine_info))
+            if ENABLED_FEATURES.torch_tensorrt_runtime:
+                trt_engine = torch.classes.tensorrt.Engine(tuple(packed_engine_info))
+            else:
+                from torch_tensorrt.dynamo.runtime._TRTEngine import TRTEngine
+
+                trt_engine = TRTEngine(packed_engine_info)
 
             setattr(gm, engine_name, trt_engine)
             engine_node = gm.graph.get_attr(engine_name)
diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py
index a73e53d9d3..a1adf8036b 100644
--- a/py/torch_tensorrt/dynamo/_refit.py
+++ b/py/torch_tensorrt/dynamo/_refit.py
@@ -32,14 +32,12 @@
     post_lowering,
     pre_export_lowering,
 )
-from torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule import (
-    PythonTorchTensorRTModule,
-)
-from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import (
+from torch_tensorrt.dynamo.runtime._serialized_engine_layout import (
     ENGINE_IDX,
     SERIALIZED_METADATA_IDX,
-    TorchTensorRTModule,
 )
+from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import TorchTensorRTModule
+from torch_tensorrt.dynamo.runtime._TRTEngine import TRTEngine
 from torch_tensorrt.dynamo.utils import (
     check_module_output,
     check_output_equal,
@@ -308,11 +306,7 @@ def refit_module_weights(
         if (
             not isinstance(
                 submodule,
-                (
-                    PythonTorchTensorRTModule,
-                    TorchTensorRTModule,
-                    torch.nn.modules.module.Module,
-                ),
+                (TorchTensorRTModule, torch.nn.modules.module.Module),
             )
             or "_run_on_gpu" in name
         ):
@@ -509,9 +503,13 @@ def refit_module_weights(
             except AttributeError:
                 if isinstance(compiled_submodule, torch.nn.Module):
                     # Torch retrace module
-                    assert (
-                        not settings.use_python_runtime
-                    ), "Refitting a torch retraced module is only supported with use_python_runtime=False"
+                    assert not isinstance(
+                        compiled_submodule.engine,
+                        TRTEngine,
+                    ), (
+                        "Refitting a torch retraced module is only supported when "
+                        "the engine uses the C++ Torch-TensorRT runtime"
+                    )
                     encoded_metadata = [
                         engine for name, engine in compiled_submodules
@@ -533,10 +531,10 @@ def refit_module_weights(
                     "This engine does not have a weight map cache. Rebuilding the weight map"
                 )
 
-            # Rexporting the TRT compiled graph module and loading it back doesn't preserve the instance type and registers
-            # the compiled submodule as torch.nn.Module. So we use settings.use_python_runtime to determine the instance type.
-            if settings.use_python_runtime:
-                engine = compiled_submodule.engine
+            # Re-exporting the TRT compiled graph module and loading it back doesn't preserve
+            # the instance type; choose the engine handle based on the actual engine object.
+ if isinstance(compiled_submodule.engine, TRTEngine): + engine = compiled_submodule.engine.cuda_engine else: engine_info = compiled_submodule.engine.__getstate__()[0] engine = get_engine_from_encoded_engine( @@ -592,12 +590,17 @@ def refit_module_weights( serialization_config.set_flag(trt.SerializationFlag.INCLUDE_REFIT) serialized_engine = engine.serialize_with_config(serialization_config) - if isinstance(compiled_submodule, PythonTorchTensorRTModule): - compiled_submodule.serialized_engine = bytes(serialized_engine) - elif isinstance(compiled_submodule, TorchTensorRTModule): - compiled_submodule.engine = None # Clear the engine for TorchTensorRTModule, otherwise it won't be updated - compiled_submodule.serialized_engine = bytes(serialized_engine) - compiled_submodule.setup_engine() + if isinstance(compiled_submodule, TorchTensorRTModule): + new_serialized_engine = bytes(serialized_engine) + compiled_submodule.serialized_engine = new_serialized_engine + if isinstance(compiled_submodule.engine, TRTEngine): + # Refit already updated ``cuda_engine`` in place; avoid deserialize (slow). + py_eng = compiled_submodule.engine + py_eng.serialized_info[ENGINE_IDX] = new_serialized_engine + py_eng.serialized_engine = new_serialized_engine + else: + compiled_submodule.engine = None + compiled_submodule.setup_engine() elif inline_module: new_engine_info = list(engine_info) new_engine_info[ENGINE_IDX] = bytes(serialized_engine) diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index c7ef3eed9b..3fe18e0a0d 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -73,9 +73,6 @@ class CompilationSettings: version_compatible (bool): Provide version forward-compatibility for engine plan files optimization_level (Optional[int]): Builder optimization 0-5, higher levels imply longer build time, searching for more optimization options. TRT defaults to 3 - use_python_runtime (Optional[bool]): Whether to strictly use Python runtime or C++ runtime. To auto-select a runtime - based on C++ dependency presence (preferentially choosing C++ runtime if available), leave the - argument as None truncate_double (bool): Whether to truncate float64 TRT engine inputs or weights to float32 use_fast_partitioner (bool): Whether to use the fast or global graph partitioning system enable_experimental_decompositions (bool): Whether to enable all core aten decompositions @@ -121,6 +118,7 @@ class CompilationSettings: dynamically_allocate_resources (bool): Dynamically allocate resources for TensorRT engines decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True. attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False. + use_python_runtime (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). When ``False`` (default) the C++ runtime is used if available and the Python runtime is used as a fallback otherwise. 
""" workspace_size: int = WORKSPACE_SIZE @@ -130,7 +128,6 @@ class CompilationSettings: max_aux_streams: Optional[int] = MAX_AUX_STREAMS version_compatible: bool = VERSION_COMPATIBLE optimization_level: Optional[int] = OPTIMIZATION_LEVEL - use_python_runtime: Optional[bool] = USE_PYTHON_RUNTIME truncate_double: bool = TRUNCATE_DOUBLE use_fast_partitioner: bool = USE_FAST_PARTITIONER enable_experimental_decompositions: bool = ENABLE_EXPERIMENTAL_DECOMPOSITIONS @@ -184,6 +181,7 @@ class CompilationSettings: dynamically_allocate_resources: bool = DYNAMICALLY_ALLOCATE_RESOURCES decompose_attention: bool = DECOMPOSE_ATTENTION attn_bias_is_causal: bool = ATTN_BIAS_IS_CAUSAL + use_python_runtime: bool = USE_PYTHON_RUNTIME def __getstate__(self) -> dict[str, Any]: from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 77e48fe92e..d712d7f150 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -4,6 +4,7 @@ import logging from typing import Any, Dict, List, NamedTuple, Optional, Sequence +import tensorrt as trt import torch from torch_tensorrt._enums import dtype from torch_tensorrt._features import ENABLED_FEATURES @@ -17,7 +18,7 @@ TRTInterpreter, TRTInterpreterResult, ) -from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule +from torch_tensorrt.dynamo.runtime import TorchTensorRTModule from torch_tensorrt.dynamo.utils import ( get_cpu_memory_usage, get_output_dtypes, @@ -25,8 +26,6 @@ ) from torch_tensorrt.logging import TRT_LOGGER -import tensorrt as trt - logger = logging.getLogger(__name__) @@ -334,7 +333,7 @@ def convert_module( settings: CompilationSettings = CompilationSettings(), name: str = "", engine_cache: Optional[BaseEngineCache] = None, -) -> PythonTorchTensorRTModule | TorchTensorRTModule: +) -> TorchTensorRTModule: """Convert an FX module to a TRT module Args: module: FX GraphModule to convert @@ -343,22 +342,13 @@ def convert_module( name: TRT engine name engine_cache: Engine cache instance Returns: - PythonTorchTensorRTModule or TorchTensorRTModule + TorchTensorRTModule """ serialized_interpreter_result = interpret_module_to_result( module, inputs, settings, engine_cache=engine_cache ) - rt_cls = PythonTorchTensorRTModule - - if ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime: - from torch_tensorrt.dynamo.runtime import TorchTensorRTModule - - rt_cls = TorchTensorRTModule - - elif ( - not ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime - ): + if not ENABLED_FEATURES.torch_tensorrt_runtime: logger.info( "Since Torch-TensorRT runtime is not available, using Python Runtime, some features may not be available" ) @@ -379,7 +369,7 @@ def convert_module( "For TRT-LLM fallback, set TRTLLM_PLUGINS_PATH or USE_TRTLLM_PLUGINS=1." 
) - return rt_cls( + return TorchTensorRTModule( serialized_engine=serialized_interpreter_result.serialized_engine, input_binding_names=list(serialized_interpreter_result.input_names), output_binding_names=list(serialized_interpreter_result.output_names), diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py index 4ecabea36e..dc542363ae 100644 --- a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py @@ -74,7 +74,6 @@ def __init__( pytorch_model: torch.nn.Module, *, device: Optional[Union[Device, torch.device, str]] = _defaults.DEVICE, - use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME, immutable_weights: bool = False, strict: bool = True, prefer_deferred_runtime_asserts_over_guards: bool = False, @@ -110,7 +109,6 @@ def __init__( max_aux_streams (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global partitioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -148,7 +146,6 @@ def __init__( self.prefer_deferred_runtime_asserts_over_guards = ( prefer_deferred_runtime_asserts_over_guards ) - self.use_python_runtime = use_python_runtime self.trt_device = to_torch_tensorrt_device(device) assert ( not immutable_weights @@ -368,7 +365,6 @@ def compile(self) -> None: arg_inputs=self.arg_inputs, kwarg_inputs=self.kwarg_inputs, immutable_weights=False, - use_python_runtime=self.use_python_runtime, **self.additional_settings, ) if self.additional_settings.get("offload_module_to_cpu", False): @@ -702,9 +698,6 @@ def resursivly_deserialize_dynamic_shape(obj: Any) -> None: @staticmethod def save(module: Any, path: str) -> None: # Cast the object back to MutableTorchTensorRTModule to save - assert ( - not module.use_python_runtime - ), "Python runtime does not support serialization. Save failed."
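+        # With the unified TorchTensorRTModule runtime, serialization no longer
+        # depends on the runtime backend, so the use_python_runtime guard that
+        # previously blocked save() is gone; Python-runtime modules pickle their
+        # engines via TRTEngine.__getstate__.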
module.init_finished = False module.__class__ = MutableTorchTensorRTModule exp_program = module.exp_program diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py deleted file mode 100644 index 3c454933bb..0000000000 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ /dev/null @@ -1,1030 +0,0 @@ -from __future__ import annotations - -import logging -import os -from contextlib import nullcontext -from typing import Any, Dict, List, Optional, Sequence, Tuple - -import torch -import torch.distributed as dist -import torch_tensorrt -from torch.nn import Module -from torch_tensorrt._Device import Device -from torch_tensorrt._enums import Platform, dtype -from torch_tensorrt._features import ENABLED_FEATURES -from torch_tensorrt.dynamo._defaults import DEBUG_LOGGING_DIR -from torch_tensorrt.dynamo._settings import CompilationSettings -from torch_tensorrt.dynamo.debug._DebuggerConfig import DebuggerConfig -from torch_tensorrt.dynamo.debug._supports_debugger import cls_supports_debugger -from torch_tensorrt.dynamo.utils import DYNAMIC_DIM -from torch_tensorrt.logging import TRT_LOGGER -from torch_tensorrt.runtime._utils import ( - _is_switch_required, - _select_rt_device, - multi_gpu_device_check, -) - -# must import after torch_tensorrt to resolve tensorrt_rtx alias -import tensorrt as trt # isort: skip - -logger = logging.getLogger(__name__) - - -def _get_dynamic_shapes_kernel_strategy(strategy_str: str) -> Any: - """Map strategy string to TRT enum. Only called on RTX builds.""" - return { - "lazy": trt.DynamicShapesKernelSpecializationStrategy.LAZY, - "eager": trt.DynamicShapesKernelSpecializationStrategy.EAGER, - "none": trt.DynamicShapesKernelSpecializationStrategy.NONE, - }.get(strategy_str, trt.DynamicShapesKernelSpecializationStrategy.LAZY) - - -class DynamicOutputAllocator(trt.IOutputAllocator): # type: ignore[misc] - def __init__(self, output_dtypes: Dict[str, torch.dtype]) -> None: - trt.IOutputAllocator.__init__(self) - self.buffers: Dict[str, torch.Tensor] = {} - self.shapes: Dict[str, Tuple[int, ...]] = {} - self.dtypes: Dict[str, torch.dtype] = output_dtypes - - def reallocate_output_async( - self, - tensor_name: str, - memory: int, - size: int, - alignment: int, - stream: torch.cuda.Stream, - ) -> Any: - shape = (size,) - if tensor_name not in self.buffers: - self.buffers[tensor_name] = torch.empty( - shape, - dtype=self.dtypes[tensor_name], - device=torch.cuda.current_device(), - ) - else: - if self.buffers[tensor_name].shape != shape: - self.buffers[tensor_name] = torch.empty( - shape, - dtype=self.dtypes[tensor_name], - device=torch.cuda.current_device(), - ) - return self.buffers[tensor_name].data_ptr() - - def notify_shape(self, tensor_name: str, shape: Tuple[int, ...]) -> None: - self.shapes[tensor_name] = tuple(shape) - - -class TorchTRTRuntimeStates: - def __init__(self, new_cudagraphs: bool): - # Indicates whether CUDAGraphs were enabled in the previous execute_engine - self.old_cudagraphs = new_cudagraphs - # Indicates whether pre-allocated output was enabled in the previous execute_engine - self.old_pre_allocated_outputs = False - # Indicates whether context has changed - self.context_changed = False - - def set_runtime_states( - self, - new_cudagraphs: bool, - new_pre_allocated_output: bool, - shape_changed: bool, - ) -> Tuple[bool, bool, bool]: - # Evaluates whether certain conditions are met to enable CUDA Graph recording or to use pre-allocated outputs - # 
based on the current and previous states, as well as input shape has changed - need_cudagraphs_record = False - can_use_pre_allocated_outputs = False - need_cudagraphs_reset = False - - # CUDA Graph recording is needed if CUDA graphs is enabled and: - # - CUDA graphs were previously disabled - # - or the shape has changed - # - or the execution context has changed (e.g., weight streaming) - if new_cudagraphs and ( - not self.old_cudagraphs or shape_changed or self.context_changed - ): - need_cudagraphs_record = True - - # Pre-allocated output can be used when previous and current state are true without shape change - if ( - self.old_pre_allocated_outputs - and new_pre_allocated_output - and (not shape_changed) - ): - can_use_pre_allocated_outputs = True - - if not new_cudagraphs or shape_changed or self.context_changed: - need_cudagraphs_reset = True - - self.old_cudagraphs = new_cudagraphs - self.old_pre_allocated_outputs = new_pre_allocated_output - # reset flag - self.context_changed = False - - return ( - need_cudagraphs_record, - can_use_pre_allocated_outputs, - need_cudagraphs_reset, - ) - - -@cls_supports_debugger -class PythonTorchTensorRTModule(Module): # type: ignore[misc] - """PythonTorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine. - - This module is backed by the Torch-TensorRT runtime and is only compatible with - FX / Dynamo / Python deployments. This module cannot be serialized to torchscript via torch.jit.trace for C++ deployment. - """ - - def __init__( - self, - serialized_engine: Optional[bytes] = None, - input_binding_names: Optional[List[str]] = None, - output_binding_names: Optional[List[str]] = None, - *, - name: str = "", - settings: CompilationSettings = CompilationSettings(), - weight_name_map: Optional[dict[Any, Any]] = None, - requires_output_allocator: bool = False, - requires_native_multidevice: bool = False, - symbolic_shape_expressions: Optional[Dict[str, List[Dict[str, Any]]]] = None, - _debugger_config: Optional[DebuggerConfig] = None, - ): - """Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs - a PyTorch ``torch.nn.Module`` around it. Uses TensorRT Python APIs to run the engine - - Arguments: - serialized_engine (bytes): Serialized TensorRT engine in the form of a bytearray - input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules - output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned - - Keyword Arguments: - name (str): Name for module - settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed - weight_name_map (dict): Mapping of engine weight name to state_dict weight name - requires_output_allocator (bool): Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators) - requires_native_multidevice (bool): Boolean flag indicating if the converter creates operators which require multiple devices to run (e.g. multi-device collective operations) - symbolic_shape_expressions (List[str]): List of symbolic shape expressions for each output binding - Example: - - .. 
code-block:: py - - trt_module = PythonTorchTensorRTModule( - engine_str, - input_binding_names=["x"], - output_binding_names=["output"], - name="my_module", - settings=CompilationSettings(device=torch.cuda.current_device) - ) - - """ - self.context: Any - self._debugger_config: Optional[DebuggerConfig] = _debugger_config - super(PythonTorchTensorRTModule, self).__init__() - self._register_state_dict_hook(PythonTorchTensorRTModule._on_state_dict) - - # Run multi-gpu device check to validate engine instantiation - multi_gpu_device_check() - - self.name = name - self._input_buffers: List[torch.Tensor] = [] - self._output_buffers: List[torch.Tensor] = [] - self.cudagraph: Optional[torch.cuda.CUDAGraph] = None - self._caller_stream: Optional[torch.cuda.Stream] = None - self._engine_stream: Optional[torch.cuda.Stream] = None - - # TODO: Make the below a Dictionary {shape: cudagraph} - self.shape_key: Optional[str] = None - - # See https://github.com/pytorch/pytorch/blob/acfe237a71af609e837a34bb38048aa8acb8eb4d/torch/cuda/graphs.py#L92-L98 - # Unused currently - to be used by Dynamic Shape support implementation - self.memory_pool = None - - self.serialized_engine = serialized_engine - self.input_names = ( - input_binding_names if input_binding_names is not None else [] - ) - self.output_names = ( - output_binding_names if output_binding_names is not None else [] - ) - self.initialized = False - self.target_device_id = ( - settings.device.gpu_id - if settings.device is not None - else Device._current_device().gpu_id - ) - self.target_device_properties = torch.cuda.get_device_properties( - self.target_device_id - ) - self.profiling_enabled = ( - _debugger_config.save_engine_profile - if _debugger_config is not None - else False - ) - self.settings = settings - self.engine = None - self.weight_name_map = weight_name_map - self.target_platform = Platform.current_platform() - self.runtime_states = TorchTRTRuntimeStates( - torch_tensorrt.runtime.get_cudagraphs_mode() - ) - - self.cudagraphs_enabled = False - self.pre_allocated_outputs: List[torch.Tensor] = [] - self.use_pre_allocated_outputs = False - - self.requires_output_allocator = requires_output_allocator - self.output_allocator: Optional[DynamicOutputAllocator] = None - self.use_output_allocator_outputs = False - self.device = torch.cuda.current_device() - self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() - # If the output tensor is not owned by the engine (output_tensors_are_unowned=True), we need to create a new output tensor in each forward pass - self.output_tensors_are_unowned = False - self.symbolic_shape_expressions = symbolic_shape_expressions - self._nccl_comm: Optional[Any] = None - self._has_nccl_ops: bool = requires_native_multidevice - - # Runtime cache state (TensorRT-RTX only) - self.runtime_config: Any = None - self.runtime_cache: Any = None - self.runtime_cache_path = settings.runtime_cache_path - - if self.serialized_engine is not None and not self.settings.lazy_engine_init: - self.setup_engine() - - def set_output_tensors_as_unowned(self, enabled: bool) -> None: - """ - Flag to set if the output tensors of this engine are solely owned by the Torch-TensorRT Runtime or if they might be shared with a user. - If the tensors are not owned by the runtime, then they must be recreated on every forward call which may have implications for performance. 
- Typically only the final engine in a graph requires output tensors to be unowned and there are performance gains to be had for intermediate engines to manage their own standing memory. - Therefore this should only be set to True for the final module in a graph and leave false for intermediate modules. - - Args: - enabled: bool - Whether to set the flag to True. - - """ - self.output_tensors_are_unowned = enabled - - def get_streamable_device_memory_budget(self) -> Any: - return self.engine.streamable_weights_size - - def get_automatic_device_memory_budget(self) -> Any: - return self.engine.get_weight_streaming_automatic_budget() - - def get_device_memory_budget(self) -> Any: - return self.engine.weight_streaming_budget_v2 - - def set_device_memory_budget(self, budget_bytes: int) -> int: - # Recreating the context because weight streaming budget cannot be modified while there are active context. - if self.context is not None: - del self.context - budget_bytes = self._set_device_memory_budget(budget_bytes) - self.context = self._create_context() - self.runtime_states.context_changed = True - return budget_bytes - - def _set_device_memory_budget(self, budget_bytes: int) -> int: - # Disable weight streaming for invalid budget size - if budget_bytes < 0: - budget_bytes = self.get_streamable_device_memory_budget() - self.engine.weight_streaming_budget_v2 = budget_bytes - if self.engine.weight_streaming_budget_v2 != budget_bytes: - logger.error(f"Failed to set weight streaming budget to {budget_bytes}") - budget_bytes = self.engine.weight_streaming_budget_v2 - if self.get_streamable_device_memory_budget() == budget_bytes: - logger.warning("Weight streaming is disabled") - - return budget_bytes - - def set_default_device_memory_budget(self) -> int: - budget_bytes = self.get_automatic_device_memory_budget() - # Set automatic weight streaming budget as default when context is created - logger.debug(f"Weight streaming budget set to {budget_bytes}B") - return self._set_device_memory_budget(budget_bytes) - - # Distributed functions - @property - def is_distributed(self) -> bool: - """Check if this module is running inside an active distributed context.""" - return bool( - dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1 - ) - - def setup_nccl_comm(self) -> None: - """Set up NCCL communicator from the active ProcessGroup. - - Uses the process group set by torch_tensorrt.distributed.distributed_context() if - active, otherwise falls back to the default world group. - Called lazily on first forward pass for distributed engines. - """ - from torch_tensorrt.distributed._distributed import get_active_group - - if not self.is_distributed: - return - - pg = get_active_group() - if pg is None or dist.get_backend(pg) != "nccl": - raise RuntimeError( - "Active ProcessGroup must use NCCL backend. " - "Use torch_tensorrt.distributed.distributed_context(group) to select a non-default group." - ) - - backend = pg._get_backend(torch.device("cuda")) - - # Force NCCL communicator initialization with a dummy collective - # Must use group=pg so the correct group's comm is initialized - # dist.all_reduce without group= only initializes the default world group. 
- dummy = torch.zeros(1, device="cuda") - dist.all_reduce(dummy, group=pg) - - comm_ptr = backend._comm_ptr() - if comm_ptr is None or comm_ptr == 0: - raise RuntimeError("Failed to get NCCL communicator from ProcessGroup") - - self._nccl_comm = comm_ptr - - # Bind communicator to TRT execution context (PyCapsule required by TRT Python API) - if self.context is not None: - import ctypes - - ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object - ctypes.pythonapi.PyCapsule_New.argtypes = [ - ctypes.c_void_p, - ctypes.c_char_p, - ctypes.c_void_p, - ] - comm_capsule = ctypes.pythonapi.PyCapsule_New(comm_ptr, None, None) - ok = self.context.set_communicator(comm_capsule) - if not ok: - raise RuntimeError( - f"TRT context.set_communicator() returned False for rank={dist.get_rank()}. " - f"comm_ptr={comm_ptr:#x}. Failed to bind NCCL communicator to TRT execution context." - ) - - logger.info( - f"NCCL comm set up (rank={dist.get_rank()}, world_size={dist.get_world_size()})" - ) - - def setup_engine(self) -> None: - assert ( - self.target_platform == Platform.current_platform() - ), f"TensorRT engine was not built to target current platform (target: {self.target_platform}, current: {Platform.current_platform()})" - - self.initialized = True - runtime = trt.Runtime(TRT_LOGGER) - self.engine = runtime.deserialize_cuda_engine(self.serialized_engine) - if self.settings.enable_weight_streaming: - self.set_default_device_memory_budget() - self.context = self.engine.create_execution_context() - - if self._has_nccl_ops: - from torch_tensorrt.distributed._nccl_utils import ( - check_nccl_engine_requirements, - ) - - check_nccl_engine_requirements() - - # For engines with native NCCL collective layers, all ranks must - # have a live IExecutionContext before any rank executes a - # collective. Barrier here so a fast-compiling rank does not race - # ahead and issue an NCCL op while another rank is still inside - # deserialize_cuda_engine / create_execution_context. - if ( - dist.is_available() - and dist.is_initialized() - and dist.get_world_size() > 1 - ): - logger.debug( - "Barrier after execution context creation (distributed NCCL engine)" - ) - dist.barrier() - - if ENABLED_FEATURES.tensorrt_rtx: - self._setup_runtime_config() - - self.context = self._create_context() - assert self.context is not None, "Failed to create execution context" - assert self.engine.num_io_tensors == ( - len(self.input_names) + len(self.output_names) - ) - - self.input_dtypes = [ - dtype._from(self.engine.get_tensor_dtype(input_name)) - for input_name in self.input_names - ] - - self.input_shapes = [ - self.engine.get_tensor_shape(input_name) for input_name in self.input_names - ] - self.output_dtypes = [ - dtype._from(self.engine.get_tensor_dtype(output_name)).to(torch.dtype) - for output_name in self.output_names - ] - self.output_shapes = [ - self.engine.get_tensor_shape(output_name) - for output_name in self.output_names - ] - - if self.requires_output_allocator: - self.create_output_allocator() - - if torch_tensorrt.runtime.get_cudagraphs_mode(): - self.cudagraph = torch.cuda.CUDAGraph() - - self.is_shape_inference_io = { - input_name: self.engine.is_shape_inference_io(input_name) - for input_name in self.input_names - } - - def _setup_runtime_config(self) -> None: - """Create a RuntimeConfig with runtime cache for TensorRT-RTX. - - The runtime cache stores JIT compilation results to avoid repeated - compilation of kernels/graphs across inference runs. 
- """ - self.runtime_config = self.engine.create_runtime_config() - self.runtime_config.set_execution_context_allocation_strategy( - trt.ExecutionContextAllocationStrategy.STATIC - ) - self.runtime_config.dynamic_shapes_kernel_specialization_strategy = ( - _get_dynamic_shapes_kernel_strategy( - self.settings.dynamic_shapes_kernel_specialization_strategy - ) - ) - logger.info( - f"Dynamic shapes kernel specialization strategy: {self.settings.dynamic_shapes_kernel_specialization_strategy}" - ) - self.runtime_cache = self.runtime_config.create_runtime_cache() - self._load_runtime_cache() - self.runtime_config.set_runtime_cache(self.runtime_cache) - logger.info("TensorRT-RTX runtime cache configured") - - def _create_context(self) -> trt.IExecutionContext: - """Create an execution context, using RuntimeConfig for RTX.""" - if ENABLED_FEATURES.tensorrt_rtx and self.runtime_config is not None: - return self.engine.create_execution_context(self.runtime_config) - return self.engine.create_execution_context() - - def _load_runtime_cache(self) -> None: - """Load runtime cache from disk if it exists (with shared file lock).""" - if self.runtime_cache is None: - return - if not os.path.isfile(self.runtime_cache_path): - logger.debug(f"No existing runtime cache at {self.runtime_cache_path}") - return - try: - from filelock import FileLock - - lock = FileLock(self.runtime_cache_path + ".lock") - with lock.acquire(timeout=10): - with open(self.runtime_cache_path, "rb") as f: - data = f.read() - if data: - self.runtime_cache.deserialize(data) - logger.info(f"Loaded runtime cache from {self.runtime_cache_path}") - except Exception as e: - logger.warning(f"Failed to load runtime cache: {e}") - - def _save_runtime_cache(self) -> None: - """Save runtime cache to disk (with exclusive file lock).""" - if self.runtime_cache is None: - return - try: - host_mem = self.runtime_cache.serialize() - if host_mem is None: - return - os.makedirs(os.path.dirname(self.runtime_cache_path), exist_ok=True) - - from filelock import FileLock - - lock = FileLock(self.runtime_cache_path + ".lock") - with lock.acquire(timeout=10): - with open(self.runtime_cache_path, "wb") as f: - f.write(memoryview(host_mem)) - logger.info(f"Saved runtime cache to {self.runtime_cache_path}") - except Exception as e: - logger.warning(f"Failed to save runtime cache: {e}") - - def _check_initialized(self) -> None: - if not self.initialized: - raise RuntimeError("PythonTorchTensorRTModule is not initialized.") - - def _on_state_dict(self, state_dict: Dict[str, Any], prefix: str, _: Any) -> None: - state_dict[prefix + "engine"] = self.serialized_engine - state_dict[prefix + "input_names"] = self.input_names - state_dict[prefix + "output_names"] = self.output_names - state_dict[prefix + "platform"] = self.target_platform - - def _load_from_state_dict( - self, - state_dict: Dict[str, Any], - prefix: str, - local_metadata: Any, - strict: Any, - missing_keys: Any, - unexpected_keys: Any, - error_msgs: Any, - ) -> None: - self.serialized_engine = state_dict[prefix + "engine"] - self.input_names = state_dict[prefix + "input_names"] - self.output_names = state_dict[prefix + "output_names"] - self.target_platform = state_dict[prefix + "platform"] - - # Same rationale as __setstate__: ensure these exist before - # setup_engine() so __del__ -> _save_runtime_cache() is safe even - # if a future caller invokes this without __init__ having run. 
- self.runtime_config = None - self.runtime_cache = None - - # Run multi-gpu device check to validate engine instantiation - multi_gpu_device_check() - self.setup_engine() - - def __getstate__(self) -> Dict[str, Any]: - state = self.__dict__.copy() - state.pop("engine", None) - state.pop("context", None) - # NCCLcomm cannot be pickled - state.pop("_nccl_comm", None) - state.pop("runtime_config", None) - state.pop("runtime_cache", None) - return state - - def __setstate__(self, state: Dict[str, Any]) -> None: - self.__dict__.update(state) - # reset after unpickling, apbose: is this required though? - self._nccl_comm = None - # __getstate__ pops these; re-initialize before setup_engine() so - # __del__ -> _save_runtime_cache() can always read them, including - # on standard (non-RTX) TRT where setup_engine() does not call - # _setup_runtime_config(). - self.runtime_config = None - self.runtime_cache = None - self.setup_engine() - - def __deepcopy__(self, memo: Any) -> PythonTorchTensorRTModule: - cls = self.__class__ - result = cls.__new__(cls) - memo[id(self)] = result - result.__setstate__(self.__getstate__()) - return result - - def _reset_captured_graph(self) -> None: - if self.cudagraph: - self.cudagraph.reset() - self.cudagraph = None - - def __del__(self) -> None: - self._save_runtime_cache() - self._reset_captured_graph() - - def setup_input_tensors( - self, - contiguous_inputs: List[torch.Tensor], - cudagraphs_enabled: bool, - need_cudagraphs_record: bool, - ) -> None: - for i, input_name in enumerate(self.input_names): - if not contiguous_inputs[i].is_cuda: - logger.warning( - f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. " - "This tensor is being moved by the runtime but for performance considerations, " - "ensure your inputs are all on GPU and open an issue here " - "(https://github.com/pytorch/TensorRT/issues) if this warning persists." - ) - contiguous_inputs = ( - contiguous_inputs[:i] - + [contiguous_inputs[i].cuda()] - + contiguous_inputs[i + 1 :] - ) - - assert ( - contiguous_inputs[i].dtype == self.input_dtypes[i] - ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}." 
- - if need_cudagraphs_record: - # If cudagraphs is enabled, this memory is reserved for future cudagraph runs - # Clone is required to avoid re-using user-provided GPU memory - self._input_buffers[i] = contiguous_inputs[i].clone() - - # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers - # as per TensorRT requirements - if self.is_shape_inference_io[input_name]: - # Shape tensor inputs are casted to int64 explicitly - # Currently Torch CPU pointers are not working; numpy pointers are used instead - # to refer to underlying memory - inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy() - self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data) - else: - self.context.set_input_shape( - input_name, tuple(contiguous_inputs[i].shape) - ) - tensor_to_bind = contiguous_inputs[i] - if tensor_to_bind.numel() == 0: - # Use a single persistent placeholder for empty tensors (allocated once, reused) - if not hasattr(self, "_empty_tensor_placeholder"): - self._empty_tensor_placeholder = torch.empty( - 1, - dtype=tensor_to_bind.dtype, - device=torch.cuda.current_device(), - ) - tensor_to_bind = self._empty_tensor_placeholder - if cudagraphs_enabled: - self._input_buffers[i].copy_(contiguous_inputs[i]) - self.context.set_tensor_address( - input_name, self._input_buffers[i].data_ptr() - ) - else: - self.context.set_tensor_address( - input_name, tensor_to_bind.data_ptr() - ) - - def create_output_tensors(self) -> List[torch.Tensor]: - # create output tensors - outputs: List[torch.Tensor] = [] - - for o, _ in enumerate(self.output_names): - output = torch.empty( - size=self.output_shapes[o], - dtype=self.output_dtypes[o], - device=self.device, - ) - outputs.append(output) - return outputs - - def set_pre_allocated_outputs(self, enable: bool) -> None: - self.use_pre_allocated_outputs = enable - - def set_use_output_allocator(self, enable: bool) -> None: - self.use_output_allocator_outputs = enable - - def create_output_allocator(self) -> None: - if self.output_allocator is None: - output_dtypes_dict = {} - for o, output_name in enumerate(self.output_names): - output_dtypes_dict[output_name] = self.output_dtypes[o] - self.output_allocator = DynamicOutputAllocator(output_dtypes_dict) - - def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]: - def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]: - shape_changed = self.validate_input_shapes(contiguous_inputs) - ( - need_cudagraphs_record, - can_use_pre_allocated_outputs, - need_cudagraphs_reset, - ) = self.runtime_states.set_runtime_states( - self.cudagraphs_enabled, self.use_pre_allocated_outputs, shape_changed - ) - - if need_cudagraphs_reset: - self._reset_captured_graph() - - if need_cudagraphs_record: - self._input_buffers = [None] * len(self.input_names) - self._output_buffers = [None] * len(self.output_names) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:ProcessInputs" - ) - if self.profiling_enabled - else nullcontext() - ): - assert len(contiguous_inputs) == len( - self.input_names - ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}." - - self.setup_input_tensors( - contiguous_inputs, self.cudagraphs_enabled, need_cudagraphs_record - ) - - if shape_changed: - # Check if input shapes can be inferred. 
- uninferred_input_names = self.context.infer_shapes() - if uninferred_input_names: - logger.warning( - f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior. \ - This could happen if the input tensor addresses/shapes haven't been configured correctly" - ) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:ProcessOutputs" - ) - if self.profiling_enabled - else nullcontext() - ): - if can_use_pre_allocated_outputs: - outputs = self.pre_allocated_outputs - else: - self.output_shapes = [ - tuple(self.context.get_tensor_shape(output_name)) - for output_name in self.output_names - ] - if DYNAMIC_DIM in self.output_shapes: - raise ValueError( - "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported." - ) - outputs = self.create_output_tensors() - - for o, output_name in enumerate(self.output_names): - if need_cudagraphs_record: - self._output_buffers[o] = outputs[o].clone() - - if self.cudagraphs_enabled: - self.context.set_tensor_address( - output_name, self._output_buffers[o].data_ptr() - ) - else: - self.context.set_tensor_address( - output_name, outputs[o].data_ptr() - ) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:TensorRTRuntime" - ) - if self.profiling_enabled - else nullcontext() - ): - self._caller_stream = torch.cuda.current_stream() - if ( - self._engine_stream == torch.cuda.default_stream() - or self._engine_stream is None - ): - self._engine_stream = torch.cuda.Stream() - - self._engine_stream.wait_stream(self._caller_stream) - - with torch.cuda.stream(self._engine_stream): - if self.cudagraphs_enabled: - if need_cudagraphs_record: - self.cudagraph = torch.cuda.CUDAGraph() - - if self.profiling_enabled: - self.cudagraph.enable_debug_mode() - - with torch.cuda.graph( - self.cudagraph, stream=self._engine_stream - ): - self.context.execute_async_v3( - self._engine_stream.cuda_stream - ) - - if self.profiling_enabled: - self.cudagraph.debug_dump( - f"{DEBUG_LOGGING_DIR}/{self.name}_cudagraph.dot" - ) - - self.cudagraph.replay() # type: ignore - - else: - self.context.execute_async_v3(self._engine_stream.cuda_stream) - - self._caller_stream.wait_stream(self._engine_stream) - - # When the pre-allocated output mode is turned on, for intermediate modules, we only create the output in the first execution or when shape is changed. - if self.use_pre_allocated_outputs and ( - self.output_tensors_are_unowned - or not self.pre_allocated_outputs - or shape_changed - ): - self.pre_allocated_outputs = self.create_output_tensors() - - if self.cudagraphs_enabled: - for idx, o in enumerate(outputs): - o.copy_(self._output_buffers[idx]) - - if len(outputs) == 1: - return outputs[0] - - return outputs - - def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]: - assert ( - not torch_tensorrt.runtime.get_cudagraphs_mode() - ), "CUDA Graphs are not compatible with OutputAllocator." - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:ProcessInputs" - ) - if self.profiling_enabled - else nullcontext() - ): - assert len(contiguous_inputs) == len( - self.input_names - ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}." 
- - self.setup_input_tensors(contiguous_inputs, False, False) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:SetupOutputAllocator" - ) - if self.profiling_enabled - else nullcontext() - ): - self.create_output_allocator() - # need to set output allocator every run - for output_name in self.output_names: - if not self.context.set_output_allocator( - output_name, self.output_allocator - ): - raise RuntimeError( - f"Failed to set output allocator for {output_name}" - ) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:TensorRTRuntime" - ) - if self.profiling_enabled - else nullcontext() - ): - self._caller_stream = torch.cuda.current_stream() - if ( - self._engine_stream == torch.cuda.default_stream() - or self._engine_stream is None - ): - self._engine_stream = torch.cuda.Stream() - - self._engine_stream.wait_stream(self._caller_stream) - - with torch.cuda.stream(self._engine_stream): - self.context.execute_async_v3( - self._engine_stream.cuda_stream - ) # The OutputAllocator is called by execute_async_v3() - - self._caller_stream.wait_stream(self._engine_stream) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:ProcessOutputs" - ) - if self.profiling_enabled - else nullcontext() - ): - outputs = [] - assert self.output_allocator is not None - for o, output_name in enumerate(self.output_names): - shape = self.output_allocator.shapes.get(output_name, None) - dtype = self.output_dtypes[o] - output = ( - self.output_allocator.buffers.get(output_name, None) - .clone() - .detach() - ) - prod = int(torch.prod(torch.tensor(shape))) - # When using the OutputAllocator, the allocated buffer might be larger than the size of the output, - # so we need to reshape the buffer to the output shape - output = output.reshape(-1).view(dtype)[:prod].reshape(shape) - outputs.append(output) - - if len(outputs) == 1: - return outputs[0] - - return outputs - - self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() - - # Run forward function - contiguous_inputs: List[torch.Tensor] = [ - (i.contiguous() if isinstance(i, torch.Tensor) else torch.tensor(i).cuda()) - for i in inputs - ] - with ( - torch.autograd.profiler.record_function("PythonTorchTensorRTModule:Forward") - if self.profiling_enabled - else nullcontext() - ): - self._check_initialized() - - if self._has_nccl_ops and self._nccl_comm is None: - nccl_type = ( - "native TRT collectives" - if ENABLED_FEATURES.native_trt_collectives - else ( - "TRT-LLM NCCL plugins" - if ENABLED_FEATURES.trtllm_for_nccl - else "unknown backend" - ) - ) - logger.info( - f"Setting up NCCL for distributed execution using {nccl_type} " - f"(rank={dist.get_rank()}, world_size={dist.get_world_size()})" - ) - self.setup_nccl_comm() - logger.info(f"NCCL setup complete, comm={self._nccl_comm}") - - # If in safe mode, check at each iteration for whether a switch is required - if ( - torch_tensorrt.runtime._multi_device_safe_mode._PY_RT_MULTI_DEVICE_SAFE_MODE - ): - curr_device_id = torch.cuda.current_device() - curr_device_properties = torch.cuda.get_device_properties( - curr_device_id - ) - logger.debug(f"Current Device: cuda:{curr_device_id}") - - # If a switch is required, move all inputs to new device and set as active device - if _is_switch_required( - curr_device_id, - self.target_device_id, - curr_device_properties, - self.target_device_properties, - ): - device_id, _ = _select_rt_device( - curr_device_id, - self.target_device_id, - self.target_device_properties, 
- ) - - # Update current device - device = torch.device(device_id) - torch.cuda.set_device(device_id) - - contiguous_inputs = [ - tensor.to(device) for tensor in contiguous_inputs - ] - logger.warning(f"Moved all input Tensors to cuda:{device_id}") - - if self.requires_output_allocator: # engine requires OA - if self.cudagraphs_enabled: - raise RuntimeError( - "The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs." - ) - logger.debug("Using the dynamic allocator runtime mode.") - return run_output_allocator() - else: - if self.use_output_allocator_outputs: # users call OA context manager - if self.cudagraphs_enabled: - raise RuntimeError( - "Both CUDA Graphs and dynamic output allocation are enabled, which are incompatible runtime modes. Please disable one of the two." - ) - logger.debug("Using the dynamic allocator runtime mode.") - return run_output_allocator() - else: - logger.debug( - f"Using the standard execution runtime mode with cudagraphs={self.cudagraphs_enabled}." - ) - return run_standard_execution() - - def enable_profiling(self, profiler: "trt.IProfiler" = None) -> None: - """ - Enable TensorRT profiling. After calling this function, TensorRT will report - time spent on each layer in stdout for each forward run. - """ - self._check_initialized() - - if not self.context.profiler: - self.context.profiler = trt.Profiler() if profiler is None else profiler - - self.profiling_enabled = True - - def disable_profiling(self) -> None: - """ - Disable TensorRT profiling. - """ - self._check_initialized() - torch.cuda.synchronize() - del self.context - self.context = self._create_context() - self.profiling_enabled = False - - def get_layer_info(self) -> str: - """ - Get layer info of the engine. Only support for TRT > 8.2. - """ - inspector = self.engine.create_engine_inspector() - engine_json: str = inspector.get_engine_information( - trt.LayerInformationFormat.JSON - ) - return engine_json - - def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: - """ - Validates the input shapes of the forward function has changed - """ - # Representation of input shapes to a given model - # Shapes are concatenated as so: - # x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5) - if not all(isinstance(t, torch.Tensor) for t in inputs): - return True - - new_shape_key = "".join( - str(tuple(t.shape)).replace(" ", "") - for t in inputs - if isinstance(t, torch.Tensor) - ) - - # If the new shape key differs from the existing one, - # invalidate the old shape key and remove the CUDAGraph - if new_shape_key != self.shape_key: - logger.debug(f"Input shape changed {self.shape_key} -> {new_shape_key}") - self.shape_key = new_shape_key - return True - - return False - - def are_output_tensors_unowned(self) -> bool: - return self.output_tensors_are_unowned diff --git a/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py b/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py new file mode 100644 index 0000000000..74d363752f --- /dev/null +++ b/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py @@ -0,0 +1,933 @@ +"""Python-side TensorRT engine: deserialize and execute TRT engines without the C++ runtime. + +Serialization layout lives in :mod:`torch_tensorrt.dynamo.runtime._serialized_engine_layout`. 
+When the C++ Torch-TensorRT runtime is unavailable, :class:`TRTEngine` is registered as an +opaque type and ``tensorrt::execute_engine`` is registered as a Python custom op so that the +same compiled graph can run on either the C++ or Python runtime transparently. +""" + +from __future__ import annotations + +import base64 +import copy +import logging +import pickle +import tempfile +from contextlib import nullcontext +from types import SimpleNamespace +from typing import Any, ContextManager, Dict, List, Optional, Sequence, Tuple, cast + +import torch +import torch.distributed as dist +import torch_tensorrt +from torch._library.opaque_object import register_opaque_type +from torch._opaque_base import OpaqueBase +from torch_tensorrt._enums import dtype +from torch_tensorrt._features import ENABLED_FEATURES +from torch_tensorrt.dynamo._defaults import DEBUG_LOGGING_DIR +from torch_tensorrt.dynamo._settings import CompilationSettings +from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ( + ABI_TARGET_IDX, + DEVICE_IDX, + ENGINE_IDX, + HW_COMPATIBLE_IDX, + INPUT_BINDING_NAMES_IDX, + NAME_IDX, + OUTPUT_BINDING_NAMES_IDX, + REQUIRES_NATIVE_MULTIDEVICE_IDX, + REQUIRES_OUTPUT_ALLOCATOR_IDX, + RESOURCE_ALLOCATION_STRATEGY_IDX, + SERIALIZATION_LEN, + SERIALIZED_METADATA_IDX, + TARGET_PLATFORM_IDX, + SerializedTensorRTEngineFmt, + deserialize_binding_names, + parse_device_info, +) +from torch_tensorrt.logging import TRT_LOGGER +from torch_tensorrt.runtime._utils import ( + _is_switch_required, + _select_rt_device, + multi_gpu_device_check, +) + +# must import after torch_tensorrt to resolve tensorrt_rtx alias +import tensorrt as trt # isort: skip + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# TRT I/O helpers +# --------------------------------------------------------------------------- + + +class DynamicOutputAllocator(trt.IOutputAllocator): # type: ignore[misc] + def __init__(self, output_dtypes: Dict[str, torch.dtype]) -> None: + trt.IOutputAllocator.__init__(self) + self.buffers: Dict[str, torch.Tensor] = {} + self.shapes: Dict[str, Tuple[int, ...]] = {} + self.dtypes: Dict[str, torch.dtype] = output_dtypes + + def reallocate_output_async( + self, + tensor_name: str, + memory: int, + size: int, + alignment: int, + stream: torch.cuda.Stream, + ) -> Any: + shape = (size,) + if tensor_name not in self.buffers or self.buffers[tensor_name].shape != shape: + self.buffers[tensor_name] = torch.empty( + shape, + dtype=self.dtypes[tensor_name], + device=torch.cuda.current_device(), + ) + return self.buffers[tensor_name].data_ptr() + + def notify_shape(self, tensor_name: str, shape: Tuple[int, ...]) -> None: + self.shapes[tensor_name] = tuple(shape) + + +class TorchTRTRuntimeStates: + """Tracks CUDA graph / pre-allocated-output state across invocations.""" + + def __init__(self, new_cudagraphs: bool): + self.old_cudagraphs = new_cudagraphs + self.old_pre_allocated_outputs = False + self.context_changed = False + + def set_runtime_states( + self, + new_cudagraphs: bool, + new_pre_allocated_output: bool, + shape_changed: bool, + ) -> Tuple[bool, bool, bool]: + need_cudagraphs_record = False + can_use_pre_allocated_outputs = False + need_cudagraphs_reset = False + + if new_cudagraphs and ( + not self.old_cudagraphs or shape_changed or self.context_changed + ): + need_cudagraphs_record = True + + if ( + self.old_pre_allocated_outputs + and new_pre_allocated_output + and (not shape_changed) + ): + 
can_use_pre_allocated_outputs = True + + if not new_cudagraphs or shape_changed or self.context_changed: + need_cudagraphs_reset = True + + self.old_cudagraphs = new_cudagraphs + self.old_pre_allocated_outputs = new_pre_allocated_output + self.context_changed = False + + return ( + need_cudagraphs_record, + can_use_pre_allocated_outputs, + need_cudagraphs_reset, + ) + + +# --------------------------------------------------------------------------- +# Pickle reconstruction — returns the right engine type for the current runtime +# --------------------------------------------------------------------------- + +# TODO: Uncomment this when cross serialization is enabled + +# def _reconstruct_trt_engine(serialized_info: List[Any]) -> Any: +# """Reconstruct a TRT engine from its serialized info list. + +# Called by pickle when deserializing a ``TRTEngine``. Checks which runtime +# is available and returns either a C++ ``torch.classes.tensorrt.Engine`` or +# a Python ``TRTEngine``, so a single ``.pt2`` artifact is portable across +# runtimes. +# """ +# serialized_info = list(serialized_info) +# engine_field = serialized_info[ENGINE_IDX] +# if isinstance(engine_field, str): +# serialized_info[ENGINE_IDX] = base64.b64decode(engine_field.encode("utf-8")) +# elif isinstance(engine_field, bytes) and not engine_field.startswith(b"ftrt"): +# serialized_info[ENGINE_IDX] = base64.b64decode(engine_field) + +# if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime: +# return torch.classes.tensorrt.Engine(tuple(serialized_info)) + +# return TRTEngine(serialized_info) + + +# class EngineSerializer(OpaqueBase): # type: ignore[misc] +# def __init__(self, serialized_info: SerializedTensorRTEngineFmt) -> None: +# self.serialized_info = serialized_info + +# def __reduce__(self) -> Tuple[Any, Tuple[List[Any]]]: +# """Pickle protocol: delegates to :func:`_reconstruct_trt_engine`. + +# The reconstruction function checks which runtime is available at +# load time and returns either a C++ ``torch.classes.tensorrt.Engine`` +# or a Python ``TRTEngine``, so a single saved artifact works on both. +# """ +# state = list(self.serialized_info) +# state[ENGINE_IDX] = base64.b64encode(state[ENGINE_IDX]).decode("utf-8") +# return (_reconstruct_trt_engine, (state,)) + + +# --------------------------------------------------------------------------- +# TRTEngine (Python implementation) +# --------------------------------------------------------------------------- + + +class TRTEngine(OpaqueBase): # type: ignore[misc] + """TensorRT engine + execution context, driven from Python TRT APIs. + + Exposes the same surface as the C++ ``torch.classes.tensorrt.Engine`` TorchBind + class so that :class:`~torch_tensorrt.dynamo.runtime.TorchTensorRTModule` can use + either implementation without branching. When the C++ runtime is unavailable this + class is registered as an opaque type and ``tensorrt::execute_engine`` is registered + as a Python custom op pointing to :func:`execute_engine`. 
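+    Example (illustrative sketch; ``serialized_info`` must follow the slot layout in
+    ``_serialized_engine_layout``, and the op call assumes the registered custom-op
+    signature ``execute_engine(Tensor[] inputs, engine) -> Tensor[]``)::
+
+        engine = TRTEngine(serialized_info)
+        outputs = torch.ops.tensorrt.execute_engine([input_tensor], engine)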
+ """ + + # --- construction / teardown --- + + def __init__( + self, + serialized_info: SerializedTensorRTEngineFmt, + *, + profile_execution: bool = False, + ) -> None: + self._profile_execution = profile_execution + self.profile_path_prefix = tempfile.gettempdir() + self.use_pre_allocated_outputs = False + self.use_output_allocator_outputs = False + self.output_tensors_are_unowned = False + self.output_allocator: Optional[DynamicOutputAllocator] = None + self.pre_allocated_outputs: List[torch.Tensor] = [] + self._input_buffers: List[torch.Tensor] = [] + self._output_buffers: List[torch.Tensor] = [] + self._caller_stream: Optional[torch.cuda.Stream] = None + self._engine_stream: Optional[torch.cuda.Stream] = None + self.cudagraph: Optional[torch.cuda.CUDAGraph] = None + self.shape_key: Optional[str] = None + self._empty_tensor_placeholder: Optional[torch.Tensor] = None + self._dynamic_workspace: Optional[torch.Tensor] = None + self.runtime_states = TorchTRTRuntimeStates( + torch_tensorrt.runtime.get_cudagraphs_mode() + ) + self.resource_allocation_strategy = 0 + self._runtime_config = None + # NCCL communicator is bound lazily on the first forward pass for + # engines compiled with native multi-device collective layers. + self._nccl_comm: Optional[Any] = None + + self._load_serialized_info(serialized_info) + self._setup_engine() + + def __del__(self) -> None: + self.reset_captured_graph() + + def __deepcopy__(self, memo: dict[int, Any]) -> "TRTEngine": + """Rebuild from serialized layout so ``copy.deepcopy`` skips unpickleable TRT handles.""" + if id(self) in memo: + return memo[id(self)] # type: ignore + serialized_copy = copy.deepcopy(self.serialized_info, memo) + dup = type(self)(serialized_copy, profile_execution=self._profile_execution) + memo[id(self)] = dup + return dup + + def __str__(self) -> str: + return f"TRTEngine(name={self.name}, device={self.serialized_device_info})" + + def __repr__(self) -> str: + return self.__str__() + + def __getstate__(self) -> Tuple[List[Any], str]: + """Return pickle state in the same shape as C++ ``ScriptObject.__getstate__``. + + Outer tuple with a single element: the ``serialize()``-style list of string + slots, with ``ENGINE_IDX`` base64-encoded (matches ``def_pickle`` getter). + """ + serialized_info = list(self.serialized_info) + serialized_info[ENGINE_IDX] = base64.b64encode( + serialized_info[ENGINE_IDX] + ).decode("utf-8") + return (serialized_info, "TRTEngine") + + def __setstate__(self, state: Any) -> None: + """Restore from C++-matching pickle state ``(serialized_info,)``.""" + self._profile_execution = False + self.profile_path_prefix = tempfile.gettempdir() + self.use_pre_allocated_outputs = False + self.use_output_allocator_outputs = False + self.output_tensors_are_unowned = False + self.output_allocator = None + self.pre_allocated_outputs = [] + self._input_buffers = [] + self._output_buffers = [] + self._caller_stream = None + self._engine_stream = None + self.cudagraph = None + self.shape_key = None + self._empty_tensor_placeholder = None + self._dynamic_workspace = None + self.runtime_states = TorchTRTRuntimeStates( + torch_tensorrt.runtime.get_cudagraphs_mode() + ) + self.resource_allocation_strategy = 0 + self._runtime_config = None + # NCCL communicators cannot be pickled; rebind lazily on the next + # forward pass via setup_nccl_comm(). 
+ self._nccl_comm = None + + serialized_info = list(state[0]) + engine_field = serialized_info[ENGINE_IDX] + if isinstance(engine_field, str): + serialized_info[ENGINE_IDX] = base64.b64decode(engine_field.encode("utf-8")) + elif isinstance(engine_field, bytes) and not engine_field.startswith(b"ftrt"): + serialized_info[ENGINE_IDX] = base64.b64decode(engine_field) + self._load_serialized_info(serialized_info) + self._setup_engine() + + def tracing_mode(self) -> str: + """Return ``"real"`` so FakeTensor/export pass the real engine into meta kernels. + + Mirrors TorchBind ``tracing_with_real`` behavior (see + :func:`torch._library.fake_class_registry.maybe_to_fake_obj`). + """ + + return "real" + + def _load_serialized_info( + self, serialized_info: SerializedTensorRTEngineFmt + ) -> None: + if len(serialized_info) != SERIALIZATION_LEN: + raise RuntimeError( + f"Expected serialized info length {SERIALIZATION_LEN}, got {len(serialized_info)}" + ) + + self.serialized_info: SerializedTensorRTEngineFmt = list(serialized_info) + self.version = str(self.serialized_info[ABI_TARGET_IDX]) + self.name = str(self.serialized_info[NAME_IDX]).replace(".", "_") + self.serialized_device_info = str(self.serialized_info[DEVICE_IDX]) + self.serialized_engine = self.serialized_info[ENGINE_IDX] + if not isinstance(self.serialized_engine, (bytes, bytearray)): + raise TypeError("Expected serialized engine as bytes") + + self.in_binding_names = deserialize_binding_names( + str(self.serialized_info[INPUT_BINDING_NAMES_IDX]) + ) + self.out_binding_names = deserialize_binding_names( + str(self.serialized_info[OUTPUT_BINDING_NAMES_IDX]) + ) + self.hardware_compatible = bool(int(self.serialized_info[HW_COMPATIBLE_IDX])) + self.serialized_metadata = str(self.serialized_info[SERIALIZED_METADATA_IDX]) + self.serialized_target_platform = str(self.serialized_info[TARGET_PLATFORM_IDX]) + self.requires_output_allocator = bool( + int(self.serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]) + ) + self.resource_allocation_strategy = int( + self.serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX] + ) + # Mirrors the C++ TRTEngine::requires_native_multidevice field; consumed + # by TorchTensorRTModule.setup_engine() and the distributed helpers. + self.requires_native_multidevice = bool( + int(self.serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX]) + ) + # Internal alias used by the NCCL setup paths (matches the original + # _PythonTorchTensorRTModule attribute name). + self._has_nccl_ops: bool = self.requires_native_multidevice + + metadata = self.decode_metadata(self.serialized_metadata) + self.settings = metadata.get("settings", CompilationSettings()) + self.weight_name_map = metadata.get("weight_name_map") + self.symbolic_shape_expressions = metadata.get("inout_symexprs") + self.output_tensors_are_unowned = metadata.get( + "output_tensors_are_unowned", False + ) + + device_info = parse_device_info(self.serialized_device_info) + self.target_device_id = device_info["id"] + # Serialized major/minor/name only — not ``_CudaDeviceProperties`` — so deepcopy/refit + # can copy the owning ``GraphModule`` without pickle errors. 
+ self.target_device_properties = SimpleNamespace( + major=device_info["major"], + minor=device_info["minor"], + name=device_info["name"], + ) + + @staticmethod + def decode_metadata(encoded_metadata: str) -> Any: + dumped_metadata = base64.b64decode(encoded_metadata.encode("utf-8")) + return pickle.loads(dumped_metadata) + + def get_serialized_metadata(self) -> str: + return self.serialized_metadata + + def close(self) -> None: + """Release CUDA graph resources (called explicitly or via __del__).""" + self.reset_captured_graph() + + def _create_execution_context(self) -> trt.IExecutionContext: + strategy = trt.ExecutionContextAllocationStrategy.STATIC + if self.resource_allocation_strategy: + strategy = trt.ExecutionContextAllocationStrategy.USER_MANAGED + context = self.cuda_engine.create_execution_context(strategy) + assert context is not None, "Failed to create execution context" + return context + + def _setup_engine(self) -> None: + multi_gpu_device_check() + self.runtime = trt.Runtime(TRT_LOGGER) + self.cuda_engine = self.runtime.deserialize_cuda_engine(self.serialized_engine) + assert self.cuda_engine is not None, "Failed to deserialize TensorRT engine" + + if self.cuda_engine.streamable_weights_size > 0: + budget_bytes = self.cuda_engine.get_weight_streaming_automatic_budget() + logger.debug(f"Weight streaming budget set to {budget_bytes}B") + self.cuda_engine.weight_streaming_budget_v2 = budget_bytes + + self.context = self._create_execution_context() + + if self._has_nccl_ops: + from torch_tensorrt.distributed._nccl_utils import ( + check_nccl_engine_requirements, + ) + + check_nccl_engine_requirements() + + # For engines with native NCCL collective layers, all ranks must + # have a live IExecutionContext before any rank executes a + # collective. Barrier here so a fast-compiling rank does not race + # ahead and issue an NCCL op while another rank is still inside + # deserialize_cuda_engine / create_execution_context. 
+ if ( + dist.is_available() + and dist.is_initialized() + and dist.get_world_size() > 1 + ): + logger.debug( + "Barrier after execution context creation (distributed NCCL engine)" + ) + dist.barrier() + + if not self.in_binding_names and not self.out_binding_names: + input_names: List[str] = [] + output_names: List[str] = [] + for idx in range(self.cuda_engine.num_io_tensors): + bind_name = self.cuda_engine.get_tensor_name(idx) + if ( + self.cuda_engine.get_tensor_mode(bind_name) + == trt.TensorIOMode.INPUT + ): + input_names.append(bind_name) + else: + output_names.append(bind_name) + self.in_binding_names = input_names + self.out_binding_names = output_names + + self._input_buffers = [None] * len(self.in_binding_names) + self._output_buffers = [None] * len(self.out_binding_names) + self.input_dtypes = [ + dtype._from(self.cuda_engine.get_tensor_dtype(input_name)).to(torch.dtype) + for input_name in self.in_binding_names + ] + self.output_dtypes = [ + dtype._from(self.cuda_engine.get_tensor_dtype(output_name)).to(torch.dtype) + for output_name in self.out_binding_names + ] + self.output_shapes = [ + self.cuda_engine.get_tensor_shape(output_name) + for output_name in self.out_binding_names + ] + self.is_shape_inference_io = { + input_name: self.cuda_engine.is_shape_inference_io(input_name) + for input_name in self.in_binding_names + } + if self.requires_output_allocator: + self.create_output_allocator() + + # --- distributed / NCCL --- + + @property + def is_distributed(self) -> bool: + """Check if this engine is running inside an active distributed context.""" + return bool( + dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1 + ) + + def setup_nccl_comm(self) -> None: + """Set up NCCL communicator from the active ProcessGroup. + + Uses the process group set by torch_tensorrt.distributed.distributed_context() if + active, otherwise falls back to the default world group. + Called lazily on first forward pass for distributed engines. + """ + from torch_tensorrt.distributed._distributed import get_active_group + + if not self.is_distributed: + return + + pg = get_active_group() + if pg is None or dist.get_backend(pg) != "nccl": + raise RuntimeError( + "Active ProcessGroup must use NCCL backend. " + "Use torch_tensorrt.distributed.distributed_context(group) to select a non-default group." + ) + + backend = pg._get_backend(torch.device("cuda")) + + # Force NCCL communicator initialization with a dummy collective. + # Must use group=pg so the correct group's comm is initialized; + # dist.all_reduce without group= only initializes the default world group. + dummy = torch.zeros(1, device="cuda") + dist.all_reduce(dummy, group=pg) + + comm_ptr = backend._comm_ptr() + if comm_ptr is None or comm_ptr == 0: + raise RuntimeError("Failed to get NCCL communicator from ProcessGroup") + + self._nccl_comm = comm_ptr + + # Bind communicator to TRT execution context (PyCapsule required by TRT Python API) + if self.context is not None: + import ctypes + + ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object + ctypes.pythonapi.PyCapsule_New.argtypes = [ + ctypes.c_void_p, + ctypes.c_char_p, + ctypes.c_void_p, + ] + comm_capsule = ctypes.pythonapi.PyCapsule_New(comm_ptr, None, None) + ok = self.context.set_communicator(comm_capsule) + if not ok: + raise RuntimeError( + f"TRT context.set_communicator() returned False for rank={dist.get_rank()}. " + f"comm_ptr={comm_ptr:#x}. Failed to bind NCCL communicator to TRT execution context." 
+ ) + + logger.info( + f"NCCL comm set up (rank={dist.get_rank()}, world_size={dist.get_world_size()})" + ) + + # --- weight streaming (mirrors C++ engine surface) --- + + @property + def streamable_device_memory_budget(self) -> Any: + return self.cuda_engine.streamable_weights_size + + @property + def automatic_device_memory_budget(self) -> Any: + return self.cuda_engine.get_weight_streaming_automatic_budget() + + @property + def device_memory_budget(self) -> Any: + return self.cuda_engine.weight_streaming_budget_v2 + + @device_memory_budget.setter + def device_memory_budget(self, budget_bytes: int) -> None: + if budget_bytes < 0: + budget_bytes = self.streamable_device_memory_budget + self.cuda_engine.weight_streaming_budget_v2 = budget_bytes + if self.cuda_engine.weight_streaming_budget_v2 != budget_bytes: + logger.error(f"Failed to set weight streaming budget to {budget_bytes}") + self.context = self._create_execution_context() + self.runtime_states.context_changed = True + + def reset_captured_graph(self) -> None: + if self.cudagraph: + self.cudagraph.reset() + self.cudagraph = None + + def use_dynamically_allocated_resources(self, dynamic: bool = False) -> None: + new_strategy = 1 if dynamic else 0 + if self.resource_allocation_strategy == new_strategy: + return + self.resource_allocation_strategy = new_strategy + self.context = self._create_execution_context() + self.runtime_states.context_changed = True + + def set_output_tensors_as_unowned(self, enabled: bool) -> None: + self.output_tensors_are_unowned = enabled + + def are_output_tensors_unowned(self) -> bool: + return self.output_tensors_are_unowned + + # --- profiling / inspection --- + + def enable_profiling(self) -> None: + if not self.context.profiler: + self.context.profiler = trt.Profiler() + self._profile_execution = True + + def set_profile_format(self, profile_format: str) -> None: + if profile_format not in ["cudagraph", "trex", "perfetto"]: + raise ValueError(f"Invalid profile format: {profile_format}") + + def disable_profiling(self) -> None: + torch.cuda.synchronize() + self.context = self._create_execution_context() + self._profile_execution = False + self.runtime_states.context_changed = True + + def get_engine_layer_info(self) -> str: + inspector = self.cuda_engine.create_engine_inspector() + return str(inspector.get_engine_information(trt.LayerInformationFormat.JSON)) + + def dump_engine_layer_info(self) -> None: + print(self.get_engine_layer_info()) + + def dump_engine_layer_info_to_file(self, path: str) -> None: + with open(path, "w") as f: + f.write(self.get_engine_layer_info()) + + def infer_outputs(self, input_shapes: List[Any]) -> List[Any]: + """Return output shapes inferred for the given input shapes.""" + results = [] + for i, input_name in enumerate(self.in_binding_names): + if i < len(input_shapes): + self.context.set_input_shape(input_name, tuple(input_shapes[i])) + for output_name in self.out_binding_names: + results.append(tuple(self.context.get_tensor_shape(output_name))) + return results + + # --- tensor binding helpers --- + + def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: + new_shape_key = "".join(str(tuple(t.shape)).replace(" ", "") for t in inputs) + if new_shape_key != self.shape_key: + logger.debug(f"Input shape changed {self.shape_key} -> {new_shape_key}") + self.shape_key = new_shape_key + return True + return False + + def create_output_allocator(self) -> None: + if self.output_allocator is None: + self.output_allocator = DynamicOutputAllocator( + { + name: 
self.output_dtypes[idx]
+                    for idx, name in enumerate(self.out_binding_names)
+                }
+            )
+
+    def create_output_tensors(self) -> List[torch.Tensor]:
+        return [
+            torch.empty(
+                size=self.output_shapes[idx],
+                dtype=self.output_dtypes[idx],
+                device=torch.cuda.current_device(),
+            )
+            for idx, _ in enumerate(self.out_binding_names)
+        ]
+
+    def setup_input_tensors(
+        self,
+        contiguous_inputs: List[torch.Tensor],
+        cudagraphs_enabled: bool,
+        need_cudagraphs_record: bool,
+    ) -> None:
+        for i, input_name in enumerate(self.in_binding_names):
+
+            assert (
+                contiguous_inputs[i].dtype == self.input_dtypes[i]
+            ), f"Dtype mismatch for input {input_name}. Expected {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
+
+            if need_cudagraphs_record:
+                self._input_buffers[i] = contiguous_inputs[i].clone()
+
+            if self.is_shape_inference_io[input_name]:
+                inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy()
+                self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data)
+            else:
+                self.context.set_input_shape(
+                    input_name, tuple(contiguous_inputs[i].shape)
+                )
+                tensor_to_bind = contiguous_inputs[i]
+                if tensor_to_bind.numel() == 0:
+                    if self._empty_tensor_placeholder is None:
+                        self._empty_tensor_placeholder = torch.empty(
+                            1,
+                            dtype=tensor_to_bind.dtype,
+                            device=torch.cuda.current_device(),
+                        )
+                    tensor_to_bind = self._empty_tensor_placeholder
+
+                if cudagraphs_enabled:
+                    self._input_buffers[i].copy_(contiguous_inputs[i])
+                    self.context.set_tensor_address(
+                        input_name, self._input_buffers[i].data_ptr()
+                    )
+                else:
+                    self.context.set_tensor_address(
+                        input_name, tensor_to_bind.data_ptr()
+                    )
+
+    def _profile_section(self, label: str) -> ContextManager[None]:
+        if self._profile_execution:
+            return cast(
+                ContextManager[None],
+                torch.autograd.profiler.record_function(label),
+            )
+        return nullcontext()
+
+    # --- execution ---
+
+    def _execute_standard(
+        self, contiguous_inputs: List[torch.Tensor]
+    ) -> torch.Tensor | Tuple[torch.Tensor, ...]:
+        shape_changed = self.validate_input_shapes(contiguous_inputs)
+        (
+            need_cudagraphs_record,
+            can_use_pre_allocated_outputs,
+            need_cudagraphs_reset,
+        ) = self.runtime_states.set_runtime_states(
+            torch_tensorrt.runtime.get_cudagraphs_mode(),
+            self.use_pre_allocated_outputs,
+            shape_changed,
+        )
+
+        if need_cudagraphs_reset:
+            self.reset_captured_graph()
+
+        if need_cudagraphs_record:
+            self._input_buffers = [None] * len(self.in_binding_names)
+            self._output_buffers = [None] * len(self.out_binding_names)
+
+        with self._profile_section("TRTEngine:ProcessInputs"):
+            self.setup_input_tensors(
+                contiguous_inputs,
+                torch_tensorrt.runtime.get_cudagraphs_mode(),
+                need_cudagraphs_record,
+            )
+            if shape_changed:
+                uninferred_input_names = self.context.infer_shapes()
+                if uninferred_input_names:
+                    logger.warning(
+                        f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior."
+                    )
+
+        with self._profile_section("TRTEngine:ProcessOutputs"):
+            if can_use_pre_allocated_outputs:
+                outputs = self.pre_allocated_outputs
+            else:
+                self.output_shapes = [
+                    tuple(self.context.get_tensor_shape(output_name))
+                    for output_name in self.out_binding_names
+                ]
+                if any(-1 in shape for shape in self.output_shapes):
+                    raise ValueError(
+                        "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes, which are not currently supported."
+ ) + outputs = self.create_output_tensors() + + for o, output_name in enumerate(self.out_binding_names): + if need_cudagraphs_record: + self._output_buffers[o] = outputs[o].clone() + if torch_tensorrt.runtime.get_cudagraphs_mode(): + self.context.set_tensor_address( + output_name, self._output_buffers[o].data_ptr() + ) + else: + self.context.set_tensor_address(output_name, outputs[o].data_ptr()) + + with self._profile_section("TRTEngine:TensorRTRuntime"): + self._caller_stream = torch.cuda.current_stream() + if ( + self._engine_stream == torch.cuda.default_stream() + or self._engine_stream is None + ): + self._engine_stream = torch.cuda.Stream() + + self._engine_stream.wait_stream(self._caller_stream) + with torch.cuda.stream(self._engine_stream): + if self.resource_allocation_strategy: + self._dynamic_workspace = torch.empty( + self.cuda_engine.device_memory_size_v2, + dtype=torch.uint8, + device=torch.cuda.current_device(), + ) + self.context.set_device_memory(self._dynamic_workspace.data_ptr()) + + if torch_tensorrt.runtime.get_cudagraphs_mode(): + if need_cudagraphs_record: + self.cudagraph = torch.cuda.CUDAGraph() + if self._profile_execution: + self.cudagraph.enable_debug_mode() + with torch.cuda.graph( + self.cudagraph, stream=self._engine_stream + ): + self.context.execute_async_v3( + self._engine_stream.cuda_stream + ) + if self._profile_execution: + self.cudagraph.debug_dump( + f"{DEBUG_LOGGING_DIR}/{self.name}_cudagraph.dot" + ) + self.cudagraph.replay() # type: ignore[union-attr] + else: + self.context.execute_async_v3(self._engine_stream.cuda_stream) + + self._caller_stream.wait_stream(self._engine_stream) + + if self.use_pre_allocated_outputs and ( + self.output_tensors_are_unowned + or not self.pre_allocated_outputs + or shape_changed + ): + self.pre_allocated_outputs = self.create_output_tensors() + + if torch_tensorrt.runtime.get_cudagraphs_mode(): + for idx, output in enumerate(outputs): + output.copy_(self._output_buffers[idx]) + + if len(outputs) == 1: + return outputs[0] + return tuple(outputs) + + def _execute_output_allocator( + self, contiguous_inputs: List[torch.Tensor] + ) -> torch.Tensor | Tuple[torch.Tensor, ...]: + if torch_tensorrt.runtime.get_cudagraphs_mode(): + raise RuntimeError( + "Both CUDA Graphs and dynamic output allocation are enabled, which are " + "incompatible runtime modes. Please disable one of the two." 
+ ) + + with self._profile_section("TRTEngine:ProcessInputs"): + self.setup_input_tensors(contiguous_inputs, False, False) + + with self._profile_section("TRTEngine:SetupOutputAllocator"): + self.create_output_allocator() + for output_name in self.out_binding_names: + if not self.context.set_output_allocator( + output_name, self.output_allocator + ): + raise RuntimeError( + f"Failed to set output allocator for {output_name}" + ) + + with self._profile_section("TRTEngine:TensorRTRuntime"): + self._caller_stream = torch.cuda.current_stream() + if ( + self._engine_stream == torch.cuda.default_stream() + or self._engine_stream is None + ): + self._engine_stream = torch.cuda.Stream() + + self._engine_stream.wait_stream(self._caller_stream) + with torch.cuda.stream(self._engine_stream): + self.context.execute_async_v3(self._engine_stream.cuda_stream) + self._caller_stream.wait_stream(self._engine_stream) + + outputs = [] + assert self.output_allocator is not None + for idx, output_name in enumerate(self.out_binding_names): + shape = self.output_allocator.shapes.get(output_name, None) + dtype_ = self.output_dtypes[idx] + buffer_tensor = self.output_allocator.buffers.get(output_name) + assert buffer_tensor is not None + output = buffer_tensor.clone().detach() + prod = int(torch.prod(torch.tensor(shape))) + output = output.reshape(-1).view(dtype_)[:prod].reshape(shape) + outputs.append(output) + + if len(outputs) == 1: + return outputs[0] + return tuple(outputs) + + def execute( + self, inputs: Sequence[torch.Tensor] + ) -> torch.Tensor | Tuple[torch.Tensor, ...]: + contiguous_inputs = [tensor.contiguous() for tensor in inputs] + + if self._has_nccl_ops and self._nccl_comm is None: + nccl_type = ( + "native TRT collectives" + if ENABLED_FEATURES.native_trt_collectives + else ( + "TRT-LLM NCCL plugins" + if ENABLED_FEATURES.trtllm_for_nccl + else "unknown backend" + ) + ) + logger.info( + f"Setting up NCCL for distributed execution using {nccl_type} " + f"(rank={dist.get_rank()}, world_size={dist.get_world_size()})" + ) + self.setup_nccl_comm() + logger.info(f"NCCL setup complete, comm={self._nccl_comm}") + + if torch_tensorrt.runtime._multi_device_safe_mode._PY_RT_MULTI_DEVICE_SAFE_MODE: + curr_device_id = torch.cuda.current_device() + curr_device_properties = torch.cuda.get_device_properties(curr_device_id) + if _is_switch_required( + curr_device_id, + self.target_device_id, + curr_device_properties, + self.target_device_properties, + ): + device_id, _ = _select_rt_device( + curr_device_id, + self.target_device_id, + self.target_device_properties, + ) + device = torch.device(device_id) + torch.cuda.set_device(device_id) + contiguous_inputs = [tensor.to(device) for tensor in contiguous_inputs] + logger.warning(f"Moved all input Tensors to cuda:{device_id}") + + if self.requires_output_allocator or self.use_output_allocator_outputs: + logger.debug("Using the dynamic allocator runtime mode.") + return self._execute_output_allocator(contiguous_inputs) + + logger.debug( + f"Using the standard execution runtime mode with cudagraphs={torch_tensorrt.runtime.get_cudagraphs_mode()}." 
+        )
+        return self._execute_standard(contiguous_inputs)
+
+
+register_opaque_type(TRTEngine, typ="reference")
+
+
+@torch.library.custom_op(  # type: ignore[misc]
+    "tensorrt::execute_engine_python", mutates_args=()
+)
+def execute_engine_python(
+    input_tensors: List[torch.Tensor], engine: TRTEngine
+) -> List[torch.Tensor]:
+    outputs = engine.execute(input_tensors)
+    return [outputs] if isinstance(outputs, torch.Tensor) else list(outputs)
+
+
+@execute_engine_python.register_fake  # type: ignore[misc]
+def execute_engine_python_fake(
+    input_tensors: List[torch.Tensor], engine: TRTEngine
+) -> List[torch.Tensor]:
+    """Abstract/fake kernel for ``tensorrt::execute_engine_python``.
+
+    Called by FakeTensor propagation and ``torch.export`` to infer output
+    shapes and dtypes without executing the real TRT engine. Output shapes
+    are obtained by asking the engine's execution context to propagate the
+    concrete input shapes symbolically; dtypes come from the engine's
+    pre-parsed output dtype list.
+    """
+    input_shapes = [list(t.shape) for t in input_tensors]
+    try:
+        output_shapes = engine.infer_outputs(input_shapes)
+    except Exception:
+        # Fall back to the statically-stored shapes when shape inference is
+        # unavailable (e.g. engine context not yet initialized in meta mode).
+        output_shapes = [list(s) for s in engine.output_shapes]
+
+    return [
+        torch.empty(
+            shape, dtype=engine.output_dtypes[i], device=input_tensors[0].device
+        )
+        for i, shape in enumerate(output_shapes)
+    ]
diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
index 833fdee639..0386c97ea3 100644
--- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
@@ -4,81 +4,53 @@
 import copy
 import logging
 import pickle
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple

 import torch
 from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import Platform
-from torch_tensorrt._features import (
-    ENABLED_FEATURES,
-    for_all_methods,
-    needs_torch_tensorrt_runtime,
-)
+from torch_tensorrt._features import ENABLED_FEATURES
 from torch_tensorrt.dynamo._settings import CompilationSettings
+from torch_tensorrt.dynamo.runtime._serialized_engine_layout import (
+    ABI_TARGET_IDX,
+    ABI_VERSION,
+    DEVICE_IDX,
+    ENGINE_IDX,
+    HW_COMPATIBLE_IDX,
+    INPUT_BINDING_NAMES_IDX,
+    NAME_IDX,
+    OUTPUT_BINDING_NAMES_IDX,
+    REQUIRES_NATIVE_MULTIDEVICE_IDX,
+    REQUIRES_OUTPUT_ALLOCATOR_IDX,
+    RESOURCE_ALLOCATION_STRATEGY_IDX,
+    SERIALIZATION_LEN,
+    SERIALIZED_METADATA_IDX,
+    TARGET_PLATFORM_IDX,
+    SerializedTensorRTEngineFmt,
+    serialize_binding_names,
+    serialize_device_info,
+)

 logger = logging.getLogger(__name__)

-SerializedTensorRTEngineFmt = List[
-    Union[str, bytes]
-]  # Aligned with //core/runtime/register_jit_hooks.cpp
 SerializedTorchTensorRTModuleFmt = Tuple[
-    str, Optional[SerializedTensorRTEngineFmt], List[str], List[str]
+    str,
+    Optional[SerializedTensorRTEngineFmt],
+    List[str],
+    List[str],
+]

-ABI_TARGET_IDX = -1  # Not implemented
-NAME_IDX = -1  # Not implemented
-DEVICE_IDX = -1  # Not implemented
-ENGINE_IDX = -1  # Not implemented
-INPUT_BINDING_NAMES_IDX = -1  # Not implemented
-OUTPUT_BINDING_NAMES_IDX = -1  # Not implemented
-HW_COMPATIBLE_IDX = -1  # Not implemented
-SERIALIZED_METADATA_IDX = -1  # Not implemented
-TARGET_PLATFORM_IDX = -1  # Not implemented
-REQUIRES_OUTPUT_ALLOCATOR_IDX = -1  # Not implemented
-SERIALIZATION_LEN = -1  # Not implemented
-REQUIRES_NATIVE_MULTIDEVICE_IDX = -1  # Not implemented
-
-if ENABLED_FEATURES.torch_tensorrt_runtime:
-    ABI_TARGET_IDX = torch.ops.tensorrt.ABI_TARGET_IDX()  # 0
-    NAME_IDX = torch.ops.tensorrt.NAME_IDX()  # 1
-    DEVICE_IDX = torch.ops.tensorrt.DEVICE_IDX()  # 2
-    ENGINE_IDX = torch.ops.tensorrt.ENGINE_IDX()  # 3
-    INPUT_BINDING_NAMES_IDX = torch.ops.tensorrt.INPUT_BINDING_NAMES_IDX()  # 4
-    OUTPUT_BINDING_NAMES_IDX = torch.ops.tensorrt.OUTPUT_BINDING_NAMES_IDX()  # 5
-    HW_COMPATIBLE_IDX = torch.ops.tensorrt.HW_COMPATIBLE_IDX()  # 6
-    SERIALIZED_METADATA_IDX = torch.ops.tensorrt.SERIALIZED_METADATA_IDX()  # 7
-    TARGET_PLATFORM_IDX = torch.ops.tensorrt.TARGET_PLATFORM_IDX()  # 8
-    REQUIRES_OUTPUT_ALLOCATOR_IDX = (
-        torch.ops.tensorrt.REQUIRES_OUTPUT_ALLOCATOR_IDX()
-    )  # 9
-    RESOURCE_ALLOCATION_STRATEGY_IDX = (
-        torch.ops.tensorrt.RESOURCE_ALLOCATION_STRATEGY_IDX()
-    )  # 10
-    REQUIRES_NATIVE_MULTIDEVICE_IDX = (
-        torch.ops.tensorrt.REQUIRES_NATIVE_MULTIDEVICE_IDX()
-    )  # 11
-    SERIALIZATION_LEN = torch.ops.tensorrt.SERIALIZATION_LEN()  # 12
-
-
-@for_all_methods(needs_torch_tensorrt_runtime)
-class TorchTensorRTModule(torch.nn.Module):  # type: ignore[misc]
-    """TorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine.
-
-    This module is backed by the Torch-TensorRT runtime and is fully compatible with both
-    FX / Python deployments (just ``import torch_tensorrt`` as part of the application) as
-    well as TorchScript / C++ deployments since TorchTensorRTModule can be passed to ``torch.jit.trace``
-    and then saved.
-    The forward function is simpily forward(*args: torch.Tensor) -> Tuple[torch.Tensor] where
-    the internal implementation is ``return Tuple(torch.ops.tensorrt.execute_engine(list(inputs), self.engine))``

+class TorchTensorRTModule(torch.nn.Module):  # type: ignore[misc]
+    """``nn.Module`` that runs a TensorRT engine inside PyTorch.

-    > Note: TorchTensorRTModule only supports engines built with explicit batch
+    When the C++ Torch-TensorRT runtime is available, execution uses
+    ``torch.classes.tensorrt.Engine`` and ``torch.ops.tensorrt.execute_engine``.
+    When only the Python runtime is available, a Python :class:`TRTEngine` is
+    executed through the dedicated ``tensorrt::execute_engine_python`` op instead,
+    so the same module runs with either runtime transparently.

-    Attributes:
-        name (str): Name of module (for easier debugging)
-        engine (torch.classes.tensorrt.Engine): Torch-TensorRT TensorRT Engine instance, manages [de]serialization, device configuration, profiling
-        input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules
-        output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned
+    Supports ``torch.save`` / ``torch.load`` via ``get_extra_state`` / ``set_extra_state``.
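+
+    A minimal construction sketch (binding names and shapes here are
+    illustrative; the real ones come from the compiled engine)::
+
+        trt_mod = TorchTensorRTModule(
+            serialized_engine=engine_bytes,
+            input_binding_names=["x"],
+            output_binding_names=["y"],
+        )
+        out = trt_mod(torch.randn(1, 3, 32, 32).cuda())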
""" def __init__( @@ -88,7 +60,7 @@ def __init__( output_binding_names: Optional[List[str]] = None, *, name: str = "", - settings: CompilationSettings = CompilationSettings(), # Assumes engine was built with default compilation settings if object not passed + settings: CompilationSettings = CompilationSettings(), weight_name_map: Optional[dict[Any, Any]] = None, requires_output_allocator: bool = False, requires_native_multidevice: bool = False, @@ -131,11 +103,17 @@ def __init__( settings=CompilationSettings(device=torch.cuda.current_device) ) + Args: + serialized_engine: Raw TRT engine bytes (``None`` if restoring state only). + input_binding_names: Input tensor names in ``forward`` order. + output_binding_names: Output tensor names in return order. + name: Logical name for logging and serialization. + settings: Compilation/runtime settings (device, lazy init, cross-compile, etc.). + weight_name_map: Engine weight name to ``state_dict`` key mapping (refit). + requires_output_allocator: Engine needs TRT dynamic output allocation. + symbolic_shape_expressions: Optional symbolic shape metadata from compile. """ - super(TorchTensorRTModule, self).__init__() - - if not isinstance(serialized_engine, bytearray): - ValueError("Expected serialized engine as bytearray") + super().__init__() self.input_binding_names = ( input_binding_names if input_binding_names is not None else [] @@ -148,11 +126,21 @@ def __init__( self.settings = copy.deepcopy(settings) self.weight_name_map = weight_name_map self.serialized_engine = serialized_engine - self.engine = None + self.engine: Any = None + self._use_python_runtime = settings.use_python_runtime + + self.execute_engine_op: Any = None self.requires_output_allocator = requires_output_allocator self.dynamically_allocate_resources = settings.dynamically_allocate_resources self.symbolic_shape_expressions = symbolic_shape_expressions self.requires_native_multidevice = requires_native_multidevice + self.target_platform = ( + Platform.current_platform() + if not self.settings.enable_cross_compile_for_windows + else Platform.WIN_X86_64 + ) + self.profiling_enabled = False + self.target_device = self._resolve_target_device() if ( serialized_engine @@ -176,6 +164,12 @@ def __deepcopy__(self, memo: dict[int, Any]) -> "TorchTensorRTModule": object.__setattr__(result, k, copy.deepcopy(v, memo)) return result + def _resolve_target_device(self) -> torch.device: + """Resolve the engine's target CUDA device from compilation settings.""" + if self.settings.device is not None: + return torch.device(f"cuda:{self.settings.device.gpu_id}") + return torch.device(f"cuda:{torch.cuda.current_device()}") + def _pack_engine_info(self) -> List[str | bytes]: target_device = ( self.settings.device @@ -192,30 +186,36 @@ def _pack_engine_info(self) -> List[str | bytes]: else self.engine.are_output_tensors_unowned() ), } - target_platform = ( - Platform.current_platform() - if not self.settings.enable_cross_compile_for_windows - else Platform.WIN_X86_64 - ) # Change to match target for engine engine_info: List[str | bytes] = [""] * SERIALIZATION_LEN - engine_info[ABI_TARGET_IDX] = torch.ops.tensorrt.ABI_VERSION() + engine_info[ABI_TARGET_IDX] = ( + torch.ops.tensorrt.ABI_VERSION() + if ENABLED_FEATURES.torch_tensorrt_runtime + else ABI_VERSION + ) engine_info[NAME_IDX] = ( self.name + "_engine" if self.name != "" else "tensorrt_engine" ) - engine_info[DEVICE_IDX] = target_device._to_serialized_rt_device() - assert self.serialized_engine + engine_info[DEVICE_IDX] = ( + 
target_device._to_serialized_rt_device() + if ENABLED_FEATURES.torch_tensorrt_runtime + else serialize_device_info(target_device) + ) + assert self.serialized_engine is not None engine_info[ENGINE_IDX] = self.serialized_engine - - engine_info[INPUT_BINDING_NAMES_IDX] = TorchTensorRTModule._pack_binding_names( + engine_info[INPUT_BINDING_NAMES_IDX] = serialize_binding_names( self.input_binding_names ) - engine_info[OUTPUT_BINDING_NAMES_IDX] = TorchTensorRTModule._pack_binding_names( + engine_info[OUTPUT_BINDING_NAMES_IDX] = serialize_binding_names( self.output_binding_names ) engine_info[HW_COMPATIBLE_IDX] = str(int(self.hardware_compatible)) engine_info[SERIALIZED_METADATA_IDX] = self.encode_metadata(metadata) - engine_info[TARGET_PLATFORM_IDX] = target_platform._to_serialized_rt_platform() + engine_info[TARGET_PLATFORM_IDX] = ( + self.target_platform._to_serialized_rt_platform() + if ENABLED_FEATURES.torch_tensorrt_runtime + else str(self.target_platform) + ) engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX] = str( int(self.requires_output_allocator) ) @@ -242,7 +242,6 @@ def get_device_memory_budget(self) -> Any: return self.engine.device_memory_budget def set_device_memory_budget(self, budget_bytes: int) -> int: - # Disable weight streaming for invalid budget size if budget_bytes < 0: budget_bytes = self.get_streamable_device_memory_budget() self.engine.device_memory_budget = budget_bytes @@ -251,7 +250,6 @@ def set_device_memory_budget(self, budget_bytes: int) -> int: budget_bytes = self.engine.device_memory_budget if self.get_streamable_device_memory_budget() == budget_bytes: logger.warning("Weight streaming is disabled") - return budget_bytes def _reset_captured_graph(self) -> None: @@ -276,7 +274,18 @@ def setup_engine(self) -> None: """ if self.engine is not None: return - self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) + + if self._use_python_runtime: + from torch_tensorrt.dynamo.runtime._TRTEngine import TRTEngine + + self.engine = TRTEngine( + self._pack_engine_info(), + profile_execution=self.profiling_enabled, + ) + self.execute_engine_op = torch.ops.tensorrt.execute_engine_python + else: + self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) + self.execute_engine_op = torch.ops.tensorrt.execute_engine # requires_native_multidevice is set by the C++ constructor from the serialized REQUIRES_NATIVE_MULTIDEVICE_IDX field. 
if self.engine.requires_native_multidevice: @@ -322,16 +331,19 @@ def decode_metadata(encoded_metadata: bytes) -> Any: def get_extra_state(self) -> SerializedTorchTensorRTModuleFmt: if self.engine: + engine_info = self._pack_engine_info() + assert isinstance(engine_info[ENGINE_IDX], (bytes, bytearray)) + engine_info[ENGINE_IDX] = base64.b64encode(engine_info[ENGINE_IDX]) return ( self.name, - self.engine.__getstate__(), + engine_info, self.input_binding_names, self.output_binding_names, ) elif self.serialized_engine: engine_info = self._pack_engine_info() - assert isinstance(engine_info[3], bytes) - engine_info[ENGINE_IDX] = base64.b64encode(engine_info[3]) + assert isinstance(engine_info[ENGINE_IDX], bytes) + engine_info[ENGINE_IDX] = base64.b64encode(engine_info[ENGINE_IDX]) return ( self.name, engine_info, @@ -350,11 +362,10 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: self.name = state[0] if state[1] is not None: - serialized_engine_info: SerializedTensorRTEngineFmt = state[1] + serialized_engine_info: SerializedTensorRTEngineFmt = list(state[1]) serialized_engine_info[ENGINE_IDX] = base64.b64decode( serialized_engine_info[ENGINE_IDX] ) - self.engine = torch.classes.tensorrt.Engine(serialized_engine_info) self.hardware_compatible = bool( int(serialized_engine_info[HW_COMPATIBLE_IDX]) ) @@ -367,39 +378,62 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: metadata = TorchTensorRTModule.decode_metadata(serialized_metadata) self.settings = metadata["settings"] self.weight_name_map = metadata["weight_name_map"] - self.output_tensors_are_unowned = metadata["output_tensors_are_unowned"] self.symbolic_shape_expressions = metadata["inout_symexprs"] - self.engine.set_output_tensors_as_unowned(self.output_tensors_are_unowned) + # Re-resolve the runtime now that we have the loaded settings: the + # original __init__ kwarg may have been False, but a saved engine + # can still pin use_python_runtime=True via the settings blob. 
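+            # When the C++ extension is missing altogether, the fallback below
+            # also selects the Python TRTEngine regardless of the stored flag.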
+            self._use_python_runtime = (
+                getattr(self.settings, "use_python_runtime", False)
+                or not ENABLED_FEATURES.torch_tensorrt_runtime
+            )
+            if self._use_python_runtime:
+                from torch_tensorrt.dynamo.runtime._TRTEngine import TRTEngine
+
+                self.engine = TRTEngine(serialized_engine_info)
+                self.execute_engine_op = torch.ops.tensorrt.execute_engine_python
+            else:
+                self.engine = torch.classes.tensorrt.Engine(serialized_engine_info)
+                self.execute_engine_op = torch.ops.tensorrt.execute_engine
+
+            self.engine.set_output_tensors_as_unowned(
+                metadata["output_tensors_are_unowned"]
+            )
         else:
             self.engine = None
+            self.execute_engine_op = None
             self.settings = CompilationSettings()
             self.hardware_compatible = False

         self.input_binding_names = state[2]
         self.output_binding_names = state[3]
+        self.target_device = self._resolve_target_device()

     def set_pre_allocated_outputs(self, enable: bool) -> None:
         self.engine.use_pre_allocated_outputs = enable

+    @property
+    def pre_allocated_outputs(self) -> Any:
+        """Pre-allocated output tensors currently held by the underlying engine."""
+        if self.engine is None:
+            return []
+        return getattr(self.engine, "pre_allocated_outputs", [])
+
     def set_use_output_allocator(self, enable: bool) -> None:
         self.engine.use_output_allocator_outputs = enable

     def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]:
-        """Implementation of the forward pass for a TensorRT engine
-
-        Args:
-            *inputs (Union[torch.Tensor, int]): Inputs to the forward function
-
-        Returns:
-            torch.Tensor or Tuple(torch.Tensor): Result of the engine computation
-        """
+        """Run the TensorRT engine on GPU tensors (non-tensor args are cast to CUDA tensors)."""
         if self.engine is None:
             raise RuntimeError("Engine has not been setup yet.")

-        assert len(inputs) == len(
-            self.input_binding_names
-        ), f"Wrong number of inputs, expected {len(self.input_binding_names)} got {len(inputs)}."
+        target = self.target_device
+        binding_names = self.input_binding_names
+        # Explicit raise instead of assert so the check is not stripped under python -O.
+        if len(inputs) != len(binding_names):
+            raise AssertionError(
+                f"Wrong number of inputs, expected {len(binding_names)} got {len(inputs)}."
+            )

         # If the inputs are not Torch Tensors, which can occur in scenarios such as shape tensors
         # which are outputs of a preceding Torch subgraph (where the Dynamic input may be an integer)
@@ -420,10 +454,12 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]:
         else:
             input_tensors.append(torch.tensor(i).cuda())

-        outputs: List[torch.Tensor] = torch.ops.tensorrt.execute_engine(
-            list(input_tensors), self.engine
-        )
+        if self.execute_engine_op is None:
+            raise RuntimeError(
+                "execute_engine op has not been bound. Call setup_engine() first."
+            )
+        outputs = self.execute_engine_op(input_tensors, self.engine)

         if len(outputs) == 1:
             return outputs[0]
@@ -434,34 +470,30 @@ def enable_profiling(
         profiling_results_dir: Optional[str] = None,
         profile_format: str = "perfetto",
     ) -> None:
-        """Enable the profiler to collect latency information about the execution of the engine
-
-        Traces can be visualized using https://ui.perfetto.dev/ or compatible alternatives
-
-        Keyword Arguments:
-            profiling_results_dir (str): Absolute path to the directory to sort results of profiling.
- """ + """Enable engine profiling (optional path prefix and format for tracing output).""" if self.engine is None: raise RuntimeError("Engine has not been initialized yet.") if profiling_results_dir is not None: self.engine.profile_path_prefix = profiling_results_dir - assert profile_format in ["trex", "perfetto"] + self.engine.enable_profiling() - self.engine.set_profile_format(profile_format) + if hasattr(self.engine, "set_profile_format"): + self.engine.set_profile_format(profile_format) + self.profiling_enabled = True def set_output_tensors_as_unowned(self, enabled: bool) -> None: self.engine.set_output_tensors_as_unowned(enabled) def are_output_tensors_unowned(self) -> bool: - return self.engine.are_output_tensors_unowned() # type: ignore[no-any-return] + return bool(self.engine.are_output_tensors_unowned()) def disable_profiling(self) -> None: - """Disable the profiler""" + """Disable engine profiling and clear the profiling flag on this module.""" if self.engine is None: raise RuntimeError("Engine has not been initialized yet.") - self.engine.disable_profiling() + self.profiling_enabled = False def get_layer_info(self) -> str: """Get a JSON string containing the layer information encoded by the TensorRT engine in this module @@ -482,9 +514,3 @@ def dump_layer_info(self) -> None: raise RuntimeError("Engine has not been initialized yet.") self.engine.dump_engine_layer_info() - - @staticmethod - def _pack_binding_names(binding_names: List[str]) -> str: - delim = torch.ops.tensorrt.SERIALIZED_ENGINE_BINDING_DELIM()[0] - packed_bindings: str = delim.join(binding_names) - return packed_bindings diff --git a/py/torch_tensorrt/dynamo/runtime/__init__.py b/py/torch_tensorrt/dynamo/runtime/__init__.py index 0eb66b24b0..93576ec0dd 100644 --- a/py/torch_tensorrt/dynamo/runtime/__init__.py +++ b/py/torch_tensorrt/dynamo/runtime/__init__.py @@ -1,7 +1,4 @@ import torch_tensorrt -from torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule import ( # noqa: F401 - PythonTorchTensorRTModule, -) from torch_tensorrt.dynamo.runtime._ResourceAllocator import ( # noqa: F401 ResourceAllocationStrategy, ) diff --git a/py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py b/py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py new file mode 100644 index 0000000000..d4f31ba8a8 --- /dev/null +++ b/py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py @@ -0,0 +1,144 @@ +"""Serialized TensorRT engine blob layout shared by C++ and Python runtimes. + +Field order and indices must stay aligned with ``core/runtime/runtime.h`` and +``register_jit_hooks.cpp`` (``torch.ops.tensorrt.*``). When the C++ runtime is +loaded, :func:`_assert_serialized_layout_matches_cpp` checks that these literals +match the library; fix either side if the assertion fails. +""" + +from __future__ import annotations + +from enum import IntEnum +from typing import Any, Callable, Dict, List, Tuple, Union + +import tensorrt as trt +import torch +import torch_tensorrt +from torch_tensorrt._features import ENABLED_FEATURES + +ABI_VERSION = "9" + + +class SerializedInfoIndex(IntEnum): + """Indices into the serialized TensorRT engine tuple. + + Must stay aligned with ``SerializedInfoIndex`` in ``core/runtime/runtime.h``. 
+ """ + + ABI_TARGET_IDX = 0 + NAME_IDX = 1 + DEVICE_IDX = 2 + ENGINE_IDX = 3 + INPUT_BINDING_NAMES_IDX = 4 + OUTPUT_BINDING_NAMES_IDX = 5 + HW_COMPATIBLE_IDX = 6 + SERIALIZED_METADATA_IDX = 7 + TARGET_PLATFORM_IDX = 8 + REQUIRES_OUTPUT_ALLOCATOR_IDX = 9 + RESOURCE_ALLOCATION_STRATEGY_IDX = 10 + REQUIRES_NATIVE_MULTIDEVICE_IDX = 11 + + +# Module-level aliases for backward compatibility and concise access +ABI_TARGET_IDX = SerializedInfoIndex.ABI_TARGET_IDX +NAME_IDX = SerializedInfoIndex.NAME_IDX +DEVICE_IDX = SerializedInfoIndex.DEVICE_IDX +ENGINE_IDX = SerializedInfoIndex.ENGINE_IDX +INPUT_BINDING_NAMES_IDX = SerializedInfoIndex.INPUT_BINDING_NAMES_IDX +OUTPUT_BINDING_NAMES_IDX = SerializedInfoIndex.OUTPUT_BINDING_NAMES_IDX +HW_COMPATIBLE_IDX = SerializedInfoIndex.HW_COMPATIBLE_IDX +SERIALIZED_METADATA_IDX = SerializedInfoIndex.SERIALIZED_METADATA_IDX +TARGET_PLATFORM_IDX = SerializedInfoIndex.TARGET_PLATFORM_IDX +REQUIRES_OUTPUT_ALLOCATOR_IDX = SerializedInfoIndex.REQUIRES_OUTPUT_ALLOCATOR_IDX +RESOURCE_ALLOCATION_STRATEGY_IDX = SerializedInfoIndex.RESOURCE_ALLOCATION_STRATEGY_IDX +REQUIRES_NATIVE_MULTIDEVICE_IDX = SerializedInfoIndex.REQUIRES_NATIVE_MULTIDEVICE_IDX +SERIALIZATION_LEN = len(SerializedInfoIndex) + +SERIALIZED_ENGINE_BINDING_DELIM = "%" +SERIALIZED_RT_DEVICE_DELIM = "%" + +# (torch.ops.tensorrt name, module global holding the expected value, normalizer) +_LayoutCheck = Tuple[str, str, Callable[[Any], Any]] +_LAYOUT_CPP_CHECKS: tuple[_LayoutCheck, ...] = ( + ("ABI_VERSION", "ABI_VERSION", str), + ("ABI_TARGET_IDX", "ABI_TARGET_IDX", int), + ("NAME_IDX", "NAME_IDX", int), + ("DEVICE_IDX", "DEVICE_IDX", int), + ("ENGINE_IDX", "ENGINE_IDX", int), + ("INPUT_BINDING_NAMES_IDX", "INPUT_BINDING_NAMES_IDX", int), + ("OUTPUT_BINDING_NAMES_IDX", "OUTPUT_BINDING_NAMES_IDX", int), + ("HW_COMPATIBLE_IDX", "HW_COMPATIBLE_IDX", int), + ("SERIALIZED_METADATA_IDX", "SERIALIZED_METADATA_IDX", int), + ("TARGET_PLATFORM_IDX", "TARGET_PLATFORM_IDX", int), + ("REQUIRES_OUTPUT_ALLOCATOR_IDX", "REQUIRES_OUTPUT_ALLOCATOR_IDX", int), + ("RESOURCE_ALLOCATION_STRATEGY_IDX", "RESOURCE_ALLOCATION_STRATEGY_IDX", int), + ("REQUIRES_NATIVE_MULTIDEVICE_IDX", "REQUIRES_NATIVE_MULTIDEVICE_IDX", int), + ("SERIALIZATION_LEN", "SERIALIZATION_LEN", int), + ("SERIALIZED_ENGINE_BINDING_DELIM", "SERIALIZED_ENGINE_BINDING_DELIM", str), + ("SERIALIZED_RT_DEVICE_DELIM", "SERIALIZED_RT_DEVICE_DELIM", str), +) + + +def _assert_serialized_layout_matches_cpp() -> None: + """Fail fast if Python layout literals diverge from ``register_jit_hooks.cpp``.""" + if not ENABLED_FEATURES.torch_tensorrt_runtime: + return + for op_name, global_name, normalizer in _LAYOUT_CPP_CHECKS: + expected = globals()[global_name] + try: + op = getattr(torch.ops.tensorrt, op_name) + raw = op() + except (AttributeError, RuntimeError, TypeError) as e: + raise RuntimeError( + f"Could not call torch.ops.tensorrt.{op_name}() to verify serialized layout: {e}" + ) from e + got = normalizer(raw) + if got != expected: + raise RuntimeError( + f"Serialized engine layout mismatch: torch.ops.tensorrt.{op_name}() " + f"returned {got!r} but Python _serialized_engine_layout.{global_name} " + f"is {expected!r}. Align ``runtime.h`` / ``register_jit_hooks.cpp`` with " + f"``_serialized_engine_layout.py``." 
+ ) + + +_assert_serialized_layout_matches_cpp() + +SerializedTensorRTEngineFmt = List[Union[str, bytes]] + + +def serialize_binding_names(binding_names: List[str]) -> str: + return SERIALIZED_ENGINE_BINDING_DELIM.join(binding_names) + + +def deserialize_binding_names(binding_names: str) -> List[str]: + return binding_names.split(SERIALIZED_ENGINE_BINDING_DELIM) if binding_names else [] + + +def serialize_device_info(device: torch_tensorrt.Device) -> str: + dev_info = torch.cuda.get_device_properties(device.gpu_id) + rt_info = [ + device.gpu_id, + dev_info.major, + dev_info.minor, + int(device.device_type.to(trt.DeviceType)), + dev_info.name, + ] + return SERIALIZED_RT_DEVICE_DELIM.join(str(value) for value in rt_info) + + +def parse_device_info(serialized_device_info: str) -> Dict[str, Any]: + tokens = serialized_device_info.split(SERIALIZED_RT_DEVICE_DELIM) + if len(tokens) != 5: + raise RuntimeError( + f"Unable to deserialize program target device information: {serialized_device_info}" + ) + + target_device_id = int(tokens[0]) + return { + "id": target_device_id, + "major": int(tokens[1]), + "minor": int(tokens[2]), + "device_type": int(tokens[3]), + "name": tokens[4], + } diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index e03c88153c..77c6c35f77 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -305,7 +305,7 @@ def __getstate__(self) -> Any: pass -@torch.library.custom_op( # type: ignore +@torch.library.custom_op( # type: ignore[misc] "tensorrt::no_op_placeholder_for_execute_engine", mutates_args=() ) def no_op_placeholder_for_execute_engine( @@ -320,6 +320,7 @@ def no_op_placeholder_for_execute_engine( serialized_metadata: str, serialized_target_platform: str, serialized_require_output_allocator: str, + serialized_resource_allocation_strategy: str, ) -> List[torch.Tensor]: raise RuntimeError( "The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api." 
diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py
index 0c7a5565df..db3e1cea45 100644
--- a/py/torch_tensorrt/dynamo/utils.py
+++ b/py/torch_tensorrt/dynamo/utils.py
@@ -24,12 +24,9 @@
 import sympy
 import tensorrt as trt
 import torch
-from torch._subclasses.fake_tensor import FakeTensor
-from torch._subclasses.fake_tensor import FakeScriptObject
+from torch._subclasses.fake_tensor import FakeScriptObject, FakeTensor
 from torch.fx.experimental.proxy_tensor import unset_fake_temporarily
 from torch.utils._sympy.numbers import int_oo
-
-from packaging import version
 from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import dtype
 from torch_tensorrt._features import ENABLED_FEATURES
@@ -40,6 +37,8 @@
 from torch_tensorrt.dynamo._engine_cache import BaseEngineCache
 from torch_tensorrt.dynamo._settings import CompilationSettings

+from packaging import version
+
 from .types import TRTDataType

 logger = logging.getLogger(__name__)
@@ -105,7 +104,7 @@ class Frameworks(Enum):
     torch.complex128: torch.float64,
 }

-COMPLEX_DTYPES: frozenset = frozenset(COMPLEX_TO_REAL_DTYPE)
+COMPLEX_DTYPES: frozenset = frozenset(COMPLEX_TO_REAL_DTYPE)  # type: ignore[type-arg]


 def unified_dtype_converter(
@@ -147,37 +146,6 @@ def deallocate_module(module: torch.fx.GraphModule) -> None:
     gc.collect()


-def use_python_runtime_parser(use_python_runtime: Optional[bool] = None) -> bool:
-    """Parses a user-provided input argument regarding Python runtime
-
-    Automatically handles cases where the user has not specified a runtime (None)
-
-    Returns True if the Python runtime should be used, False if the C++ runtime should be used
-    """
-    using_python_runtime = use_python_runtime
-    reason = ""
-
-    # Runtime was manually specified by the user
-    if using_python_runtime is not None:
-        reason = "as requested by user"
-    # Runtime was not manually specified by the user, automatically detect runtime
-    else:
-        try:
-            from torch_tensorrt.dynamo.runtime import TorchTensorRTModule  # noqa: F401
-
-            using_python_runtime = False
-            reason = "since C++ dependency was detected as present"
-        except ImportError:
-            using_python_runtime = True
-            reason = "since import failed, C++ dependency not installed"
-
-    logger.info(
-        f"Using {'Python-only' if using_python_runtime else 'Default'} Torch-TRT Runtime ({reason})"
-    )
-
-    return using_python_runtime
-
-
 def cosine_similarity(gt_tensor: torch.Tensor, pred_tensor: torch.Tensor) -> float:
     gt_tensor = gt_tensor.flatten().to(torch.float32)
     pred_tensor = pred_tensor.flatten().to(torch.float32)
@@ -624,6 +592,16 @@ def parse_dynamo_kwargs(
     if "options" in kwargs and len(kwargs) == 1:
         kwargs = kwargs["options"]

+    # TODO: Uncomment this when cross serialization is enabled
+    # if "use_python_runtime" in kwargs:
+    #     warnings.warn(
+    #         'torch.compile option "use_python_runtime" was removed; '
+    #         "the Python runtime is now selected automatically when the C++ extension is unavailable.",
+    #         DeprecationWarning,
+    #         stacklevel=2,
+    #     )
+    #     kwargs = {k: v for k, v in kwargs.items() if k != "use_python_runtime"}
+
     if "truncate_long_and_double" in kwargs:
         if (
             "truncate_double" in kwargs
@@ -649,9 +627,6 @@ def parse_dynamo_kwargs(
     valid_kwargs = {k: v for k, v in kwargs.items() if k in valid_attrs}
     settings = replace(settings, **valid_kwargs)

-    # Parse input runtime specification
-    settings.use_python_runtime = use_python_runtime_parser(settings.use_python_runtime)
-
     # Ensure device is a torch_tensorrt Device
     settings.device =
to_torch_tensorrt_device(settings.device) diff --git a/py/torch_tensorrt/runtime/__init__.py b/py/torch_tensorrt/runtime/__init__.py index cfc9b322b5..7283ca0f33 100644 --- a/py/torch_tensorrt/runtime/__init__.py +++ b/py/torch_tensorrt/runtime/__init__.py @@ -1,5 +1,4 @@ from torch_tensorrt.dynamo.runtime import ( # noqa: F401 - PythonTorchTensorRTModule, TorchTensorRTModule, ) from torch_tensorrt.runtime._cudagraphs import ( diff --git a/py/torch_tensorrt/runtime/_output_allocator.py b/py/torch_tensorrt/runtime/_output_allocator.py index 163fc26306..6eb67b7218 100644 --- a/py/torch_tensorrt/runtime/_output_allocator.py +++ b/py/torch_tensorrt/runtime/_output_allocator.py @@ -2,7 +2,7 @@ from typing import Any, Union import torch -from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule +from torch_tensorrt.dynamo.runtime import TorchTensorRTModule from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import ( CudaGraphsTorchTensorRTModule, ) @@ -24,9 +24,7 @@ def __init__( rt_mods = [] for name, rt_mod in module.named_children(): - if "_run_on_acc" in name and isinstance( - rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule) - ): + if "_run_on_acc" in name and isinstance(rt_mod, TorchTensorRTModule): rt_mods.append(rt_mod) self.rt_mods = rt_mods diff --git a/py/torch_tensorrt/runtime/_pre_allocated_outputs.py b/py/torch_tensorrt/runtime/_pre_allocated_outputs.py index c392c38838..7c3629b28a 100644 --- a/py/torch_tensorrt/runtime/_pre_allocated_outputs.py +++ b/py/torch_tensorrt/runtime/_pre_allocated_outputs.py @@ -2,7 +2,7 @@ from typing import Any import torch -from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule +from torch_tensorrt.dynamo.runtime import TorchTensorRTModule logger = logging.getLogger(__name__) @@ -15,9 +15,7 @@ class _PreAllocatedOutputContextManager(object): def __init__(self, module: torch.fx.GraphModule) -> None: rt_mods = [] for name, rt_mod in module.named_children(): - if "_run_on_acc" in name and isinstance( - rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule) - ): + if "_run_on_acc" in name and isinstance(rt_mod, TorchTensorRTModule): rt_mods.append(rt_mod) self.rt_mods = rt_mods diff --git a/py/torch_tensorrt/runtime/_utils.py b/py/torch_tensorrt/runtime/_utils.py index bc2e5a6a70..929d88f8af 100644 --- a/py/torch_tensorrt/runtime/_utils.py +++ b/py/torch_tensorrt/runtime/_utils.py @@ -1,9 +1,18 @@ import logging -from typing import List, Optional, Tuple +from typing import Optional, Protocol, Tuple import torch import torch_tensorrt + +class _ComparableDeviceProps(Protocol): + """Enough for multi-device checks; may be ``_CudaDeviceProperties`` or a simple namespace.""" + + major: int + minor: int + name: object + + logger = logging.getLogger(__name__) @@ -27,7 +36,7 @@ def _is_switch_required( curr_device_id: int, engine_device_id: int, curr_device_properties: torch._C._CudaDeviceProperties, - engine_device_properties: torch._C._CudaDeviceProperties, + engine_device_properties: _ComparableDeviceProps, ) -> bool: """Determines whether a device switch is required based on input device parameters""" # Device Capabilities disagree @@ -66,7 +75,7 @@ def _is_switch_required( def _select_rt_device( curr_device_id: int, engine_device_id: int, - engine_device_properties: torch._C._CudaDeviceProperties, + engine_device_properties: _ComparableDeviceProps, ) -> Tuple[int, torch._C._CudaDeviceProperties]: """Wraps compatible device check and raises error if none are found""" 
new_target_device_opt = _get_most_compatible_device( @@ -83,7 +92,7 @@ def _select_rt_device( def _get_most_compatible_device( curr_device_id: int, engine_device_id: int, - engine_device_properties: torch._C._CudaDeviceProperties, + engine_device_properties: _ComparableDeviceProps, ) -> Optional[Tuple[int, torch._C._CudaDeviceProperties]]: """Selects a runtime device based on compatibility checks""" all_devices = [ diff --git a/py/torch_tensorrt/runtime/_weight_streaming.py b/py/torch_tensorrt/runtime/_weight_streaming.py index 0874d31d11..d294da1731 100755 --- a/py/torch_tensorrt/runtime/_weight_streaming.py +++ b/py/torch_tensorrt/runtime/_weight_streaming.py @@ -2,7 +2,7 @@ from typing import Any, Union import torch -from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule +from torch_tensorrt.dynamo.runtime import TorchTensorRTModule from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import ( CudaGraphsTorchTensorRTModule, ) @@ -26,9 +26,7 @@ def __init__( self.cuda_graphs_module = module module = module.compiled_module for name, rt_mod in module.named_children(): - if "_run_on_acc" in name and isinstance( - rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule) - ): + if "_run_on_acc" in name and isinstance(rt_mod, TorchTensorRTModule): rt_mods.append((name, rt_mod)) self.current_device_budget += rt_mod.get_device_memory_budget() self.streamable_budget = [ diff --git a/tests/py/dynamo/backend/test_backend_compiler.py b/tests/py/dynamo/backend/test_backend_compiler.py index 709a7f3383..89f16f1072 100644 --- a/tests/py/dynamo/backend/test_backend_compiler.py +++ b/tests/py/dynamo/backend/test_backend_compiler.py @@ -49,7 +49,6 @@ def forward(self, x, y): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.add.Tensor"}, - use_python_runtime=False, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -129,7 +128,6 @@ def forward(self, x, y): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.add.Tensor"}, - use_python_runtime=False, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = model(*inputs).detach().cpu() @@ -170,7 +168,6 @@ def forward(self, x, y): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=False, optimization_level=4, version_compatible=True, max_aux_streams=5, diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index 8a75f4d6f8..5678c4fd72 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -27,7 +27,7 @@ pre_export_lowering, ) from torch_tensorrt.dynamo.lowering.passes import remove_num_users_is_0_nodes -from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule +from torch_tensorrt.dynamo.runtime import TorchTensorRTModule from torch_tensorrt.dynamo.utils import ATOL, RTOL, get_model_device, get_torch_inputs _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -197,7 +197,7 @@ def run_test( atol=ATOL, check_dtype=True, pyt_inputs=None, - rt_cls=PythonTorchTensorRTModule, + rt_cls=TorchTensorRTModule, ): with torch.no_grad(): cuda_inputs = [] @@ -208,7 +208,7 @@ def run_test( interpreter_result = interpreter.run() sec = time.perf_counter() - start _LOGGER.info(f"Interpreter run time(s): {sec}") - serialized_engine = interpreter_result.engine.serialize() + serialized_engine = 
bytes(interpreter_result.engine.serialize()) trt_mod = rt_cls( serialized_engine=serialized_engine, input_binding_names=list(interpreter_result.input_names), @@ -269,7 +269,7 @@ def run_test_custom_compare_results( interpreter, comparators: List[Tuple[Callable, List]], fp16_mode=False, - rt_cls=PythonTorchTensorRTModule, + rt_cls=TorchTensorRTModule, ): """ Runs the test and compares the result using the provided comparators. @@ -292,7 +292,7 @@ def run_test_custom_compare_results( self.assert_has_op(mod, expected_ops) interpreter_result = interpreter.run() - serialized_engine = interpreter_result.engine.serialize() + serialized_engine = bytes(interpreter_result.engine.serialize()) trt_mod = rt_cls( serialized_engine=serialized_engine, input_binding_names=list(interpreter_result.input_names), diff --git a/tests/py/dynamo/conversion/test_index_put_aten.py b/tests/py/dynamo/conversion/test_index_put_aten.py index bafc581aac..5536c63a3c 100644 --- a/tests/py/dynamo/conversion/test_index_put_aten.py +++ b/tests/py/dynamo/conversion/test_index_put_aten.py @@ -504,7 +504,6 @@ def forward(self, source_tensor, indices_tensor, value_tensor): min_block_size=1, use_fp32_acc=False, disable_tf32=True, - use_python_runtime=True, ) result = trt_engine(source_tensor, indices_tensor, value_tensor) diff --git a/tests/py/dynamo/distributed/test_distributed_simple_example.py b/tests/py/dynamo/distributed/test_distributed_simple_example.py index ac7dc4de09..8009f31605 100644 --- a/tests/py/dynamo/distributed/test_distributed_simple_example.py +++ b/tests/py/dynamo/distributed/test_distributed_simple_example.py @@ -67,7 +67,6 @@ def forward(self, x): backend=backend, options={ "truncate_long_and_double": True, - "use_python_runtime": True, "min_block_size": 1, "use_distributed_mode_trace": True, }, diff --git a/tests/py/dynamo/lowering/test_aten_lowering_passes.py b/tests/py/dynamo/lowering/test_aten_lowering_passes.py index 424cf145fc..b54d05a6cb 100644 --- a/tests/py/dynamo/lowering/test_aten_lowering_passes.py +++ b/tests/py/dynamo/lowering/test_aten_lowering_passes.py @@ -271,7 +271,7 @@ def forward(self, x: torch.Tensor): trt_module = torch.compile( model, backend="tensorrt", - options={"use_python_runtime": False, "min_block_size": 1}, + options={"min_block_size": 1}, ) out = trt_module(inputs) # if the model can be successfully compiled, we regard the test as passed diff --git a/tests/py/dynamo/models/_cross_runtime_load_helper.py b/tests/py/dynamo/models/_cross_runtime_load_helper.py new file mode 100644 index 0000000000..5bfad8b780 --- /dev/null +++ b/tests/py/dynamo/models/_cross_runtime_load_helper.py @@ -0,0 +1,152 @@ +"""Subprocess helper: run TRT operations in a Python-only environment. + +This script temporarily hides the Torch-TensorRT C++ shared libraries so that +``torch_tensorrt`` imports in Python-only mode. + +Modes: + load — Load a .pt2 artifact, run inference, save output. + save — Compile a model and save a .pt2 artifact. 
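+
+Both modes first assert that ``torch_tensorrt`` imported without the C++
+runtime (``ENABLED_FEATURES.torch_tensorrt_runtime`` must be False), so the
+Python ``TRTEngine`` path is the one exercised.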
+
+Usage (called by test_cross_runtime_serde.py, not directly):
+    python _cross_runtime_load_helper.py load \
+        --artifact <path> --input <path> --output <path>
+
+    python _cross_runtime_load_helper.py save \
+        --model-state <path> --input <path> --artifact <path>
+"""
+
+from __future__ import annotations
+
+import argparse
+import glob
+import os
+
+
+def _build_small_conv_model(torch):
+    class SmallConvModel(torch.nn.Module):
+        def __init__(self) -> None:
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True)
+            self.relu = torch.nn.ReLU()
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            return self.relu(self.conv(x))
+
+    return SmallConvModel()
+
+
+def _normalize_outputs(result):
+    if isinstance(result, tuple):
+        return result
+    if isinstance(result, list):
+        return tuple(result)
+    return (result,)
+
+
+def _save_inference_output(ep, inp, output_path: str) -> None:
+    import torch
+
+    with torch.no_grad():
+        result = ep.module()(inp)
+        torch.save(_normalize_outputs(result), output_path)
+
+
+def _assert_python_runtime_only(torchtrt) -> None:
+    assert (
+        not torchtrt.ENABLED_FEATURES.torch_tensorrt_runtime
+    ), "C++ runtime should be disabled"
+
+
+def _compile_spec(inp, torchtrt) -> dict:
+    import torch
+
+    return {
+        "inputs": [
+            torchtrt.Input(inp.shape, dtype=torch.float, format=torch.contiguous_format)
+        ],
+        "ir": "dynamo",
+        "min_block_size": 1,
+        "cache_built_engines": False,
+        "reuse_cached_engines": False,
+    }
+
+
+def _hide_so_files(pkg_dir: str) -> list[tuple[str, str]]:
+    """Rename the libtorchtrt* shared libraries in pkg_dir/lib so _features.py sees them as absent."""
+    lib_dir = os.path.join(pkg_dir, "lib")
+    if not os.path.isdir(lib_dir):
+        return []
+    moved: list[tuple[str, str]] = []
+    for path in glob.glob(os.path.join(lib_dir, "libtorchtrt*")):
+        bak = path + ".bak"
+        os.rename(path, bak)
+        moved.append((bak, path))
+    return moved
+
+
+def _restore_so_files(moved: list[tuple[str, str]]) -> None:
+    for bak, orig in moved:
+        if os.path.exists(bak):
+            os.rename(bak, orig)
+
+
+def _do_load(args: argparse.Namespace) -> None:
+    """Load a pre-saved .pt2, run inference, save output."""
+    import torch
+    import torch_tensorrt as torchtrt
+
+    _assert_python_runtime_only(torchtrt)
+
+    ep = torchtrt.load(args.artifact)
+    inp = torch.load(args.input, weights_only=True)
+    _save_inference_output(ep, inp, args.output)
+
+
+def _do_save(args: argparse.Namespace) -> None:
+    """Compile and save a .pt2 artifact in Python-only mode."""
+    import torch
+    import torch_tensorrt as torchtrt
+
+    _assert_python_runtime_only(torchtrt)
+
+    model = _build_small_conv_model(torch).eval().cuda()
+    model.load_state_dict(torch.load(args.model_state, weights_only=True))
+    inp = torch.load(args.input, weights_only=True)
+    compile_spec = _compile_spec(inp, torchtrt)
+    exp_program = torchtrt.dynamo.trace(model, **compile_spec)
+    trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec)
+    torchtrt.save(trt_module, args.artifact, retrace=False)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    sub = parser.add_subparsers(dest="mode", required=True)
+
+    p_load = sub.add_parser("load")
+    p_load.add_argument("--artifact", required=True)
+    p_load.add_argument("--input", required=True)
+    p_load.add_argument("--output", required=True)
+
+    p_save = sub.add_parser("save")
+    p_save.add_argument("--model-state", required=True)
+    p_save.add_argument("--input", required=True)
+    p_save.add_argument("--artifact", required=True)
+
+    args = parser.parse_args()
+
+    import importlib.util
+
+    spec =
importlib.util.find_spec("torch_tensorrt") + assert spec and spec.origin + pkg_dir = os.path.dirname(spec.origin) + + handlers = {"load": _do_load, "save": _do_save} + moved = _hide_so_files(pkg_dir) + try: + handlers[args.mode](args) + finally: + _restore_so_files(moved) + + +if __name__ == "__main__": + main() diff --git a/tests/py/dynamo/models/test_autocast.py b/tests/py/dynamo/models/test_autocast.py index 9712df4a8d..7691ba1029 100644 --- a/tests/py/dynamo/models/test_autocast.py +++ b/tests/py/dynamo/models/test_autocast.py @@ -52,7 +52,6 @@ def forward(self, x): ep.module(), arg_inputs=inputs, min_block_size=1, - use_python_runtime=True, enable_autocast=True, autocast_low_precision_type=torch.float16, autocast_excluded_nodes={"^conv1$", "relu"}, @@ -142,7 +141,6 @@ def forward(self, x): ep, arg_inputs=inputs, min_block_size=1, - use_python_runtime=True, # Torch-TensorRT's autocast doesn't affect layers inside Pytorch autocast enable_autocast=True, autocast_low_precision_type=torch.bfloat16, @@ -221,7 +219,6 @@ def forward(self, x): ep.module(), arg_inputs=inputs, min_block_size=1, - use_python_runtime=False, # Torch-TensorRT's autocast doesn't affect layers inside Pytorch autocast enable_autocast=True, autocast_low_precision_type=torch.bfloat16, @@ -328,7 +325,6 @@ def forward(self, x, y): ep, arg_inputs=inputs, min_block_size=1, - use_python_runtime=False, # Torch-TensorRT's autocast doesn't affect layers inside Pytorch autocast enable_autocast=True, autocast_low_precision_type=torch.bfloat16, diff --git a/tests/py/dynamo/models/test_cross_runtime_serde.py b/tests/py/dynamo/models/test_cross_runtime_serde.py new file mode 100644 index 0000000000..8b926563f9 --- /dev/null +++ b/tests/py/dynamo/models/test_cross_runtime_serde.py @@ -0,0 +1,199 @@ +"""Tests for cross-runtime save/load of .pt2 TRT artifacts. + +Verifies that an ExportedProgram saved with the C++ Torch-TensorRT runtime can +be loaded and executed in a Python-only environment (no libtorchtrt*.so), and +that inference results match. 
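+
+Three directions are exercised: save with the C++ runtime and load in Python,
+save and load entirely in Python, and save in Python and load with C++.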
+""" + +from __future__ import annotations + +import os +import subprocess +import sys +import unittest + +import pytest +import torch +import torch_tensorrt as torchtrt +from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity + +assertions = unittest.TestCase() + +HELPER_SCRIPT = os.path.join(os.path.dirname(__file__), "_cross_runtime_load_helper.py") + + +class SmallConvModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) + self.relu = torch.nn.ReLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.relu(self.conv(x)) + + +def _compile_and_save( + model: torch.nn.Module, inp: torch.Tensor, path: str +) -> torch.Tensor: + """Compile *model* with TRT, save the artifact, return eager TRT output.""" + compile_spec = { + "inputs": [ + torchtrt.Input(inp.shape, dtype=torch.float, format=torch.contiguous_format) + ], + "ir": "dynamo", + "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + } + exp_program = torchtrt.dynamo.trace(model, **compile_spec) + trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec) + + with torch.no_grad(): + reference_output = trt_module(inp) + + torchtrt.save(trt_module, path, retrace=False) + return reference_output + + +def _tmp_paths(tmpdir): + base = str(tmpdir) + return { + "artifact": os.path.join(base, "trt.ep"), + "model_state": os.path.join(base, "model_state.pt"), + "input": os.path.join(base, "input.pt"), + "output": os.path.join(base, "output.pt"), + } + + +def _run_helper(args: list[str]) -> None: + """Run _cross_runtime_load_helper.py with given args; raise on failure.""" + result = subprocess.run( + [sys.executable, HELPER_SCRIPT] + args, + capture_output=True, + text=True, + timeout=300, + ) + if result.returncode != 0: + raise RuntimeError( + f"Python-only subprocess failed (rc={result.returncode}).\n" + f"--- stdout ---\n{result.stdout}\n" + f"--- stderr ---\n{result.stderr}" + ) + + +def _assert_outputs_match( + reference: torch.Tensor, loaded: torch.Tensor, label: str +) -> None: + reference = reference[0] if isinstance(reference, (tuple, list)) else reference + loaded = loaded[0] if isinstance(loaded, (tuple, list)) else loaded + cos_sim = cosine_similarity(reference.cpu(), loaded.cpu()) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"{label}: cosine similarity {cos_sim} < {COSINE_THRESHOLD}", + ) + + +@pytest.mark.unit +def test_save_cpp_load_python(tmpdir): + """Save with C++ runtime active, load in Python-only subprocess.""" + if not torchtrt.ENABLED_FEATURES.torch_tensorrt_runtime: + pytest.skip("C++ runtime not available; nothing to cross-test") + + paths = _tmp_paths(tmpdir) + + model = SmallConvModel().eval().cuda() + inp = torch.randn(1, 3, 32, 32, device="cuda") + + reference_output = _compile_and_save(model, inp, paths["artifact"]) + + torch.save(inp, paths["input"]) + _run_helper( + [ + "load", + "--artifact", + paths["artifact"], + "--input", + paths["input"], + "--output", + paths["output"], + ] + ) + python_output = torch.load(paths["output"], weights_only=True) + + _assert_outputs_match(reference_output, python_output, "save_cpp_load_python") + + +@pytest.mark.unit +def test_save_python_load_python(tmpdir): + """Save and load entirely in Python-only subprocesses.""" + paths = _tmp_paths(tmpdir) + + model = SmallConvModel().eval().cuda() + inp = torch.randn(1, 3, 32, 32, device="cuda") + + torch.save(model.state_dict(), paths["model_state"]) + 
torch.save(inp, paths["input"]) + + with torch.no_grad(): + pytorch_output = model(inp) + + _run_helper( + [ + "save", + "--model-state", + paths["model_state"], + "--input", + paths["input"], + "--artifact", + paths["artifact"], + ] + ) + _run_helper( + [ + "load", + "--artifact", + paths["artifact"], + "--input", + paths["input"], + "--output", + paths["output"], + ] + ) + python_output = torch.load(paths["output"], weights_only=True) + + _assert_outputs_match(pytorch_output, python_output, "save_python_load_python") + + +@pytest.mark.unit +def test_save_python_load_cpp(tmpdir): + """Save in Python-only subprocess, load in C++ runtime.""" + if not torchtrt.ENABLED_FEATURES.torch_tensorrt_runtime: + pytest.skip("C++ runtime not available; nothing to cross-test") + + paths = _tmp_paths(tmpdir) + + model = SmallConvModel().eval().cuda() + inp = torch.randn(1, 3, 32, 32, device="cuda") + + with torch.no_grad(): + pytorch_output = model(inp) + + torch.save(model.state_dict(), paths["model_state"]) + torch.save(inp, paths["input"]) + _run_helper( + [ + "save", + "--model-state", + paths["model_state"], + "--input", + paths["input"], + "--artifact", + paths["artifact"], + ] + ) + + loaded_ep = torchtrt.load(paths["artifact"]) + with torch.no_grad(): + cpp_output = loaded_ep.module()(inp) + + _assert_outputs_match(pytorch_output, cpp_output, "save_python_load_cpp") diff --git a/tests/py/dynamo/models/test_dtype_support.py b/tests/py/dynamo/models/test_dtype_support.py index 77309b7507..bf7d7d07ee 100644 --- a/tests/py/dynamo/models/test_dtype_support.py +++ b/tests/py/dynamo/models/test_dtype_support.py @@ -41,7 +41,6 @@ def forward(self, x): pass_through_build_failures=True, truncate_double=True, min_block_size=1, - use_python_runtime=False, cache_built_engines=False, reuse_cached_engines=False, ) @@ -81,7 +80,6 @@ def forward(self, x): pass_through_build_failures=True, truncate_double=True, min_block_size=1, - use_python_runtime=True, cache_built_engines=False, reuse_cached_engines=False, ) @@ -127,7 +125,6 @@ def forward(self, x): pass_through_build_failures=True, truncate_double=False, min_block_size=1, - use_python_runtime=False, cache_built_engines=False, reuse_cached_engines=False, ) @@ -168,7 +165,6 @@ def forward(self, x): pass_through_build_failures=True, truncate_double=False, min_block_size=1, - use_python_runtime=True, cache_built_engines=False, reuse_cached_engines=False, ) @@ -222,7 +218,6 @@ def forward(self, x): inputs=[in_tensor], pass_through_build_failures=True, min_block_size=1, - use_python_runtime=False, cache_built_engines=False, reuse_cached_engines=False, ) @@ -261,7 +256,6 @@ def forward(self, x): inputs=[in_tensor], pass_through_build_failures=True, min_block_size=1, - use_python_runtime=True, cache_built_engines=False, reuse_cached_engines=False, ) diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 8ccd48df2c..3bd6a25dbf 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -232,7 +232,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=True, min_block_size=1, immutable_weights=False, cache_built_engines=cache_built_engines, @@ -306,7 +305,6 @@ def test_dynamo_compile_with_custom_engine_cache(self): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=True, min_block_size=1, immutable_weights=False, cache_built_engines=cache_built_engines, 
@@ -363,7 +361,6 @@ def test_dynamo_compile_change_input_shape(self): trt_gm = torch_trt.dynamo.compile( torch.export.export(model, args=inputs), inputs=inputs, - use_python_runtime=False, min_block_size=1, immutable_weights=False, cache_built_engines=True, @@ -423,7 +420,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): model, backend="tensorrt", options={ - "use_python_runtime": False, "min_block_size": 1, "immutable_weights": False, "cache_built_engines": cache_built_engines, @@ -487,7 +483,6 @@ def test_torch_compile_with_custom_engine_cache(self): model, backend="tensorrt", options={ - "use_python_runtime": False, "min_block_size": 1, "immutable_weights": False, "cache_built_engines": cache_built_engines, @@ -542,7 +537,6 @@ def test_torch_trt_compile_change_input_shape(self): model, inputs=inputs, **{ - "use_python_runtime": True, "min_block_size": 1, "immutable_weights": False, "cache_built_engines": True, @@ -583,7 +577,6 @@ def forward(self, x): model, backend="tensorrt", options={ - "use_python_runtime": True, "min_block_size": 1, "immutable_weights": False, "cache_built_engines": True, @@ -682,7 +675,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): trt_gm = torch_trt.dynamo.compile( exp_program, inputs, - use_python_runtime=True, min_block_size=1, immutable_weights=False, cache_built_engines=False, @@ -733,7 +725,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=True, min_block_size=1, cache_built_engines=cache_built_engines, reuse_cached_engines=reuse_cached_engines, @@ -909,7 +900,6 @@ def remove_timing_cache(path=timing_cache_path): trt_gm = torch_trt.dynamo.compile( llama2_ep, inputs=[input_ids], - use_python_runtime=True, min_block_size=1, immutable_weights=False, truncate_double=True, @@ -961,7 +951,6 @@ def remove_timing_cache(path=timing_cache_path): trt_gm = torch_trt.dynamo.compile( llama2_ep, inputs=[input_ids], - use_python_runtime=True, min_block_size=1, truncate_double=True, device=DEVICE, diff --git a/tests/py/dynamo/models/test_model_refit.py b/tests/py/dynamo/models/test_model_refit.py index 919645ddc8..0af0ca5f89 100644 --- a/tests/py/dynamo/models/test_model_refit.py +++ b/tests/py/dynamo/models/test_model_refit.py @@ -50,15 +50,12 @@ def test_mapping(): for i in inputs ] min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -114,15 +111,12 @@ def forward(self, x): model2 = net().eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -177,15 +171,12 @@ def forward(self, x): model2 = net().eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, 
immutable_weights=False, ) @@ -240,15 +231,12 @@ def forward(self, x): model2 = net().eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -294,15 +282,12 @@ def test_refit_one_engine_with_weightmap(): model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -348,15 +333,12 @@ def test_refit_one_engine_no_map_with_weightmap(): model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -403,15 +385,12 @@ def test_refit_one_engine_with_wrong_weightmap(): model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -467,15 +446,12 @@ def test_refit_one_engine_bert_with_weightmap(): model2 = BertModel.from_pretrained("bert-base-uncased").eval().to("cuda") nn.init.xavier_normal_(model2.embeddings.word_embeddings.weight) min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -530,15 +506,12 @@ def test_refit_one_engine_inline_runtime_with_weightmap(tmpdir): model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs), strict=False) exp_program2 = torch.export.export(model2, tuple(inputs), strict=False) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -586,15 +559,12 @@ def test_refit_one_engine_python_runtime_with_weightmap(): model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, 
min_block_size=min_block_size, immutable_weights=False, ) @@ -655,8 +625,6 @@ def forward(self, x): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) @@ -664,7 +632,6 @@ def forward(self, x): trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, torch_executed_ops=torch_executed_ops, @@ -719,8 +686,6 @@ def forward(self, x): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) @@ -728,7 +693,6 @@ def forward(self, x): trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, torch_executed_ops=torch_executed_ops, @@ -779,15 +743,12 @@ def test_refit_one_engine_without_weightmap(): model2 = models.resnet18(pretrained=False).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -837,15 +798,12 @@ def test_refit_one_engine_bert_without_weightmap(): model2 = BertModel.from_pretrained("bert-base-uncased").eval().to("cuda") nn.init.xavier_normal_(model2.embeddings.word_embeddings.weight) min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -899,15 +857,12 @@ def test_refit_one_engine_inline_runtime_without_weightmap(tmpdir): model2 = models.resnet18(pretrained=False).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -953,15 +908,12 @@ def test_refit_one_engine_python_runtime_without_weightmap(): model2 = models.resnet18(pretrained=False).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=min_block_size, immutable_weights=False, ) @@ -1022,8 +974,6 @@ def forward(self, x): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) @@ -1031,7 +981,6 @@ def forward(self, x): trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, 
min_block_size=min_block_size, immutable_weights=False, torch_executed_ops=torch_executed_ops, @@ -1167,7 +1116,6 @@ def make_freqs() -> torch.Tensor: trt_gm = torchtrt.dynamo.compile( exp_program1, tuple(inputs), - use_python_runtime=True, min_block_size=1, immutable_weights=False, ) @@ -1239,7 +1187,6 @@ def make_freqs() -> torch.Tensor: trt_gm = torchtrt.dynamo.compile( exp_program1, tuple(inputs), - use_python_runtime=True, min_block_size=1, immutable_weights=False, ) @@ -1309,7 +1256,6 @@ def make_freqs() -> torch.Tensor: trt_gm = torchtrt.dynamo.compile( exp_program1, tuple(inputs), - use_python_runtime=True, min_block_size=1, immutable_weights=False, ) diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index 9c7b0f9026..eb0a32db76 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -498,7 +498,6 @@ def forward( use_fp32_acc=False, device="cuda:0", disable_tf32=True, - use_python_runtime=True, min_block_size=1, ) trt_output = trt_model(hidden_states) diff --git a/tests/py/dynamo/models/test_symint_scalar_input.py b/tests/py/dynamo/models/test_symint_scalar_input.py index 0d44e5d553..9ab3104064 100644 --- a/tests/py/dynamo/models/test_symint_scalar_input.py +++ b/tests/py/dynamo/models/test_symint_scalar_input.py @@ -20,8 +20,8 @@ @pytest.mark.unit -@pytest.mark.parametrize("use_python_runtime", [True, False]) -def test_symint_from_size_used_in_reshape(use_python_runtime): +@pytest.mark.parametrize("runtime_backend", ["python", "cpp"]) +def test_symint_from_size_used_in_reshape(runtime_backend): """ Test that a SymInt derived from tensor.size(0) can be used in reshape when it becomes a scalar placeholder input to the TRT subgraph. @@ -48,7 +48,6 @@ def forward(self, x, targets): compile_spec = { "min_block_size": 1, "pass_through_build_failures": True, - "use_python_runtime": use_python_runtime, } trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) @@ -59,15 +58,15 @@ def forward(self, x, targets): cos_sim = cosine_similarity(output_ref, output_trt) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"SymInt reshape test (python_runtime={use_python_runtime}) failed. Cosine sim: {cos_sim}", + msg=f"SymInt reshape test (runtime_backend={runtime_backend}) failed. Cosine sim: {cos_sim}", ) torch._dynamo.reset() @pytest.mark.unit -@pytest.mark.parametrize("use_python_runtime", [True, False]) -def test_scalar_tensor_input(use_python_runtime): +@pytest.mark.parametrize("runtime_backend", ["python", "cpp"]) +def test_scalar_tensor_input(runtime_backend): """ Test that a 0-dim scalar tensor input (e.g., cache_length) is handled correctly during symbolic shape extraction and TRT compilation. @@ -85,7 +84,6 @@ def forward(self, x, offset): compile_spec = { "min_block_size": 1, "pass_through_build_failures": True, - "use_python_runtime": use_python_runtime, } trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) @@ -96,15 +94,15 @@ def forward(self, x, offset): cos_sim = cosine_similarity(output_ref, output_trt) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"Scalar tensor input test (python_runtime={use_python_runtime}) failed. Cosine sim: {cos_sim}", + msg=f"Scalar tensor input test (runtime_backend={runtime_backend}) failed. 
Cosine sim: {cos_sim}", ) torch._dynamo.reset() @pytest.mark.unit -@pytest.mark.parametrize("use_python_runtime", [True, False]) -def test_symint_with_index_and_reshape(use_python_runtime): +@pytest.mark.parametrize("runtime_backend", ["python", "cpp"]) +def test_symint_with_index_and_reshape(runtime_backend): """ Full reproduction of issue #4107 pattern: symbolic size from int64 tensor, used with index operation and reshape. @@ -138,7 +136,6 @@ def forward(self, x, targets, cache_length): "min_block_size": 1, "truncate_double": True, "pass_through_build_failures": True, - "use_python_runtime": use_python_runtime, } trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) @@ -149,15 +146,15 @@ def forward(self, x, targets, cache_length): cos_sim = cosine_similarity(output_ref, output_trt) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"Issue 4107 repro test (python_runtime={use_python_runtime}) failed. Cosine sim: {cos_sim}", + msg=f"Issue 4107 repro test (runtime_backend={runtime_backend}) failed. Cosine sim: {cos_sim}", ) torch._dynamo.reset() @pytest.mark.unit -@pytest.mark.parametrize("use_python_runtime", [True, False]) -def test_symint_with_different_batch_sizes(use_python_runtime): +@pytest.mark.parametrize("runtime_backend", ["python", "cpp"]) +def test_symint_with_different_batch_sizes(runtime_backend): """ Test that after compilation with a SymInt scalar input, the model produces correct results with different batch sizes. @@ -179,7 +176,6 @@ def forward(self, x, targets): compile_spec = { "min_block_size": 1, "pass_through_build_failures": True, - "use_python_runtime": use_python_runtime, } trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) @@ -194,7 +190,7 @@ def forward(self, x, targets): cos_sim = cosine_similarity(output_ref, output_trt) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"Varying batch size test (python_runtime={use_python_runtime}) failed at B={batch_size}. Cosine sim: {cos_sim}", + msg=f"Varying batch size test (runtime_backend={runtime_backend}) failed at B={batch_size}. 
Cosine sim: {cos_sim}", ) torch._dynamo.reset() diff --git a/tests/py/dynamo/models/test_weight_stripped_engine.py b/tests/py/dynamo/models/test_weight_stripped_engine.py index 268b43ff84..c94e9064c6 100644 --- a/tests/py/dynamo/models/test_weight_stripped_engine.py +++ b/tests/py/dynamo/models/test_weight_stripped_engine.py @@ -34,7 +34,6 @@ def test_three_ways_to_compile(self): exp_program = torch.export.export(pyt_model, example_inputs) settings = { - "use_python_runtime": False, "min_block_size": 1, "immutable_weights": False, "strip_engine_weights": False, @@ -83,7 +82,6 @@ def test_compile_weight_stripped_engine(self): example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) settings = { - "use_python_runtime": False, "min_block_size": 1, "immutable_weights": False, "strip_engine_weights": True, @@ -169,7 +167,6 @@ def test_weight_stripped_engine_results(self): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=True, min_block_size=1, immutable_weights=False, strip_engine_weights=True, @@ -193,7 +190,6 @@ def test_weight_stripped_engine_results(self): pyt_model, backend="tensorrt", options={ - "use_python_runtime": False, "min_block_size": 1, "immutable_weights": False, "cache_built_engines": False, @@ -239,7 +235,6 @@ def test_engine_caching_saves_weight_stripped_engine(self): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(example_inputs), - use_python_runtime=True, min_block_size=1, immutable_weights=False, strip_engine_weights=False, @@ -316,7 +311,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=True, min_block_size=1, immutable_weights=False, cache_built_engines=cache_built_engines, @@ -399,7 +393,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): pyt_model, backend="tensorrt", options={ - "use_python_runtime": False, "min_block_size": 1, "immutable_weights": False, "cache_built_engines": cache_built_engines, @@ -475,7 +468,6 @@ def forward(self, x): pyt_model, backend="tensorrt", options={ - "use_python_runtime": True, "min_block_size": 1, "immutable_weights": False, "cache_built_engines": True, @@ -517,7 +509,6 @@ def forward(self, x): inputs=tuple(inputs), min_block_size=1, immutable_weights=False, - use_python_runtime=True, strip_engine_weights=True, refit_identical_engine_weights=False, ) @@ -553,15 +544,9 @@ def test_two_TRTRuntime_in_refitting(self): pyt_results = pyt_model(*inputs) for i in range(2): - if i == 0: - use_python_runtime = True - else: - use_python_runtime = False - trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, min_block_size=1, immutable_weights=False, strip_engine_weights=True, @@ -576,7 +561,7 @@ def test_two_TRTRuntime_in_refitting(self): cos_sim = cosine_similarity(pyt_results, refitted_output) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"{'PythonTorchTensorRTModule' if use_python_runtime else 'TorchTensorRTModule'} outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"iteration {i}: TorchTensorRTModule outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) @unittest.skip("Waiting for implementation") @@ -600,7 +585,6 @@ def test_refit_identical_engine_weights(self): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(example_inputs), - use_python_runtime=True, min_block_size=1, immutable_weights=False, strip_engine_weights=True, @@ -652,7 +636,6 @@ def test_refit_weight_stripped_engine_multiple_times(self): trt_gm = torch_trt.dynamo.compile( exp_program, inputs, - use_python_runtime=True, min_block_size=1, immutable_weights=False, cache_built_engines=False, @@ -692,7 +675,6 @@ def test_refit_weight_stripped_engine_multiple_times(self): pyt_model, backend="tensorrt", options={ - "use_python_runtime": False, "min_block_size": 1, "immutable_weights": False, "cache_built_engines": False, diff --git a/tests/py/dynamo/partitioning/test_000_resource_partitioning.py b/tests/py/dynamo/partitioning/test_000_resource_partitioning.py index df0ff3ced0..4e51f16156 100644 --- a/tests/py/dynamo/partitioning/test_000_resource_partitioning.py +++ b/tests/py/dynamo/partitioning/test_000_resource_partitioning.py @@ -44,12 +44,9 @@ def forward(self, x): model.to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) compilation_options = { - "use_python_runtime": use_python_runtime, "min_block_size": 1, "immutable_weights": True, "reuse_cached_engines": False, diff --git a/tests/py/dynamo/partitioning/test_001_resource_partitioning.py b/tests/py/dynamo/partitioning/test_001_resource_partitioning.py index 0e897fba25..81c34e2bf5 100644 --- a/tests/py/dynamo/partitioning/test_001_resource_partitioning.py +++ b/tests/py/dynamo/partitioning/test_001_resource_partitioning.py @@ -57,12 +57,9 @@ def forward(self, x): model.to("cuda") inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")] - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) compilation_options = { - "use_python_runtime": use_python_runtime, "min_block_size": 1, "immutable_weights": True, "reuse_cached_engines": False, @@ -141,12 +138,9 @@ def forward(self, x): model.to("cuda") inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")] - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) compilation_options = { - "use_python_runtime": use_python_runtime, "min_block_size": 1, "immutable_weights": True, "reuse_cached_engines": False, @@ -276,12 +270,9 @@ def forward(self, x): model.to("cuda") inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")] - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) compilation_options = { - "use_python_runtime": use_python_runtime, "min_block_size": 1, "immutable_weights": True, "reuse_cached_engines": False, @@ -378,12 +369,9 @@ def forward(self, x): model.to("cuda") inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")] - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) compilation_options = { - "use_python_runtime": use_python_runtime, "min_block_size": 1, "immutable_weights": True, "reuse_cached_engines": False, diff --git a/tests/py/dynamo/runtime/test_000_convert_module_to_trt_engine.py b/tests/py/dynamo/runtime/test_000_convert_module_to_trt_engine.py index 0b8720c6cd..924546bbc6 100644 --- a/tests/py/dynamo/runtime/test_000_convert_module_to_trt_engine.py +++ b/tests/py/dynamo/runtime/test_000_convert_module_to_trt_engine.py @@ -2,7 +2,7 @@ import torch import torch_tensorrt -from 
torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule +from torch_tensorrt.dynamo.runtime import TorchTensorRTModule from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity @@ -27,9 +27,7 @@ def forward(self, a, b): ) # Inference on TRT Engine - py_trt_module = PythonTorchTensorRTModule( - trt_engine_str, ["a", "b"], ["output0"] - ) + py_trt_module = TorchTensorRTModule(trt_engine_str, ["a", "b"], ["output0"]) trt_output = py_trt_module(input_data_0, input_data_1).cpu() # Inference on PyTorch model diff --git a/tests/py/dynamo/runtime/test_000_python_runtime.py b/tests/py/dynamo/runtime/test_000_python_runtime.py index 0f94f6a704..0d0ace20aa 100644 --- a/tests/py/dynamo/runtime/test_000_python_runtime.py +++ b/tests/py/dynamo/runtime/test_000_python_runtime.py @@ -26,7 +26,6 @@ def forward(self, x): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -57,7 +56,6 @@ def forward(self, x, y): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -92,7 +90,6 @@ def forward(self, x, y): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, offload_module_to_cpu=True, ) fx_graph.cuda() diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py b/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py index 657aa25f96..65dd60fd5d 100644 --- a/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py +++ b/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py @@ -65,7 +65,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=False, ) result_samples = [] @@ -107,7 +106,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=False, offload_module_to_cpu=True, ) optimized_model.cuda() @@ -152,7 +150,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=False, ) result_samples = [] @@ -195,7 +192,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=False, offload_module_to_cpu=True, ) optimized_model.cuda() @@ -248,7 +244,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=False, ) result_samples = [] diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py index 1dc7d32eb8..57c35e4e54 100644 --- a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py +++ b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py @@ -41,7 +41,6 @@ def forward(self, input): "torch_compile", inputs, min_block_size=1, - use_python_runtime=True, ) with torch_tensorrt.runtime.enable_cudagraphs(optimized_model) as _: self.assertTrue(torch_tensorrt.runtime.get_cudagraphs_mode()) @@ -64,7 +63,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, ) result_samples = [] @@ -108,7 +106,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, offload_module_to_cpu=True, ) optimized_model.cuda() @@ -154,7 
+151,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=True, ) result_samples = [] @@ -198,7 +194,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=True, offload_module_to_cpu=True, ) optimized_model.cuda() @@ -250,7 +245,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=True, ) result_samples = [] diff --git a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py index 57670c7bf8..4dbbda9d2a 100644 --- a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py +++ b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py @@ -14,7 +14,7 @@ cosine_similarity, get_model_device, ) -from torch_tensorrt.runtime import PythonTorchTensorRTModule, TorchTensorRTModule +from torch_tensorrt.runtime import TorchTensorRTModule assertions = unittest.TestCase() @@ -72,7 +72,7 @@ def forward(self, a, b): ) # Inference on TRT Engine - trt_module = PythonTorchTensorRTModule( + trt_module = TorchTensorRTModule( trt_engine_str, ["a", "b"], ["output0"], @@ -88,7 +88,7 @@ def forward(self, a, b): trt_output = trt_module(input_data_0, input_data_1).cpu() trt_module.setup_engine() - assertions.assertTrue(trt_module.engine, msg="Engine was not setup") + assertions.assertTrue(trt_module.engine is not None, msg="Engine was not setup") trt_output = trt_module(input_data_0, input_data_1).cpu() @@ -128,7 +128,7 @@ def forward(self, x): assert get_model_device(model).type == "cpu" model.cuda() # Inference on TRT Engine - trt_module = PythonTorchTensorRTModule( + trt_module = TorchTensorRTModule( trt_engine_str, ["x"], ["output0"], @@ -144,7 +144,7 @@ def forward(self, x): trt_output = trt_module(input_data).cpu() trt_module.setup_engine() - assertions.assertTrue(trt_module.engine, msg="Engine was not setup") + assertions.assertTrue(trt_module.engine is not None, msg="Engine was not setup") trt_output = trt_module(input_data).cpu() @@ -225,7 +225,6 @@ def test_lazy_engine_init_py_e2e(self): "min_block_size": 1, "ir": "dynamo", "lazy_engine_init": True, - "use_python_runtime": True, "cache_built_engines": False, "reuse_cached_engines": False, } @@ -264,7 +263,6 @@ def test_lazy_engine_init_cpp_e2e(self): "min_block_size": 1, "ir": "dynamo", "lazy_engine_init": True, - "use_python_runtime": False, "cache_built_engines": False, "reuse_cached_engines": False, } @@ -303,7 +301,6 @@ def test_lazy_engine_init_cpp_serialization(self): "min_block_size": 1, "ir": "dynamo", "lazy_engine_init": True, - "use_python_runtime": False, "cache_built_engines": False, "reuse_cached_engines": False, } @@ -351,7 +348,6 @@ def forward(self, a, b): "min_block_size": 1, "ir": "dynamo", "lazy_engine_init": True, - "use_python_runtime": True, "torch_executed_ops": {"torch.ops.aten.sub.Tensor"}, "cache_built_engines": False, "reuse_cached_engines": False, @@ -394,7 +390,6 @@ def forward(self, a, b): "min_block_size": 1, "ir": "dynamo", "lazy_engine_init": True, - "use_python_runtime": False, "torch_executed_ops": {"torch.ops.aten.sub.Tensor"}, "cache_built_engines": False, "reuse_cached_engines": False, diff --git a/tests/py/dynamo/runtime/test_003_safe_mode.py b/tests/py/dynamo/runtime/test_003_safe_mode.py index 0fde0773ed..d144725081 100644 --- a/tests/py/dynamo/runtime/test_003_safe_mode.py +++ 
b/tests/py/dynamo/runtime/test_003_safe_mode.py @@ -46,7 +46,6 @@ def forward(self, x): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -90,7 +89,6 @@ def forward(self, x): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=False, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() diff --git a/tests/py/dynamo/runtime/test_004_weight_streaming.py b/tests/py/dynamo/runtime/test_004_weight_streaming.py index 58fe53ece0..5b8b0b94b6 100644 --- a/tests/py/dynamo/runtime/test_004_weight_streaming.py +++ b/tests/py/dynamo/runtime/test_004_weight_streaming.py @@ -39,13 +39,7 @@ def setUp(self): def tearDown(self): torchtrt.runtime.set_cudagraphs_mode(False) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_weight_streaming_default(self, _, use_python_runtime): + def test_weight_streaming_default(self): model = SampleModel().eval().cuda() input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()] exp_program = torch.export.export(model, tuple(input)) @@ -55,7 +49,6 @@ def test_weight_streaming_default(self, _, use_python_runtime): min_block_size=1, cache_built_engines=False, reuse_cached_engines=False, - use_python_runtime=use_python_runtime, enable_weight_streaming=True, ) # Checking if default weight streaming budget(automatic) is applied when compiler option was provided @@ -89,13 +82,7 @@ def test_weight_streaming_default(self, _, use_python_runtime): ) torch._dynamo.reset() - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_weight_streaming_manual(self, _, use_python_runtime): + def test_weight_streaming_manual(self): model = SampleModel().eval().cuda() input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()] exp_program = torch.export.export(model, tuple(input)) @@ -105,7 +92,6 @@ def test_weight_streaming_manual(self, _, use_python_runtime): min_block_size=1, cache_built_engines=False, reuse_cached_engines=False, - use_python_runtime=use_python_runtime, enable_weight_streaming=True, ) # Weight streaming budget is applied manually. 
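The automatic/manual budget split referenced in the hunks above works roughly as follows. A minimal sketch, assuming the torchtrt.runtime.weight_streaming context manager these tests drive; the toy model and input shapes here are illustrative, not the test's SampleModel:

    import torch
    import torch_tensorrt as torchtrt

    class TinyModel(torch.nn.Module):  # stand-in for the test's model
        def __init__(self) -> None:
            super().__init__()
            self.fc1 = torch.nn.Linear(16, 64)
            self.fc2 = torch.nn.Linear(64, 16)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.fc2(torch.relu(self.fc1(x)))

    model = TinyModel().eval().cuda()
    inputs = [torch.randn(8, 16, device="cuda")]
    exp_program = torch.export.export(model, tuple(inputs))
    trt_model = torchtrt.dynamo.compile(
        exp_program,
        inputs,
        min_block_size=1,
        use_explicit_typing=True,      # weight streaming requires strong typing
        enable_weight_streaming=True,  # must be set at compile time
    )

    # Budgets are applied through the runtime context: the automatic budget is
    # used by default; a manual byte budget can be set as below.
    with torchtrt.runtime.weight_streaming(trt_model) as ctx:
        ctx.device_budget = int(ctx.total_device_budget * 0.5)
        out = trt_model(*inputs)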
@@ -148,13 +134,11 @@ def test_weight_streaming_manual(self, _, use_python_runtime): @parameterized.expand( [ - ("python_runtime", True, False), - ("python_runtime_multi_rt", True, True), - ("cpp_runtime", False, False), - ("cpp_runtime_multi_rt", False, True), + ("default", False), + ("multi_rt", True), ] ) - def test_weight_streaming_invalid_usage(self, _, use_python_runtime, multi_rt): + def test_weight_streaming_invalid_usage(self, _, multi_rt): model = SampleModel().eval().cuda() input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()] exp_program = torch.export.export(model, tuple(input)) @@ -167,7 +151,6 @@ def test_weight_streaming_invalid_usage(self, _, use_python_runtime, multi_rt): torch_executed_ops=( {"torch.ops.aten.convolution.default"} if multi_rt else {} ), - use_python_runtime=use_python_runtime, enable_weight_streaming=True, ) @@ -194,13 +177,7 @@ def test_weight_streaming_invalid_usage(self, _, use_python_runtime, multi_rt): torch._dynamo.reset() - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_weight_streaming_multi_rt(self, _, use_python_runtime): + def test_weight_streaming_multi_rt(self): model = SampleModel().eval().cuda() input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()] exp_program = torch.export.export(model, tuple(input)) @@ -212,7 +189,6 @@ def test_weight_streaming_multi_rt(self, _, use_python_runtime): cache_built_engines=False, reuse_cached_engines=False, torch_executed_ops={"torch.ops.aten.convolution.default"}, - use_python_runtime=use_python_runtime, enable_weight_streaming=True, ) @@ -238,13 +214,7 @@ def test_weight_streaming_multi_rt(self, _, use_python_runtime): torch._dynamo.reset() - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_weight_streaming_cudagraphs(self, _, use_python_runtime): + def test_weight_streaming_cudagraphs(self): model = SampleModel().eval().cuda() input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()] exp_program = torch.export.export(model, tuple(input)) @@ -256,7 +226,6 @@ def test_weight_streaming_cudagraphs(self, _, use_python_runtime): cache_built_engines=False, reuse_cached_engines=False, torch_executed_ops={"torch.ops.aten.convolution.default"}, - use_python_runtime=use_python_runtime, enable_weight_streaming=True, ) @@ -287,16 +256,10 @@ def test_weight_streaming_cudagraphs(self, _, use_python_runtime): ) torch._dynamo.reset() - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) @unittest.skipIf( is_orin(), "There is a bug on Orin platform, skip for now until bug is fixed" ) - def test_runtime_state_change(self, _, use_python_runtime): + def test_runtime_state_change(self): class SampleModel(torch.nn.Module): def __init__(self): super().__init__() @@ -352,7 +315,6 @@ def forward(self, x, b=None, c=None, d=None, e=[]): "reuse_cached_engines": False, "enable_weight_streaming": True, "torch_executed_ops": {"torch.ops.aten.mul.Tensor"}, - "use_python_runtime": use_python_runtime, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) optimized_model = torchtrt.dynamo.compile( diff --git a/tests/py/dynamo/runtime/test_005_dynamic_allocation.py b/tests/py/dynamo/runtime/test_005_dynamic_allocation.py index 71847c8fbc..7dd22bb794 100644 --- a/tests/py/dynamo/runtime/test_005_dynamic_allocation.py +++ b/tests/py/dynamo/runtime/test_005_dynamic_allocation.py @@ -29,7 +29,6 @@ def forward(self, x): settings = { "ir": "dynamo", - "use_python_runtime": False, 
"immutable_weights": False, "lazy_engine_init": True, "dynamically_allocate_resources": True, diff --git a/tests/py/dynamo/runtime/test_empty_input.py b/tests/py/dynamo/runtime/test_empty_input.py index 793eafb82c..59190f7d95 100644 --- a/tests/py/dynamo/runtime/test_empty_input.py +++ b/tests/py/dynamo/runtime/test_empty_input.py @@ -45,66 +45,32 @@ class TestConcatEmptyTensor(TestCase): @parameterized.expand( [ ( - "python_runtime_model_one_empty_0", - True, + "model_one_empty_0", ConcatEmptyModel, "two_inputs", (0,), ), ( - "cpp_runtime_model_one_empty_0", - False, + "model_one_empty_0_4", ConcatEmptyModel, "two_inputs", - (0,), - ), - ( - "python_runtime_model_one_empty_0_4", - True, - ConcatEmptyModel, - "two_inputs", - (0, 4), - ), - ( - "cpp_runtime_model_one_empty_0_4", - False, - ConcatEmptyModel, - "two_inputs", - (0, 4), - ), - ( - "python_runtime_model_two_empty_0_4", - True, - ConcatEmptyModelEmptyConstant, - "one_input", (0, 4), ), ( - "cpp_runtime_model_two_empty_0_4", - False, + "model_two_empty_0_4", ConcatEmptyModelEmptyConstant, "one_input", (0, 4), ), ( - "python_runtime_model_three_empty_0", - True, - ConcatEmptyModelEmptyConstantMisMatchDim, - "one_input", - (0,), - ), - ( - "cpp_runtime_model_three_empty_0", - False, + "model_three_empty_0", ConcatEmptyModelEmptyConstantMisMatchDim, "one_input", (0,), ), ] ) - def test_concat_empty_with_nonempty( - self, _, use_python_runtime, model_class, input_type, empty_shape - ): + def test_concat_empty_with_nonempty(self, _, model_class, input_type, empty_shape): """ Test concatenation of empty tensor with non-empty tensor along a specific dimension using Torch-TensorRT compiled model. @@ -127,7 +93,6 @@ def test_concat_empty_with_nonempty( "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) # Run reference model @@ -146,13 +111,11 @@ def test_concat_empty_with_nonempty( @parameterized.expand( [ - ("python_runtime_empty_0", True, (0,)), - ("cpp_runtime_empty_0", False, (0,)), - ("python_runtime_empty_0_4", True, (0, 4)), - ("cpp_runtime_empty_0_4", False, (0, 4)), + ("empty_0", (0,)), + ("empty_0_4", (0, 4)), ] ) - def test_concat_nonempty_with_empty(self, _, use_python_runtime, empty_shape): + def test_concat_nonempty_with_empty(self, _, empty_shape): """ Concatenate non-empty tensor with empty tensor (opposite order) """ @@ -167,7 +130,6 @@ def test_concat_nonempty_with_empty(self, _, use_python_runtime, empty_shape): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) ref_out = model(*inputs) @@ -188,13 +150,7 @@ class TestEmptyTensorMemoryLeak(TestCase): do not cause memory leaks and produce correct results. """ - @parameterized.expand( - [ - ("cpp_runtime", False), - ("python_runtime", True), - ] - ) - def test_repeated_empty_tensor_no_leak_and_correct(self, _, use_python_runtime): + def test_repeated_empty_tensor_no_leak_and_correct(self): """ Run many inferences with empty tensor input to verify: 1. 
Memory doesn't grow (placeholder is reused, not reallocated) @@ -211,7 +167,6 @@ def test_repeated_empty_tensor_no_leak_and_correct(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) # Record initial GPU memory diff --git a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py index f9eab20bfe..59532d23b9 100644 --- a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py +++ b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py @@ -221,7 +221,6 @@ def test_resnet18(): inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] compile_spec = { - "use_python_runtime": False, "immutable_weights": False, } @@ -267,7 +266,6 @@ def test_save(): # Compile the module for the first time and save it. # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ compile_spec = { - "use_python_runtime": False, "immutable_weights": False, } @@ -306,7 +304,6 @@ def test_resnet18_modify_attribute(): inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] compile_spec = { - "use_python_runtime": False, "immutable_weights": False, } @@ -350,7 +347,6 @@ def test_resnet18_modify_attribute_no_refit(): inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] compile_spec = { - "use_python_runtime": False, "immutable_weights": False, } diff --git a/tests/py/dynamo/runtime/test_output_allocator.py b/tests/py/dynamo/runtime/test_output_allocator.py index c94020705c..7c849c870c 100644 --- a/tests/py/dynamo/runtime/test_output_allocator.py +++ b/tests/py/dynamo/runtime/test_output_allocator.py @@ -3,7 +3,6 @@ import pytest import torch import torch_tensorrt -from parameterized import parameterized from torch.testing._internal.common_utils import TestCase, run_tests from ..testing_utilities import DECIMALS_OF_AGREEMENT @@ -48,13 +47,7 @@ def forward(self, input): "TensorRT RTX does not support nonzero which are required for this test", ) class TestOutputAllocatorStaticModel(TestCase): - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): + def test_cudagraphs_and_output_allocator(self): model = StaticModel().eval().cuda() inputs = [torch.randn((2, 3), dtype=torch.float).cuda()] compiled_model = torch_tensorrt.compile( @@ -62,7 +55,6 @@ def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) ref_out = model(*inputs) @@ -89,13 +81,7 @@ def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): msg="Output Allocator runtime outputs don't match with the original model.", ) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_default(self, _, use_python_runtime): + def test_default(self): """ Static models use standard execution with cudagraphs=False by default. 
""" @@ -106,7 +92,6 @@ def test_default(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) standard_out = compiled_model(*inputs) ref_out = model(*inputs) @@ -118,13 +103,7 @@ def test_default(self, _, use_python_runtime): msg="Default standard execution (cudagraphs=False) outputs don't match with the original model.", ) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_combination_of_cg_and_oa(self, _, use_python_runtime): + def test_combination_of_cg_and_oa(self): model = StaticModel().eval().cuda() inputs = [torch.randn((2, 3), dtype=torch.float).cuda()] compiled_model = torch_tensorrt.compile( @@ -132,7 +111,6 @@ def test_combination_of_cg_and_oa(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) with pytest.raises( @@ -161,13 +139,7 @@ def test_combination_of_cg_and_oa(self, _, use_python_runtime): "TensorRT RTX does not support nonzero which are required for this test", ) class TestOutputAllocatorDDSModel(TestCase): - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): + def test_cudagraphs_and_output_allocator(self): model = DDSModel().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) compiled_model = torch_tensorrt.compile( @@ -175,7 +147,6 @@ def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) with pytest.raises( @@ -199,13 +170,7 @@ def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): msg="Output Allocator runtime outputs don't match with the original model.", ) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_default(self, _, use_python_runtime): + def test_default(self): """ DDS models use OutputAllocator by default. """ @@ -216,7 +181,6 @@ def test_default(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) oa_out = compiled_model(*inputs) ref_out = model(*inputs) @@ -228,13 +192,7 @@ def test_default(self, _, use_python_runtime): msg="Default Output Allocator runtime outputs don't match with the original model.", ) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_combination_of_cg_and_oa(self, _, use_python_runtime): + def test_combination_of_cg_and_oa(self): model = DDSModel().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) compiled_model = torch_tensorrt.compile( @@ -242,7 +200,6 @@ def test_combination_of_cg_and_oa(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) with pytest.raises( @@ -275,13 +232,7 @@ class TestOutputAllocatorDDSOpWithReductionOpModel(TestCase): The DDSOpWithReductionOpModel is a model that contains DDS op + reduction op. 
""" - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): + def test_cudagraphs_and_output_allocator(self): model = DDSOpWithReductionOpModel().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) compiled_model = torch_tensorrt.compile( @@ -289,7 +240,6 @@ def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) with pytest.raises( @@ -313,13 +263,7 @@ def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): msg="Output Allocator runtime outputs don't match with the original model.", ) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_default(self, _, use_python_runtime): + def test_default(self): """ The DDSOpWithReductionOpModel is a model that contains nonzero op + reduction op, in which nonzero op requires output allocator. """ @@ -330,7 +274,6 @@ def test_default(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) oa_out = compiled_model(*inputs) ref_out = model(*inputs) @@ -342,13 +285,7 @@ def test_default(self, _, use_python_runtime): msg="Default Output Allocator runtime outputs don't match with the original model.", ) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_combination_of_cg_and_oa(self, _, use_python_runtime): + def test_combination_of_cg_and_oa(self): model = DDSOpWithReductionOpModel().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) compiled_model = torch_tensorrt.compile( @@ -356,7 +293,6 @@ def test_combination_of_cg_and_oa(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, ) with pytest.raises( @@ -385,13 +321,7 @@ def test_combination_of_cg_and_oa(self, _, use_python_runtime): "TensorRT RTX does not support nonzero which are required for this test", ) class TestOutputAllocatorDDSModelWithGraphBreak(TestCase): - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): + def test_cudagraphs_and_output_allocator(self): model = DDSModel2().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) compiled_model = torch_tensorrt.compile( @@ -399,7 +329,6 @@ def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, torch_executed_ops={"torch.ops.aten.abs.default"}, ) @@ -424,13 +353,7 @@ def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): msg="Output Allocator runtime outputs don't match with the original model.", ) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_default(self, _, use_python_runtime): + def test_default(self): """ Use Output Allocator by default. 
""" @@ -441,7 +364,6 @@ def test_default(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, torch_executed_ops={"torch.ops.aten.abs.default"}, ) oa_out = compiled_model(*inputs) @@ -454,13 +376,7 @@ def test_default(self, _, use_python_runtime): msg="Default Output Allocator runtime outputs don't match with the original model.", ) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_combination_of_cg_and_oa(self, _, use_python_runtime): + def test_combination_of_cg_and_oa(self): model = DDSModel2().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) compiled_model = torch_tensorrt.compile( @@ -468,7 +384,6 @@ def test_combination_of_cg_and_oa(self, _, use_python_runtime): "dynamo", inputs, min_block_size=1, - use_python_runtime=use_python_runtime, torch_executed_ops={"torch.ops.aten.abs.default"}, ) diff --git a/tests/py/dynamo/runtime/test_pre_allocated_outputs.py b/tests/py/dynamo/runtime/test_pre_allocated_outputs.py index a9f8cfbbe5..c185416c3a 100644 --- a/tests/py/dynamo/runtime/test_pre_allocated_outputs.py +++ b/tests/py/dynamo/runtime/test_pre_allocated_outputs.py @@ -1,6 +1,5 @@ import torch import torch_tensorrt as torchtrt -from parameterized import parameterized from torch.testing._internal.common_utils import TestCase, run_tests INPUT_SIZE = (3, 16, 16) @@ -8,13 +7,7 @@ class TestPreAllocatedOutputs(TestCase): - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_pre_allocated_outputs_default(self, _, use_python_runtime): + def test_pre_allocated_outputs_default(self): class SampleModel(torch.nn.Module): def forward(self, x): return torch.softmax((x + 2) * 7, dim=0) @@ -30,7 +23,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=use_python_runtime, ) ref_out_list = [] @@ -54,13 +46,7 @@ def forward(self, x): torch._dynamo.reset() - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_pre_allocated_outputs_dynamic(self, _, use_python_runtime): + def test_pre_allocated_outputs_dynamic(self): class SampleModel(torch.nn.Module): def forward(self, x): return torch.relu((x + 2) * 0.5) @@ -81,7 +67,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=use_python_runtime, ) input_list = [] @@ -141,38 +126,29 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, torch_executed_ops={torch.ops.aten.add.Tensor}, ) with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model): _ = optimized_model(inputs[0]) - output_tensors = [ - trt_mod.pre_allocated_outputs + output_tensors_ptr = [ + [t.data_ptr() for t in trt_mod.pre_allocated_outputs] for name, trt_mod in optimized_model.named_children() if "_run_on_acc" in name ] _ = optimized_model(inputs[0]) - new_output_tensors = [ - trt_mod.pre_allocated_outputs + new_output_tensors_ptr = [ + [t.data_ptr() for t in trt_mod.pre_allocated_outputs] for name, trt_mod in optimized_model.named_children() if "_run_on_acc" in name ] # Run to run, output of intermediate engine is not reallocated - self.assertTrue(output_tensors[0] is new_output_tensors[0]) + self.assertEqual(output_tensors_ptr[0], new_output_tensors_ptr[0]) # Run to run, output of output engine is reallocated - 
self.assertTrue(output_tensors[1] is not new_output_tensors[1]) - - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_pre_allocated_outputs_unowned_outputs_api_check( - self, _, use_python_runtime - ): + self.assertNotEqual(output_tensors_ptr[1], new_output_tensors_ptr[1]) + + def test_pre_allocated_outputs_unowned_outputs_api_check(self): class SampleModel(torch.nn.Module): def forward(self, x): return torch.softmax(x * 7 + 2, dim=0) @@ -188,7 +164,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=use_python_runtime, torch_executed_ops={torch.ops.aten.add.Tensor}, ) @@ -207,13 +182,7 @@ def forward(self, x): ) ) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_pre_allocated_outputs_unowned_outputs(self, _, use_python_runtime): + def test_pre_allocated_outputs_unowned_outputs(self): class SampleModel(torch.nn.Module): def forward(self, x): return torch.softmax(x * 7 + 2, dim=0) @@ -229,7 +198,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=use_python_runtime, torch_executed_ops={torch.ops.aten.add.Tensor}, ) @@ -282,7 +250,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, torch_executed_ops={torch.ops.aten.add.Tensor}, ) @@ -306,15 +273,7 @@ def forward(self, x): # Run to run, output of output engine is reallocated self.assertTrue(output_tensors[1] != new_output_tensors[1]) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_pre_allocated_outputs_unowned_outputs_multiple_outputs_api_check( - self, _, use_python_runtime - ): + def test_pre_allocated_outputs_unowned_outputs_multiple_outputs_api_check(self): class SampleModel(torch.nn.Module): def forward(self, x): y = torch.ops.aten.mul(x, 7) @@ -333,7 +292,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=use_python_runtime, torch_executed_ops={torch.ops.aten.add.Tensor}, ) @@ -352,15 +310,7 @@ def forward(self, x): ) ) - @parameterized.expand( - [ - ("python_runtime", True), - ("cpp_runtime", False), - ] - ) - def test_pre_allocated_outputs_unowned_outputs_multi_outputs( - self, _, use_python_runtime - ): + def test_pre_allocated_outputs_unowned_outputs_multi_outputs(self): class SampleModel(torch.nn.Module): def forward(self, x): y = torch.ops.aten.mul(x, 7) @@ -379,7 +329,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=use_python_runtime, torch_executed_ops={torch.ops.aten.add.Tensor}, )
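For reference on the API exercised in this file: enable_pre_allocated_outputs is the runtime context manager used verbatim in these tests. A minimal sketch, reusing this file's toy SampleModel and INPUT_SIZE; the compile call form follows the other runtime tests in this patch:

    import torch
    import torch_tensorrt as torchtrt

    class SampleModel(torch.nn.Module):
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return torch.softmax((x + 2) * 7, dim=0)

    inputs = [torch.randn(3, 16, 16, device="cuda")]  # INPUT_SIZE above
    optimized_model = torchtrt.compile(
        SampleModel().eval().cuda(),
        "dynamo",
        inputs,
        min_block_size=1,
    )

    # While the context is active, each TRT submodule keeps a
    # pre_allocated_outputs list and hands the next run's output buffers to
    # the engine ahead of time instead of allocating them during enqueue.
    with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
        out1 = optimized_model(*inputs)
        out2 = optimized_model(*inputs)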