From ef0662c02af7cf8e87c293f13b8643f4e2f4782b Mon Sep 17 00:00:00 2001 From: Torch-TensorRT Github Bot Date: Thu, 19 Mar 2026 21:16:59 +0000 Subject: [PATCH 1/4] docs: [Automated] Regenerating documentation for d97cb7a Signed-off-by: Torch-TensorRT Github Bot --- .../contributors/complex_number_support.rst | 5 +- docsrc/contributors/cuda_graphs.rst | 4 +- docsrc/debugging/troubleshooting.rst | 12 +- docsrc/py_api/runtime.rst | 16 + docsrc/tutorials/runtime_opt/index.rst | 4 +- .../tutorials/runtime_opt/python_runtime.rst | 152 ++-- examples/apps/flux_demo.py | 4 +- .../data_parallel_stable_diffusion.py | 26 +- .../tensor_parallel_simple_example.py | 24 +- examples/dynamo/autocast_example.py | 28 +- examples/dynamo/custom_kernel_plugins.py | 4 +- examples/dynamo/debugger_example.py | 2 - examples/dynamo/dynamic_memory_allocation.py | 1 - .../dynamo/engine_caching_bert_example.py | 1 - examples/dynamo/engine_caching_example.py | 81 +- examples/dynamo/low_cpu_memory_compilation.py | 2 - .../dynamo/mutable_torchtrt_module_example.py | 30 +- examples/dynamo/refit_engine_example.py | 21 +- .../dynamo/torch_compile_advanced_usage.py | 1 - py/torch_tensorrt/dynamo/_compiler.py | 22 +- py/torch_tensorrt/dynamo/_defaults.py | 1 - py/torch_tensorrt/dynamo/_refit.py | 50 +- py/torch_tensorrt/dynamo/_settings.py | 6 +- .../dynamo/conversion/_conversion.py | 22 +- .../runtime/_MutableTorchTensorRTModule.py | 20 +- .../dynamo/runtime/_PythonTRTEngine.py | 668 ++++++++++++++ .../runtime/_PythonTorchTensorRTModule.py | 813 ------------------ .../runtime/_RuntimeBackendSelection.py | 91 ++ .../dynamo/runtime/_TorchTensorRTModule.py | 462 +++++----- py/torch_tensorrt/dynamo/runtime/__init__.py | 3 - .../runtime/_serialized_engine_layout.py | 70 ++ .../runtime/meta_ops/register_meta_ops.py | 85 +- py/torch_tensorrt/dynamo/utils.py | 48 +- py/torch_tensorrt/runtime/__init__.py | 6 +- .../runtime/_output_allocator.py | 6 +- .../runtime/_pre_allocated_outputs.py | 6 +- 
py/torch_tensorrt/runtime/_utils.py | 17 +- .../runtime/_weight_streaming.py | 6 +- .../dynamo/backend/test_backend_compiler.py | 3 - .../dynamo/conversion/test_index_put_aten.py | 1 - .../test_distributed_simple_example.py | 1 - .../lowering/test_aten_lowering_passes.py | 2 +- tests/py/dynamo/models/test_autocast.py | 4 - tests/py/dynamo/models/test_dtype_support.py | 6 - tests/py/dynamo/models/test_engine_cache.py | 11 - tests/py/dynamo/models/test_model_refit.py | 323 ++++--- tests/py/dynamo/models/test_models.py | 24 +- .../dynamo/models/test_symint_scalar_input.py | 40 +- .../models/test_weight_stripped_engine.py | 31 +- .../test_000_resource_partitioning.py | 2 - .../test_001_resource_partitioning.py | 12 - .../dynamo/runtime/test_000_python_runtime.py | 3 - .../dynamo/runtime/test_002_cudagraphs_cpp.py | 5 - .../dynamo/runtime/test_002_cudagraphs_py.py | 6 - .../runtime/test_002_lazy_engine_init.py | 5 - tests/py/dynamo/runtime/test_003_safe_mode.py | 2 - .../runtime/test_004_weight_streaming.py | 128 +-- .../runtime/test_005_dynamic_allocation.py | 1 - tests/py/dynamo/runtime/test_empty_input.py | 45 +- .../runtime/test_mutable_torchtrt_module.py | 4 - .../dynamo/runtime/test_output_allocator.py | 186 ++-- .../runtime/test_pre_allocated_outputs.py | 148 ++-- 62 files changed, 1908 insertions(+), 1905 deletions(-) create mode 100644 py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py delete mode 100644 py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py create mode 100644 py/torch_tensorrt/dynamo/runtime/_RuntimeBackendSelection.py create mode 100644 py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py diff --git a/docsrc/contributors/complex_number_support.rst b/docsrc/contributors/complex_number_support.rst index c224c76d6d..3c8ef99050 100644 --- a/docsrc/contributors/complex_number_support.rst +++ b/docsrc/contributors/complex_number_support.rst @@ -128,9 +128,8 @@ runtime modules handle the conversion: * ``prepare_inputs`` 
(``dynamo/utils.py``) — builds the ``Input`` spec with the ``view_as_real`` shape/dtype but retains the original complex tensor in ``inp.torch_tensor`` for tracing. -* ``_PythonTorchTensorRTModule.forward`` — applies ``torch.view_as_real(i).contiguous()`` - for each complex input before feeding it to the engine. -* ``_TorchTensorRTModule.forward`` — same ``view_as_real`` conversion. +* ``TorchTensorRTModule.forward`` — applies ``torch.view_as_real(i).contiguous()`` + for each complex input before feeding tensors to ``execute_engine`` / ``execute_engine_python``. Key Implementation Invariants ------------------------------- diff --git a/docsrc/contributors/cuda_graphs.rst b/docsrc/contributors/cuda_graphs.rst index 08940fd8e2..6c2369c748 100644 --- a/docsrc/contributors/cuda_graphs.rst +++ b/docsrc/contributors/cuda_graphs.rst @@ -93,8 +93,8 @@ Subsequent inference launches the instantiated graph instead of calling Graph Storage ^^^^^^^^^^^^^ -Each runtime module (both C++ ``TorchTensorRTModule`` and Python -``PythonTorchTensorRTModule``) stores a ``cudaGraphExec_t`` instance. When +``TorchTensorRTModule`` (C++ or Python execution path) may record a CUDA graph for +engine execution when CUDA graphs are enabled at runtime. When ``use_cuda_graph=True`` is set at compile time the runtime records one graph per engine for the first input shape encountered. diff --git a/docsrc/debugging/troubleshooting.rst b/docsrc/debugging/troubleshooting.rst index 7f1c7d2889..0275ead82d 100644 --- a/docsrc/debugging/troubleshooting.rst +++ b/docsrc/debugging/troubleshooting.rst @@ -126,8 +126,10 @@ Runtime Errors the engine. Upgrade TRT or rebuild with ``version_compatible=True``. * The GPU compute capability is lower than on the build machine. Rebuild with ``hardware_compatible=True`` (requires Ampere or newer). - * The ``.ep`` file was generated with ``use_python_runtime=True`` which is not - serializable. Rebuild with the default C++ runtime. 
+ * The ``.ep`` export path does not support your compiled module layout (e.g. mixed + Python-runtime subgraphs in a specific exporter version). Try the default C++ path + at compile time or use ``torch_tensorrt`` module save/load APIs that preserve + ``TorchTensorRTModule`` state. **Shape mismatch at runtime / "Invalid input shape"** @@ -153,9 +155,9 @@ Runtime Errors The model contains data-dependent-shape ops (``nonzero``, ``unique``, ``masked_select``, etc.) which require TRT's output allocator. - * Use ``PythonTorchTensorRTModule`` (``use_python_runtime=True``) — it - activates the dynamic output allocator automatically via - ``requires_output_allocator=True``. + * Use :func:`~torch_tensorrt.runtime.set_runtime_backend` with ``"python"`` or use a module with + ``requires_output_allocator=True`` so the runtime can use TRT's output allocator + on the Python execution path when needed. * See :ref:`cuda_graphs` for ``DynamicOutputAllocator`` details. ---- diff --git a/docsrc/py_api/runtime.rst b/docsrc/py_api/runtime.rst index 719d8f6555..31046d1207 100644 --- a/docsrc/py_api/runtime.rst +++ b/docsrc/py_api/runtime.rst @@ -27,13 +27,29 @@ Functions .. autofunction:: enable_output_allocator +Runtime backend selection +------------------------- + +.. autofunction:: torch_tensorrt.runtime.get_runtime_backend + +.. autofunction:: torch_tensorrt.runtime.set_runtime_backend + Classes --------- .. autoclass:: TorchTensorRTModule :members: :special-members: __init__ + :show-inheritance: + + Single runtime module for TensorRT engines. Dispatches to the C++ or Python execution + implementation based on :func:`~torch_tensorrt.runtime.get_runtime_backend` / + :func:`~torch_tensorrt.runtime.set_runtime_backend`. See :ref:`python_runtime`. .. autoclass:: PythonTorchTensorRTModule :members: :special-members: __init__ + :show-inheritance: + + Subclass of ``TorchTensorRTModule`` that **pins** the Python engine path. 
Prefer + ``TorchTensorRTModule`` plus compile flags unless you need this guarantee. See :ref:`python_runtime`. diff --git a/docsrc/tutorials/runtime_opt/index.rst b/docsrc/tutorials/runtime_opt/index.rst index b4b3c9af8a..007bd1f645 100644 --- a/docsrc/tutorials/runtime_opt/index.rst +++ b/docsrc/tutorials/runtime_opt/index.rst @@ -2,7 +2,7 @@ Runtime Optimization ===================== Optimize inference throughput and latency: CUDA Graphs for kernel-replay, -pre-allocated output buffers, and the Python runtime module. +pre-allocated output buffers, and choosing the Python vs C++ TRT execution path. .. toctree:: :maxdepth: 1 @@ -10,4 +10,4 @@ pre-allocated output buffers, and the Python runtime module. cuda_graphs Example: Torch Export with Cudagraphs <../_rendered_examples/dynamo/torch_export_cudagraphs> Example: Pre-allocated output buffer <../_rendered_examples/dynamo/pre_allocated_output_example> - python_runtime + Python vs C++ runtime diff --git a/docsrc/tutorials/runtime_opt/python_runtime.rst b/docsrc/tutorials/runtime_opt/python_runtime.rst index 2c97b941d4..f07989971f 100644 --- a/docsrc/tutorials/runtime_opt/python_runtime.rst +++ b/docsrc/tutorials/runtime_opt/python_runtime.rst @@ -1,96 +1,103 @@ .. _python_runtime: -Python Runtime -============== +Python vs C++ runtime +===================== -Torch-TensorRT provides two runtime backends for executing compiled TRT engines -inside a PyTorch graph: +Torch-TensorRT uses a single module type, :class:`~torch_tensorrt.runtime.TorchTensorRTModule`, +to run TensorRT engines inside PyTorch. The **execution path** (which code actually drives +``execute_async``) is selected at runtime: -* **C++ runtime** (default) — ``TorchTensorRTModule`` backed by a C++ TorchBind class. - Fully serializable, supports CUDAGraphs, multi-device safe. -* **Python runtime** — ``PythonTorchTensorRTModule`` backed entirely by the TRT Python - API. Simpler to instrument for debugging but **not serializable** to - ``ExportedProgram``. 
+* **C++ path** — ``torch.classes.tensorrt.Engine`` and ``torch.ops.tensorrt.execute_engine``. + Preferred for production when the Torch-TensorRT C++ extension is available: TorchScript-friendly, + and integrates with the full C++ runtime stack. +* **Python path** — internal ``PythonTRTEngine`` plus + ``torch.ops.tensorrt.execute_engine_python``. Useful when the C++ extension is absent, or when + you want easier Python-level debugging and instrumentation. + +:class:`~torch_tensorrt.runtime.PythonTorchTensorRTModule` is a **thin subclass** of +``TorchTensorRTModule`` that **pins** the Python path (same constructor and behavior, but always +resolves to the Python engine). Prefer ``TorchTensorRTModule`` plus the global backend APIs below +when you do not need that pin. ---- -When to Use the Python Runtime --------------------------------- +When to use the Python path +--------------------------- -Use ``use_python_runtime=True`` when: +Use :func:`~torch_tensorrt.runtime.set_runtime_backend` (typically as a context manager) when: -* You need to run on a machine where the C++ Torch-TensorRT library is not installed - (e.g., a minimal CI container with only the Python wheel). -* You want to attach Python-level callbacks to the engine execution (via - :ref:`observer`) for debugging or profiling without building the C++ extension. -* You are debugging a conversion issue and want to step through TRT execution in Python. +* The C++ Torch-TensorRT library is not installed (e.g. a minimal environment with only the Python pieces). +* You want Python-level hooks (e.g. :ref:`observer`) without relying on the C++ extension. +* You are debugging conversion or execution and want to break inside the Python TRT wrapper. -Use the default C++ runtime in all other cases, especially: +Prefer the C++ path when: -* When saving a compiled module to disk (``torch_tensorrt.save()``). -* When using CUDAGraphs for low-latency inference. -* In production deployments. 
+* You rely on the default Torch-TensorRT deployment story and maximum parity with TorchScript export. +* You use whole-graph CUDAGraph wrappers that assume the C++ runtime (see :ref:`cuda_graphs`). ---- -Enabling the Python Runtime ------------------------------ +Enabling the Python path +------------------------ + +**Process-wide default (context manager)** .. code-block:: python - import torch_tensorrt + import torch_tensorrt as tt - trt_gm = torch_tensorrt.dynamo.compile( - exported_program, - arg_inputs=inputs, - use_python_runtime=True, - ) + with tt.runtime.set_runtime_backend("python"): + trt_gm = tt.dynamo.compile(exported_program, inputs) -Or via ``torch.compile``: +**``torch.compile``** (same context manager around compile / first run) .. code-block:: python - trt_model = torch.compile( - model, - backend="tensorrt", - options={"use_python_runtime": True}, - ) + import torch_tensorrt as tt ----- + with tt.runtime.set_runtime_backend("python"): + trt_model = torch.compile(model, backend="tensorrt", options={}) -Limitations ------------ +The context manager does **not** replace :class:`~torch_tensorrt.runtime.PythonTorchTensorRTModule`, +which always requests the Python path via a class-level pin. -* **Not serializable**: ``PythonTorchTensorRTModule`` cannot be saved via - ``torch_tensorrt.save()`` as an ``ExportedProgram`` or loaded back. The module is - Python-only in-process. +---- - .. code-block:: python +Serialization +--------------- - # This will raise an error with use_python_runtime=True: - torch_tensorrt.save(trt_gm, "model.ep", arg_inputs=inputs) +Module state records which backend was used (``runtime_backend`` in packed metadata). After load, +``TorchTensorRTModule`` reconstructs either the C++ engine or the Python engine wrapper +as appropriate. Some **export** workflows (e.g. certain ``ExportedProgram`` save paths) may still +assume a C++-only graph; validate your deployment path if you mix Python execution with AOT export. 
-* **No C++ deployment**: The compiled module cannot be exported to AOTInductor or used - in a C++ application without re-compiling with the C++ runtime. +---- -* **CUDAGraphs**: Whole-graph CUDAGraphs work with the Python runtime, but the - per-submodule CUDAGraph recording in ``CudaGraphsTorchTensorRTModule`` is - only available with the C++ runtime. +Limitations +----------- + +* **C++ deployment**: A module that executed on the Python path still needs TensorRT and the + Torch-TensorRT Python pieces available in-process unless you recompile targeting the C++ path. +* **CUDAGraphs**: Whole-graph CUDAGraph wrappers may assume the C++ runtime for some configurations; + see :ref:`cuda_graphs`. +* **Explicit allocator engines**: Engines with data-dependent outputs may set + ``requires_output_allocator=True``; the unified module supports the output-allocator execution + mode on the Python path. See :ref:`cuda_graphs` for interaction with CUDA graphs. ---- -``PythonTorchTensorRTModule`` Direct Instantiation ----------------------------------------------------- +``PythonTorchTensorRTModule`` direct instantiation +-------------------------------------------------- -You can instantiate ``PythonTorchTensorRTModule`` directly from raw engine bytes, -for example when integrating a TRT engine built outside of Torch-TensorRT: +You can instantiate :class:`~torch_tensorrt.runtime.PythonTorchTensorRTModule` from raw engine bytes +when you need a **guaranteed** Python execution path (e.g. integrating an engine built outside +Torch-TensorRT): .. 
code-block:: python from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule from torch_tensorrt.dynamo._settings import CompilationSettings - # Load raw engine bytes (e.g., from trtexec output or torch_tensorrt.dynamo.convert_*) with open("model.engine", "rb") as f: engine_bytes = f.read() @@ -104,37 +111,32 @@ for example when integrating a TRT engine built outside of Torch-TensorRT: output = module(torch.randn(1, 3, 224, 224).cuda()) -**Constructor arguments:** +**Constructor arguments** (same as ``TorchTensorRTModule``): ``serialized_engine`` (``bytes``) - The raw serialized TRT engine bytes. - -``input_binding_names`` (``List[str]``) - TRT input binding names in the order they are passed to ``forward()``. + Raw serialized TRT engine. -``output_binding_names`` (``List[str]``) - TRT output binding names in the order they should be returned. +``input_binding_names`` / ``output_binding_names`` (``List[str]``) + Binding names in ``forward`` order. ``name`` (``str``, optional) - Human-readable name for the module (used in logging). + Name for logging and serialization. -``settings`` (``CompilationSettings``, optional) - The compilation settings used to build the engine. Used to determine device - placement and other runtime behaviors. +``settings`` (:class:`~torch_tensorrt.dynamo._settings.CompilationSettings`, optional) + Device and runtime options (must match how the engine was built). ``weight_name_map`` (``dict``, optional) - Mapping of TRT weight names to PyTorch state dict names. Required for refit - support via :func:`~torch_tensorrt.dynamo.refit_module_weights`. + For refit workflows; see :func:`~torch_tensorrt.dynamo.refit_module_weights`. -``requires_output_allocator`` (``bool``, default ``False``) - Set to ``True`` if the engine contains data-dependent-shape ops (``nonzero``, - ``unique``, etc.) that require TRT's output allocator. 
+``requires_output_allocator`` (``bool``) + Set ``True`` for data-dependent-shape ops that need TRT's output allocator. ---- -Runtime Selection Logic ------------------------- +Runtime selection summary +------------------------- -When ``use_python_runtime`` is ``None`` (auto-select), Torch-TensorRT tries to import -the C++ TorchBind class. If the C++ extension is not available it silently falls back to -the Python runtime. Pass ``True`` or ``False`` to force a specific runtime. +* :func:`~torch_tensorrt.runtime.get_runtime_backend` / :func:`~torch_tensorrt.runtime.set_runtime_backend` + — process default for newly created ``TorchTensorRTModule`` instances (unless a subclass pins a backend). + Use ``set_runtime_backend`` as a context manager to scope C++ vs Python for compile and forward. +* If the C++ extension is **not** built, only the Python path is available. diff --git a/examples/apps/flux_demo.py b/examples/apps/flux_demo.py index e58787ff79..1edc3ad5e5 100644 --- a/examples/apps/flux_demo.py +++ b/examples/apps/flux_demo.py @@ -125,7 +125,6 @@ def forward_loop(mod): "enabled_precisions": enabled_precisions, "truncate_double": True, "min_block_size": 1, - "use_python_runtime": True, "immutable_weights": False, "offload_module_to_cpu": args.low_vram_mode, "use_explicit_typing": use_explicit_typing, @@ -136,7 +135,8 @@ def forward_loop(mod): remove_hook_from_module(pipe.transformer, recurse=True) pipe.transformer.to(DEVICE) - trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings) + with torch_tensorrt.runtime.set_runtime_backend("python"): + trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings) if dynamic_shapes: trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes) pipe.transformer = trt_gm diff --git a/examples/distributed_inference/data_parallel_stable_diffusion.py b/examples/distributed_inference/data_parallel_stable_diffusion.py index 023d7e8e63..9171677f13 100644 --- 
a/examples/distributed_inference/data_parallel_stable_diffusion.py +++ b/examples/distributed_inference/data_parallel_stable_diffusion.py @@ -31,19 +31,19 @@ backend = "torch_tensorrt" # Optimize the UNet portion with Torch-TensorRT -pipe.unet = torch.compile( # %% - # Inference - # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - # Assume there are 2 processes (2 devices) - pipe.unet, - backend=backend, - options={ - "truncate_long_and_double": True, - "precision": torch.float16, - "use_python_runtime": True, - }, - dynamic=False, -) +with torch_tensorrt.runtime.set_runtime_backend("python"): + pipe.unet = torch.compile( # %% + # Inference + # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + # Assume there are 2 processes (2 devices) + pipe.unet, + backend=backend, + options={ + "truncate_long_and_double": True, + "precision": torch.float16, + }, + dynamic=False, + ) torch_tensorrt.runtime.set_multi_device_safe_mode(True) diff --git a/examples/distributed_inference/tensor_parallel_simple_example.py b/examples/distributed_inference/tensor_parallel_simple_example.py index f2dc6861cb..e6c622c6fc 100755 --- a/examples/distributed_inference/tensor_parallel_simple_example.py +++ b/examples/distributed_inference/tensor_parallel_simple_example.py @@ -93,18 +93,18 @@ def forward(self, x): python_result = tp_model(inp) backend = "torch_tensorrt" -tp_model = torch.compile( - tp_model, - backend=backend, - options={ - "truncate_long_and_double": True, - "enabled_precisions": {torch.float32, torch.float16}, - "use_python_runtime": True, - "min_block_size": 1, - "use_distributed_mode_trace": True, - }, - dynamic=None, -) +with torch_tensorrt.runtime.set_runtime_backend("python"): + tp_model = torch.compile( + tp_model, + backend=backend, + options={ + "truncate_long_and_double": True, + "enabled_precisions": {torch.float32, torch.float16}, + "min_block_size": 1, + "use_distributed_mode_trace": True, + }, + dynamic=None, + ) # For TP, input needs to be same across all TP ranks. 
# Setting the random seed is to mimic the behavior of dataloader. diff --git a/examples/dynamo/autocast_example.py b/examples/dynamo/autocast_example.py index b2b5509c82..7f3db58f56 100644 --- a/examples/dynamo/autocast_example.py +++ b/examples/dynamo/autocast_example.py @@ -69,20 +69,20 @@ def forward(self, x): # ``autocast_max_output_threshold``, ``autocast_max_depth_of_reduction``, and ``autocast_calibration_dataloader``. Please refer to # the documentation for more details. -trt_autocast_mod = torch_tensorrt.compile( - ep.module(), - arg_inputs=inputs, - min_block_size=1, - use_python_runtime=True, - use_explicit_typing=True, - enable_autocast=True, - autocast_low_precision_type=torch.bfloat16, - autocast_excluded_nodes={"^conv1$", "relu"}, - autocast_excluded_ops={"torch.ops.aten.flatten.using_ints"}, - autocast_max_output_threshold=512, - autocast_max_depth_of_reduction=None, - autocast_calibration_dataloader=calibration_dataloader, -) +with torch_tensorrt.runtime.set_runtime_backend("python"): + trt_autocast_mod = torch_tensorrt.compile( + ep.module(), + arg_inputs=inputs, + min_block_size=1, + use_explicit_typing=True, + enable_autocast=True, + autocast_low_precision_type=torch.bfloat16, + autocast_excluded_nodes={"^conv1$", "relu"}, + autocast_excluded_ops={"torch.ops.aten.flatten.using_ints"}, + autocast_max_output_threshold=512, + autocast_max_depth_of_reduction=None, + autocast_calibration_dataloader=calibration_dataloader, + ) autocast_outs = trt_autocast_mod(*inputs) diff --git a/examples/dynamo/custom_kernel_plugins.py b/examples/dynamo/custom_kernel_plugins.py index e7711e2ecb..f79f05a3ea 100644 --- a/examples/dynamo/custom_kernel_plugins.py +++ b/examples/dynamo/custom_kernel_plugins.py @@ -277,7 +277,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Node: torch.ops.torchtrt_ex.triton_circular_pad.default, with layer location: __/triton_circular_pad # Note: Some of the above nodes may be supported, but were not included in a TRT graph by 
the partitioner # -# Compiled with: CompilationSettings(enabled_precisions={}, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=True, hardware_compatible=False) +# Compiled with: CompilationSettings(enabled_precisions={}, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=True, hardware_compatible=False) # # Graph Structure: # @@ -580,7 +580,7 @@ def circular_padding_converter( # # The graph consists of 2 Total Operators, of which 2 operators are supported, 100.0% coverage # -# Compiled with: CompilationSettings(enabled_precisions={}, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, 
dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False) +# Compiled with: CompilationSettings(enabled_precisions={}, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False) # # Graph Structure: # diff --git a/examples/dynamo/debugger_example.py b/examples/dynamo/debugger_example.py index deeee1b9da..f4320ee6ff 100644 --- a/examples/dynamo/debugger_example.py +++ b/examples/dynamo/debugger_example.py @@ -36,7 +36,6 @@ enabled_precisions = {torch.float} workspace_size = 20 << 30 min_block_size = 0 -use_python_runtime = False torch_executed_ops = {} with torch_trt.dynamo.Debugger( @@ -53,7 +52,6 @@ trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=use_python_runtime, enabled_precisions=enabled_precisions, min_block_size=min_block_size, torch_executed_ops=torch_executed_ops, diff --git a/examples/dynamo/dynamic_memory_allocation.py b/examples/dynamo/dynamic_memory_allocation.py index b7d22ad0b2..426e7b770c 100644 --- a/examples/dynamo/dynamic_memory_allocation.py +++ b/examples/dynamo/dynamic_memory_allocation.py @@ -46,7 +46,6 @@ settings = { "ir": "dynamo", - "use_python_runtime": False, "enabled_precisions": {torch.float32}, "immutable_weights": False, "lazy_engine_init": True, diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py index 6aa90302e3..97d3c0d78e 100644 --- 
a/examples/dynamo/engine_caching_bert_example.py +++ b/examples/dynamo/engine_caching_bert_example.py @@ -47,7 +47,6 @@ def compile_bert(iterations=3): start.record() compilation_kwargs = { - "use_python_runtime": False, "enabled_precisions": {torch.float}, "truncate_double": True, "min_block_size": 1, diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index 45bcd363ab..87e108d46f 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -40,7 +40,6 @@ model = models.resnet18(pretrained=True).to("cuda").eval() enabled_precisions = {torch.float} min_block_size = 1 -use_python_runtime = False def remove_timing_cache(path=TIMING_CACHE_PATH): @@ -88,20 +87,20 @@ def torch_compile(iterations=3): reuse_cached_engines = True start.record() - compiled_model = torch.compile( - model, - backend="tensorrt", - options={ - "use_python_runtime": True, - "enabled_precisions": enabled_precisions, - "min_block_size": min_block_size, - "immutable_weights": False, - "cache_built_engines": cache_built_engines, - "reuse_cached_engines": reuse_cached_engines, - }, - ) - with torch.no_grad(): - compiled_model(*inputs) # trigger the compilation + with torch_trt.runtime.set_runtime_backend("python"): + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "enabled_precisions": enabled_precisions, + "min_block_size": min_block_size, + "immutable_weights": False, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + }, + ) + with torch.no_grad(): + compiled_model(*inputs) # trigger the compilation end.record() torch.cuda.synchronize() times.append(start.elapsed_time(end)) @@ -149,17 +148,17 @@ def dynamo_compile(iterations=3): reuse_cached_engines = True start.record() - trt_gm = torch_trt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - 
min_block_size=min_block_size, - immutable_weights=False, - cache_built_engines=cache_built_engines, - reuse_cached_engines=reuse_cached_engines, - engine_cache_size=1 << 30, # 1GB - ) + with torch_trt.runtime.set_runtime_backend("cpp"): + trt_gm = torch_trt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + cache_built_engines=cache_built_engines, + reuse_cached_engines=reuse_cached_engines, + engine_cache_size=1 << 30, # 1GB + ) # output = trt_gm(*inputs) end.record() torch.cuda.synchronize() @@ -258,21 +257,21 @@ def torch_compile_my_cache(iterations=3): reuse_cached_engines = True start.record() - compiled_model = torch.compile( - model, - backend="tensorrt", - options={ - "use_python_runtime": True, - "enabled_precisions": enabled_precisions, - "min_block_size": min_block_size, - "immutable_weights": False, - "cache_built_engines": cache_built_engines, - "reuse_cached_engines": reuse_cached_engines, - "custom_engine_cache": engine_cache, - }, - ) - with torch.no_grad(): - compiled_model(*inputs) # trigger the compilation + with torch_trt.runtime.set_runtime_backend("python"): + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "enabled_precisions": enabled_precisions, + "min_block_size": min_block_size, + "immutable_weights": False, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "custom_engine_cache": engine_cache, + }, + ) + with torch.no_grad(): + compiled_model(*inputs) # trigger the compilation end.record() torch.cuda.synchronize() times.append(start.elapsed_time(end)) diff --git a/examples/dynamo/low_cpu_memory_compilation.py b/examples/dynamo/low_cpu_memory_compilation.py index 4780bdf208..2de315dca8 100644 --- a/examples/dynamo/low_cpu_memory_compilation.py +++ b/examples/dynamo/low_cpu_memory_compilation.py @@ -55,10 +55,8 @@ def forward(self, x): inputs = [torch.randn((1, 1024, 
224, 224)).to("cuda")] enabled_precisions = {torch.float} -use_python_runtime = False compilation_options = { - "use_python_runtime": use_python_runtime, "enabled_precisions": enabled_precisions, "min_block_size": 1, "immutable_weights": True, diff --git a/examples/dynamo/mutable_torchtrt_module_example.py b/examples/dynamo/mutable_torchtrt_module_example.py index 84abcddf44..b8f8a86e7f 100644 --- a/examples/dynamo/mutable_torchtrt_module_example.py +++ b/examples/dynamo/mutable_torchtrt_module_example.py @@ -32,13 +32,13 @@ # Initialize the Mutable Torch TensorRT Module with settings. # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ settings = { - "use_python_runtime": False, "enabled_precisions": {torch.float32}, "immutable_weights": False, } model = models.resnet18(pretrained=True).to("cuda").eval() -mutable_module = torch_trt.MutableTorchTensorRTModule(model, **settings) +with torch_trt.runtime.set_runtime_backend("cpp"): + mutable_module = torch_trt.MutableTorchTensorRTModule(model, **settings) # You can use the mutable module just like the original pytorch module. The compilation happens while you first call the mutable module. with torch.no_grad(): mutable_module(*inputs) @@ -67,7 +67,7 @@ # Saving Mutable Torch TensorRT Module # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# Currently, saving is only enabled when "use_python_runtime" = False in settings +# Saving requires a C++-runtime compiled graph (see MutableTorchTensorRTModule.save). 
torch_trt.MutableTorchTensorRTModule.save(mutable_module, "mutable_module.pkl") reload = torch_trt.MutableTorchTensorRTModule.load("mutable_module.pkl") @@ -78,7 +78,6 @@ with torch.no_grad(): settings = { - "use_python_runtime": True, "enabled_precisions": {torch.float16}, "immutable_weights": False, } @@ -93,7 +92,8 @@ pipe.to(device) # The only extra line you need - pipe.unet = torch_trt.MutableTorchTensorRTModule(pipe.unet, **settings) + with torch_trt.runtime.set_runtime_backend("python"): + pipe.unet = torch_trt.MutableTorchTensorRTModule(pipe.unet, **settings) BATCH = torch.export.Dim("BATCH", min=2, max=24) _HEIGHT = torch.export.Dim("_HEIGHT", min=16, max=32) _WIDTH = torch.export.Dim("_WIDTH", min=16, max=32) @@ -209,16 +209,16 @@ def forward(self, a, b, c={}): end = torch.cuda.Event(enable_timing=True) example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) -model = torch_trt.MutableTorchTensorRTModule( - model, - use_python_runtime=True, - enabled_precisions={torch.float}, - min_block_size=1, - immutable_weights=False, - cache_built_engines=True, - reuse_cached_engines=True, - engine_cache_size=1 << 30, # 1GB -) +with torch_trt.runtime.set_runtime_backend("python"): + model = torch_trt.MutableTorchTensorRTModule( + model, + enabled_precisions={torch.float}, + min_block_size=1, + immutable_weights=False, + cache_built_engines=True, + reuse_cached_engines=True, + engine_cache_size=1 << 30, # 1GB + ) def remove_timing_cache(path=TIMING_CACHE_PATH): diff --git a/examples/dynamo/refit_engine_example.py b/examples/dynamo/refit_engine_example.py index 26e17ff809..45d342ffa3 100644 --- a/examples/dynamo/refit_engine_example.py +++ b/examples/dynamo/refit_engine_example.py @@ -58,18 +58,17 @@ enabled_precisions = {torch.float} workspace_size = 20 << 30 min_block_size = 0 -use_python_runtime = False torch_executed_ops = {} -trt_gm = torch_trt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - 
enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - torch_executed_ops=torch_executed_ops, - immutable_weights=False, - reuse_cached_engines=False, -) # Output is a torch.fx.GraphModule +with torch_trt.runtime.set_runtime_backend("cpp"): + trt_gm = torch_trt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + torch_executed_ops=torch_executed_ops, + immutable_weights=False, + reuse_cached_engines=False, + ) # Output is a torch.fx.GraphModule # Save the graph module as an exported program torch_trt.save(trt_gm, "./compiled.ep", inputs=inputs) diff --git a/examples/dynamo/torch_compile_advanced_usage.py b/examples/dynamo/torch_compile_advanced_usage.py index b366c292d9..8d1051f54e 100644 --- a/examples/dynamo/torch_compile_advanced_usage.py +++ b/examples/dynamo/torch_compile_advanced_usage.py @@ -77,7 +77,6 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): "min_block_size": 2, "torch_executed_ops": {"torch.ops.aten.sub.Tensor"}, "optimization_level": 4, - "use_python_runtime": False, } # Run the model on an input to cause compilation, as so: diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index bc3cdc5721..846724ddef 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -43,6 +43,10 @@ from torch_tensorrt.dynamo.partitioning._resource_partitioner import ( resource_partition, ) +from torch_tensorrt.dynamo.runtime._RuntimeBackendSelection import ( + RuntimeBackend, + get_runtime_backend, +) from torch_tensorrt.dynamo.utils import ( deallocate_module, get_cpu_memory_usage, @@ -86,7 +90,7 @@ def cross_compile_for_windows( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME, + use_python_runtime: bool 
= False, # Does nothing. Kept for backward compatibility; use ``with torch_tensorrt.runtime.set_runtime_backend(backend=...):`` instead. use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -165,7 +169,7 @@ def cross_compile_for_windows( max_aux_stream (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization + use_python_runtime: (bool): Does nothing. Kept for backward compatibility; use ``with torch_tensorrt.runtime.set_runtime_backend(backend=...):`` instead. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. 
dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -319,7 +323,6 @@ def cross_compile_for_windows( "max_aux_streams": max_aux_streams, "version_compatible": version_compatible, "optimization_level": optimization_level, - "use_python_runtime": False, "truncate_double": truncate_double, "use_fast_partitioner": use_fast_partitioner, "num_avg_timing_iters": num_avg_timing_iters, @@ -353,7 +356,6 @@ def cross_compile_for_windows( # disable the following settings is not supported for cross compilation for windows feature unsupported_settings = ( - "use_python_runtime", "lazy_engine_init", "cache_built_engines", "reuse_cached_engines", @@ -432,7 +434,7 @@ def compile( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME, + use_python_runtime: bool = False, # Does nothing. Kept for backward compatibility; use ``with torch_tensorrt.runtime.set_runtime_backend(backend=...):`` instead. use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -526,7 +528,7 @@ def compile( max_aux_streams (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. 
Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization + use_python_runtime: (bool): Does nothing. Kept for backward compatibility; use ``with torch_tensorrt.runtime.set_runtime_backend(backend=...):`` instead. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -723,7 +725,6 @@ def compile( "max_aux_streams": max_aux_streams, "version_compatible": version_compatible, "optimization_level": optimization_level, - "use_python_runtime": use_python_runtime, "truncate_double": truncate_double, "use_fast_partitioner": use_fast_partitioner, "num_avg_timing_iters": num_avg_timing_iters, @@ -1053,7 +1054,7 @@ def preserve_module_specs( if _debugger_config: if _debugger_config.save_engine_profile: - if settings.use_python_runtime: + if get_runtime_backend() is RuntimeBackend.PYTHON: if _debugger_config.profile_format != "cudagraph": raise ValueError( "Profiling with TREX can only be enabled when using the C++ runtime. Python runtime profiling only support cudagraph visualization." 
@@ -1144,7 +1145,7 @@ def convert_exported_program_to_serialized_trt_engine( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME, + use_python_runtime: bool = False, # Does nothing. Kept for backward compatibility; use ``with torch_tensorrt.runtime.set_runtime_backend(backend=...):`` instead. use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -1219,7 +1220,7 @@ def convert_exported_program_to_serialized_trt_engine( max_aux_streams (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization + use_python_runtime: (bool): Does nothing. Kept for backward compatibility; use ``with torch_tensorrt.runtime.set_runtime_backend(backend=...):`` instead. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. 
Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -1382,7 +1383,6 @@ def convert_exported_program_to_serialized_trt_engine( "max_aux_streams": max_aux_streams, "version_compatible": version_compatible, "optimization_level": optimization_level, - "use_python_runtime": use_python_runtime, "truncate_double": truncate_double, "use_fast_partitioner": use_fast_partitioner, "num_avg_timing_iters": num_avg_timing_iters, diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 2e838cd28c..598a8f04ac 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -23,7 +23,6 @@ OPTIMIZATION_LEVEL = None SPARSE_WEIGHTS = False TRUNCATE_DOUBLE = False -USE_PYTHON_RUNTIME = False USE_FAST_PARTITIONER = True ENABLE_EXPERIMENTAL_DECOMPOSITIONS = False REQUIRE_FULL_COMPILATION = False diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py index 0b6af849fa..1eed6edc80 100644 --- a/py/torch_tensorrt/dynamo/_refit.py +++ b/py/torch_tensorrt/dynamo/_refit.py @@ -32,14 +32,12 @@ post_lowering, pre_export_lowering, ) -from torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule import ( - PythonTorchTensorRTModule, -) -from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ( +from torch_tensorrt.dynamo.runtime._PythonTRTEngine import PythonTRTEngine +from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ( ENGINE_IDX, SERIALIZED_METADATA_IDX, - TorchTensorRTModule, ) +from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import TorchTensorRTModule from torch_tensorrt.dynamo.utils import ( 
CPU_DEVICE, check_module_output, @@ -282,11 +280,7 @@ def refit_module_weights( if ( not isinstance( submodule, - ( - PythonTorchTensorRTModule, - TorchTensorRTModule, - torch.nn.modules.module.Module, - ), + (TorchTensorRTModule, torch.nn.modules.module.Module), ) or "_run_on_gpu" in name ): @@ -481,9 +475,13 @@ def refit_module_weights( except AttributeError: if isinstance(compiled_submodule, torch.nn.Module): # Torch retrace module - assert ( - not settings.use_python_runtime - ), "Refitting a torch retraced module is only supported with use_python_runtime=False" + assert not isinstance( + compiled_submodule.engine, + PythonTRTEngine, + ), ( + "Refitting a torch retraced module is only supported when " + "the engine uses the C++ Torch-TensorRT runtime" + ) encoded_metadata = [ engine for name, engine in compiled_submodules @@ -505,10 +503,10 @@ def refit_module_weights( "This engine does not have a weight map cache. Rebuilding the weight map" ) - # Rexporting the TRT compiled graph module and loading it back doesn't preserve the instance type and registers - # the compiled submodule as torch.nn.Module. So we use settings.use_python_runtime to determine the instance type. - if settings.use_python_runtime: - engine = compiled_submodule.engine + # Rexporting the TRT compiled graph module and loading it back doesn't preserve + # the instance type; choose the engine handle based on the actual engine object. 
+ if isinstance(compiled_submodule.engine, PythonTRTEngine): + engine = compiled_submodule.engine.cuda_engine else: engine_info = compiled_submodule.engine.__getstate__()[0] engine = get_engine_from_encoded_engine( @@ -564,12 +562,18 @@ def refit_module_weights( serialization_config.set_flag(trt.SerializationFlag.INCLUDE_REFIT) serialized_engine = engine.serialize_with_config(serialization_config) - if isinstance(compiled_submodule, PythonTorchTensorRTModule): - compiled_submodule.serialized_engine = bytes(serialized_engine) - elif isinstance(compiled_submodule, TorchTensorRTModule): - compiled_submodule.engine = None # Clear the engine for TorchTensorRTModule, otherwise it won't be updated - compiled_submodule.serialized_engine = bytes(serialized_engine) - compiled_submodule.setup_engine() + if isinstance(compiled_submodule, TorchTensorRTModule): + new_serialized_engine = bytes(serialized_engine) + compiled_submodule.serialized_engine = new_serialized_engine + if compiled_submodule._is_python_runtime: + # Refit already updated ``cuda_engine`` in place; avoid deserialize (slow). 
+ py_eng = compiled_submodule.engine + if isinstance(py_eng, PythonTRTEngine): + py_eng.serialized_info[ENGINE_IDX] = new_serialized_engine + py_eng.serialized_engine = new_serialized_engine + else: + compiled_submodule._cleanup_engine() + compiled_submodule.setup_engine() elif inline_module: new_engine_info = list(engine_info) new_engine_info[ENGINE_IDX] = bytes(serialized_engine) diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index e3f2f1bc37..695e124610 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -52,7 +52,6 @@ USE_EXPLICIT_TYPING, USE_FAST_PARTITIONER, USE_FP32_ACC, - USE_PYTHON_RUNTIME, VERSION_COMPATIBLE, WORKSPACE_SIZE, default_device, @@ -74,9 +73,6 @@ class CompilationSettings: version_compatible (bool): Provide version forward-compatibility for engine plan files optimization_level (Optional[int]): Builder optimization 0-5, higher levels imply longer build time, searching for more optimization options. TRT defaults to 3 - use_python_runtime (Optional[bool]): Whether to strictly use Python runtime or C++ runtime. 
To auto-select a runtime - based on C++ dependency presence (preferentially choosing C++ runtime if available), leave the - argument as None truncate_double (bool): Whether to truncate float64 TRT engine inputs or weights to float32 use_fast_partitioner (bool): Whether to use the fast or global graph partitioning system enable_experimental_decompositions (bool): Whether to enable all core aten decompositions @@ -130,7 +126,6 @@ class CompilationSettings: max_aux_streams: Optional[int] = MAX_AUX_STREAMS version_compatible: bool = VERSION_COMPATIBLE optimization_level: Optional[int] = OPTIMIZATION_LEVEL - use_python_runtime: Optional[bool] = USE_PYTHON_RUNTIME truncate_double: bool = TRUNCATE_DOUBLE use_fast_partitioner: bool = USE_FAST_PARTITIONER enable_experimental_decompositions: bool = ENABLE_EXPERIMENTAL_DECOMPOSITIONS @@ -194,6 +189,7 @@ def __getstate__(self) -> dict[str, Any]: return state def __setstate__(self, state: dict[str, Any]) -> None: + state.pop("use_python_runtime", None) self.__dict__.update(state) diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index e47d3f404f..d6628228fe 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -4,6 +4,7 @@ import logging from typing import Any, Dict, List, NamedTuple, Optional, Sequence +import tensorrt as trt import torch from torch_tensorrt._enums import dtype from torch_tensorrt._features import ENABLED_FEATURES @@ -17,7 +18,7 @@ TRTInterpreter, TRTInterpreterResult, ) -from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule +from torch_tensorrt.dynamo.runtime import TorchTensorRTModule from torch_tensorrt.dynamo.utils import ( get_cpu_memory_usage, get_output_dtypes, @@ -25,8 +26,6 @@ ) from torch_tensorrt.logging import TRT_LOGGER -import tensorrt as trt - logger = logging.getLogger(__name__) @@ -329,7 +328,7 @@ def convert_module( 
settings: CompilationSettings = CompilationSettings(), name: str = "", engine_cache: Optional[BaseEngineCache] = None, -) -> PythonTorchTensorRTModule | TorchTensorRTModule: +) -> TorchTensorRTModule: """Convert an FX module to a TRT module Args: module: FX GraphModule to convert @@ -338,27 +337,18 @@ def convert_module( name: TRT engine name engine_cache: Engine cache instance Returns: - PythonTorchTensorRTModule or TorchTensorRTModule + TorchTensorRTModule """ serialized_interpreter_result = interpret_module_to_result( module, inputs, settings, engine_cache=engine_cache ) - rt_cls = PythonTorchTensorRTModule - - if ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime: - from torch_tensorrt.dynamo.runtime import TorchTensorRTModule - - rt_cls = TorchTensorRTModule - - elif ( - not ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime - ): + if not ENABLED_FEATURES.torch_tensorrt_runtime: logger.info( "Since Torch-TensorRT runtime is not available, using Python Runtime, some features may not be available" ) - return rt_cls( + return TorchTensorRTModule( serialized_engine=serialized_interpreter_result.serialized_engine, input_binding_names=list(serialized_interpreter_result.input_names), output_binding_names=list(serialized_interpreter_result.output_names), diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py index d3ef7e0a41..374c02cc8c 100644 --- a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py @@ -14,6 +14,8 @@ from torch_tensorrt.dynamo import _defaults from torch_tensorrt.dynamo._compiler import compile as dynamo_compile from torch_tensorrt.dynamo._refit import refit_module_weights +from torch_tensorrt.dynamo.runtime._RuntimeBackendSelection import RuntimeBackend +from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import 
TorchTensorRTModule from torch_tensorrt.dynamo.utils import ( check_output_equal, deallocate_module, @@ -65,7 +67,6 @@ def __init__( pytorch_model: torch.nn.Module, *, device: Optional[Union[Device, torch.device, str]] = _defaults.DEVICE, - use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME, immutable_weights: bool = False, strict: bool = True, prefer_deferred_runtime_asserts_over_guards: bool = False, @@ -105,7 +106,6 @@ def __init__( max_aux_stream (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. 
dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -143,7 +143,6 @@ def __init__( self.prefer_deferred_runtime_asserts_over_guards = ( prefer_deferred_runtime_asserts_over_guards ) - self.use_python_runtime = use_python_runtime self.trt_device = to_torch_tensorrt_device(device) assert ( not immutable_weights @@ -367,7 +366,6 @@ def compile(self) -> None: arg_inputs=self.arg_inputs, kwarg_inputs=self.kwarg_inputs, immutable_weights=False, - use_python_runtime=self.use_python_runtime, enabled_precisions=self.enabled_precisions, **self.additional_settings, ) @@ -699,11 +697,21 @@ def resursivly_deserialize_dynamic_shape(obj: Any) -> None: resursivly_deserialize_dynamic_shape(self.arg_dynamic_shapes) resursivly_deserialize_dynamic_shape(self.kwarg_dynamic_shapes) + @staticmethod + def _compiled_graph_uses_python_runtime(gm: Any) -> bool: + for m in gm.modules(): + if ( + isinstance(m, TorchTensorRTModule) + and m._runtime_backend is RuntimeBackend.PYTHON + ): + return True + return False + @staticmethod def save(module: Any, path: str) -> None: # Cast the object back to MutableTorchTensorRTModule to save - assert ( - not module.use_python_runtime + assert not MutableTorchTensorRTModule._compiled_graph_uses_python_runtime( + module.gm ), "Python runtime does not support serialization. Save failed." module.init_finished = False module.__class__ = MutableTorchTensorRTModule diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py b/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py new file mode 100644 index 0000000000..5d53e0d394 --- /dev/null +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py @@ -0,0 +1,668 @@ +"""Python-side TensorRT engine: deserialize, execute, and drive ``execute_engine_python``. + +Serialization layout lives in :mod:`torch_tensorrt.dynamo.runtime._serialized_engine_layout`. 
+The engine is passed into ``tensorrt::execute_engine_python`` as an opaque reference (see +``register_opaque_type``), analogous to ``tensorrt::Engine`` for the C++ ``execute_engine`` op. +""" + +from __future__ import annotations + +import base64 +import copy +import logging +import pickle +import tempfile +from contextlib import nullcontext +from types import SimpleNamespace +from typing import Any, ContextManager, Dict, List, Optional, Sequence, Tuple, cast + +import tensorrt as trt +import torch +import torch_tensorrt +from torch._library.opaque_object import register_opaque_type +from torch_tensorrt._enums import dtype +from torch_tensorrt.dynamo._defaults import DEBUG_LOGGING_DIR +from torch_tensorrt.dynamo._settings import CompilationSettings +from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ( + ABI_TARGET_IDX, + DEVICE_IDX, + ENGINE_IDX, + HW_COMPATIBLE_IDX, + INPUT_BINDING_NAMES_IDX, + NAME_IDX, + OUTPUT_BINDING_NAMES_IDX, + REQUIRES_OUTPUT_ALLOCATOR_IDX, + RESOURCE_ALLOCATION_STRATEGY_IDX, + SERIALIZATION_LEN, + SERIALIZED_METADATA_IDX, + TARGET_PLATFORM_IDX, + SerializedTensorRTEngineFmt, + deserialize_binding_names, + parse_device_info, +) +from torch_tensorrt.logging import TRT_LOGGER +from torch_tensorrt.runtime._utils import ( + _is_switch_required, + _select_rt_device, + multi_gpu_device_check, +) + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# TRT I/O helpers +# --------------------------------------------------------------------------- + + +class DynamicOutputAllocator(trt.IOutputAllocator): # type: ignore[misc] + def __init__(self, output_dtypes: Dict[str, torch.dtype]) -> None: + trt.IOutputAllocator.__init__(self) + self.buffers: Dict[str, torch.Tensor] = {} + self.shapes: Dict[str, Tuple[int, ...]] = {} + self.dtypes: Dict[str, torch.dtype] = output_dtypes + + def reallocate_output_async( + self, + tensor_name: str, + memory: int, + size: int, + 
alignment: int, + stream: torch.cuda.Stream, + ) -> Any: + shape = (size,) + if tensor_name not in self.buffers or self.buffers[tensor_name].shape != shape: + self.buffers[tensor_name] = torch.empty( + shape, + dtype=self.dtypes[tensor_name], + device=torch.cuda.current_device(), + ) + return self.buffers[tensor_name].data_ptr() + + def notify_shape(self, tensor_name: str, shape: Tuple[int, ...]) -> None: + self.shapes[tensor_name] = tuple(shape) + + +class TorchTRTRuntimeStates: + """Tracks CUDA graph / pre-allocated-output state across invocations.""" + + def __init__(self, new_cudagraphs: bool): + self.old_cudagraphs = new_cudagraphs + self.old_pre_allocated_outputs = False + self.context_changed = False + + def set_runtime_states( + self, + new_cudagraphs: bool, + new_pre_allocated_output: bool, + shape_changed: bool, + ) -> Tuple[bool, bool, bool]: + need_cudagraphs_record = False + can_use_pre_allocated_outputs = False + need_cudagraphs_reset = False + + if new_cudagraphs and ( + not self.old_cudagraphs or shape_changed or self.context_changed + ): + need_cudagraphs_record = True + + if ( + self.old_pre_allocated_outputs + and new_pre_allocated_output + and (not shape_changed) + ): + can_use_pre_allocated_outputs = True + + if not new_cudagraphs or shape_changed or self.context_changed: + need_cudagraphs_reset = True + + self.old_cudagraphs = new_cudagraphs + self.old_pre_allocated_outputs = new_pre_allocated_output + self.context_changed = False + + return ( + need_cudagraphs_record, + can_use_pre_allocated_outputs, + need_cudagraphs_reset, + ) + + +# --------------------------------------------------------------------------- +# PythonTRTEngine +# --------------------------------------------------------------------------- + + +class PythonTRTEngine: + """TensorRT engine + execution context, driven from Python TRT APIs.""" + + # --- construction / teardown --- + + def __init__( + self, + serialized_info: SerializedTensorRTEngineFmt, + *, + profile_execution: 
bool = False, + ) -> None: + self._profile_execution = profile_execution + self.profile_path_prefix = tempfile.gettempdir() + self.use_pre_allocated_outputs = False + self.use_output_allocator_outputs = False + self.output_tensors_are_unowned = False + self.output_allocator: Optional[DynamicOutputAllocator] = None + self.pre_allocated_outputs: List[torch.Tensor] = [] + self._input_buffers: List[torch.Tensor] = [] + self._output_buffers: List[torch.Tensor] = [] + self._caller_stream: Optional[torch.cuda.Stream] = None + self._engine_stream: Optional[torch.cuda.Stream] = None + self.cudagraph: Optional[torch.cuda.CUDAGraph] = None + self.shape_key: Optional[str] = None + self._empty_tensor_placeholder: Optional[torch.Tensor] = None + self._dynamic_workspace: Optional[torch.Tensor] = None + self.runtime_states = TorchTRTRuntimeStates( + torch_tensorrt.runtime.get_cudagraphs_mode() + ) + self.resource_allocation_strategy = 0 + self._runtime_config = None + + self._load_serialized_info(serialized_info) + self._setup_engine() + + def __deepcopy__(self, memo: dict[int, Any]) -> PythonTRTEngine: + """Rebuild from serialized layout so ``copy.deepcopy`` skips unpickleable TRT handles.""" + if id(self) in memo: + return memo[id(self)] # type: ignore + serialized_copy = copy.deepcopy(self.serialized_info, memo) + dup = type(self)(serialized_copy, profile_execution=self._profile_execution) + memo[id(self)] = dup + return dup + + def tracing_mode(self) -> str: + """Return ``"real"`` so FakeTensor/export pass the real engine into meta kernels. + + Mirrors TorchBind ``tracing_with_real`` behavior (see + :func:`torch._library.fake_class_registry.maybe_to_fake_obj`). 
+ """ + + return "real" + + def _load_serialized_info( + self, serialized_info: SerializedTensorRTEngineFmt + ) -> None: + if len(serialized_info) != SERIALIZATION_LEN: + raise RuntimeError( + f"Expected serialized info length {SERIALIZATION_LEN}, got {len(serialized_info)}" + ) + + self.serialized_info: SerializedTensorRTEngineFmt = list(serialized_info) + self.version = str(self.serialized_info[ABI_TARGET_IDX]) + self.name = str(self.serialized_info[NAME_IDX]).replace(".", "_") + self.serialized_device_info = str(self.serialized_info[DEVICE_IDX]) + self.serialized_engine = self.serialized_info[ENGINE_IDX] + if not isinstance(self.serialized_engine, (bytes, bytearray)): + raise TypeError("Expected serialized engine as bytes") + + self.in_binding_names = deserialize_binding_names( + str(self.serialized_info[INPUT_BINDING_NAMES_IDX]) + ) + self.out_binding_names = deserialize_binding_names( + str(self.serialized_info[OUTPUT_BINDING_NAMES_IDX]) + ) + self.hardware_compatible = bool(int(self.serialized_info[HW_COMPATIBLE_IDX])) + self.serialized_metadata = str(self.serialized_info[SERIALIZED_METADATA_IDX]) + self.serialized_target_platform = str(self.serialized_info[TARGET_PLATFORM_IDX]) + self.requires_output_allocator = bool( + int(self.serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]) + ) + self.resource_allocation_strategy = int( + self.serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX] + ) + + metadata = self.decode_metadata(self.serialized_metadata) + self.settings = metadata.get("settings", CompilationSettings()) + self.weight_name_map = metadata.get("weight_name_map") + self.symbolic_shape_expressions = metadata.get("inout_symexprs") + self.output_tensors_are_unowned = metadata.get( + "output_tensors_are_unowned", False + ) + + device_info = parse_device_info(self.serialized_device_info) + self.target_device_id = device_info["id"] + # Serialized major/minor/name only — not ``_CudaDeviceProperties`` — so deepcopy/refit + # can copy the owning ``GraphModule`` 
without pickle errors. + self.target_device_properties = SimpleNamespace( + major=device_info["major"], + minor=device_info["minor"], + name=device_info["name"], + ) + + @staticmethod + def decode_metadata(encoded_metadata: str) -> Any: + dumped_metadata = base64.b64decode(encoded_metadata.encode("utf-8")) + return pickle.loads(dumped_metadata) + + def get_serialized_metadata(self) -> str: + return self.serialized_metadata + + def close(self) -> None: + self.reset_captured_graph() + + def _create_execution_context(self) -> trt.IExecutionContext: + strategy = trt.ExecutionContextAllocationStrategy.STATIC + if self.resource_allocation_strategy: + strategy = trt.ExecutionContextAllocationStrategy.USER_MANAGED + context = self.cuda_engine.create_execution_context(strategy) + assert context is not None, "Failed to create execution context" + return context + + def _setup_engine(self) -> None: + multi_gpu_device_check() + self.runtime = trt.Runtime(TRT_LOGGER) + self.cuda_engine = self.runtime.deserialize_cuda_engine(self.serialized_engine) + assert self.cuda_engine is not None, "Failed to deserialize TensorRT engine" + + if self.cuda_engine.streamable_weights_size > 0: + budget_bytes = self.cuda_engine.get_weight_streaming_automatic_budget() + logger.debug(f"Weight streaming budget set to {budget_bytes}B") + self.cuda_engine.weight_streaming_budget_v2 = budget_bytes + + self.context = self._create_execution_context() + + if not self.in_binding_names and not self.out_binding_names: + input_names: List[str] = [] + output_names: List[str] = [] + for idx in range(self.cuda_engine.num_io_tensors): + bind_name = self.cuda_engine.get_tensor_name(idx) + if ( + self.cuda_engine.get_tensor_mode(bind_name) + == trt.TensorIOMode.INPUT + ): + input_names.append(bind_name) + else: + output_names.append(bind_name) + self.in_binding_names = input_names + self.out_binding_names = output_names + + self._input_buffers = [None] * len(self.in_binding_names) + self._output_buffers = [None] * 
len(self.out_binding_names) + self.input_dtypes = [ + dtype._from(self.cuda_engine.get_tensor_dtype(input_name)).to(torch.dtype) + for input_name in self.in_binding_names + ] + self.output_dtypes = [ + dtype._from(self.cuda_engine.get_tensor_dtype(output_name)).to(torch.dtype) + for output_name in self.out_binding_names + ] + self.output_shapes = [ + self.cuda_engine.get_tensor_shape(output_name) + for output_name in self.out_binding_names + ] + self.is_shape_inference_io = { + input_name: self.cuda_engine.is_shape_inference_io(input_name) + for input_name in self.in_binding_names + } + if self.requires_output_allocator: + self.create_output_allocator() + + # --- weight streaming (mirrors C++ engine surface) --- + + @property + def streamable_device_memory_budget(self) -> Any: + return self.cuda_engine.streamable_weights_size + + @property + def automatic_device_memory_budget(self) -> Any: + return self.cuda_engine.get_weight_streaming_automatic_budget() + + @property + def device_memory_budget(self) -> Any: + return self.cuda_engine.weight_streaming_budget_v2 + + @device_memory_budget.setter + def device_memory_budget(self, budget_bytes: int) -> None: + if budget_bytes < 0: + budget_bytes = self.streamable_device_memory_budget + self.cuda_engine.weight_streaming_budget_v2 = budget_bytes + if self.cuda_engine.weight_streaming_budget_v2 != budget_bytes: + logger.error(f"Failed to set weight streaming budget to {budget_bytes}") + self.context = self._create_execution_context() + self.runtime_states.context_changed = True + + def reset_captured_graph(self) -> None: + if self.cudagraph: + self.cudagraph.reset() + self.cudagraph = None + + def use_dynamically_allocated_resources(self, dynamic: bool = False) -> None: + new_strategy = 1 if dynamic else 0 + if self.resource_allocation_strategy == new_strategy: + return + self.resource_allocation_strategy = new_strategy + self.context = self._create_execution_context() + self.runtime_states.context_changed = True + + def 
set_output_tensors_as_unowned(self, enabled: bool) -> None: + self.output_tensors_are_unowned = enabled + + def are_output_tensors_unowned(self) -> bool: + return self.output_tensors_are_unowned + + # --- profiling / inspection --- + + def enable_profiling(self) -> None: + if not self.context.profiler: + self.context.profiler = trt.Profiler() + self._profile_execution = True + + def set_profile_format(self, profile_format: str) -> None: + if profile_format not in ["cudagraph", "trex", "perfetto"]: + raise ValueError(f"Invalid profile format: {profile_format}") + + def disable_profiling(self) -> None: + torch.cuda.synchronize() + self.context = self._create_execution_context() + self._profile_execution = False + self.runtime_states.context_changed = True + + def get_engine_layer_info(self) -> str: + inspector = self.cuda_engine.create_engine_inspector() + return str(inspector.get_engine_information(trt.LayerInformationFormat.JSON)) + + def dump_engine_layer_info(self) -> None: + print(self.get_engine_layer_info()) + + # --- tensor binding helpers --- + + def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: + new_shape_key = "".join(str(tuple(t.shape)).replace(" ", "") for t in inputs) + if new_shape_key != self.shape_key: + logger.debug(f"Input shape changed {self.shape_key} -> {new_shape_key}") + self.shape_key = new_shape_key + return True + return False + + def create_output_allocator(self) -> None: + if self.output_allocator is None: + self.output_allocator = DynamicOutputAllocator( + { + name: self.output_dtypes[idx] + for idx, name in enumerate(self.out_binding_names) + } + ) + + def create_output_tensors(self) -> List[torch.Tensor]: + return [ + torch.empty( + size=self.output_shapes[idx], + dtype=self.output_dtypes[idx], + device=torch.cuda.current_device(), + ) + for idx, _ in enumerate(self.out_binding_names) + ] + + def setup_input_tensors( + self, + contiguous_inputs: List[torch.Tensor], + cudagraphs_enabled: bool, + 
need_cudagraphs_record: bool, + ) -> None: + for i, input_name in enumerate(self.in_binding_names): + if not contiguous_inputs[i].is_cuda: + logger.warning( + f"Detected input {input_name} of engine {self.name} is not on a cuda device. " + "This tensor is being moved by the runtime but for performance considerations, " + "ensure your inputs are all on GPU and open an issue here " + "(https://github.com/pytorch/TensorRT/issues) if this warning persists." + ) + contiguous_inputs[i] = contiguous_inputs[i].cuda() + + assert ( + contiguous_inputs[i].dtype == self.input_dtypes[i] + ), f"Dtype mismatch for input {input_name}. Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}." + + if need_cudagraphs_record: + self._input_buffers[i] = contiguous_inputs[i].clone() + + if self.is_shape_inference_io[input_name]: + inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy() + self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data) + else: + self.context.set_input_shape( + input_name, tuple(contiguous_inputs[i].shape) + ) + tensor_to_bind = contiguous_inputs[i] + if tensor_to_bind.numel() == 0: + if self._empty_tensor_placeholder is None: + self._empty_tensor_placeholder = torch.empty( + 1, + dtype=tensor_to_bind.dtype, + device=torch.cuda.current_device(), + ) + tensor_to_bind = self._empty_tensor_placeholder + + if cudagraphs_enabled: + self._input_buffers[i].copy_(contiguous_inputs[i]) + self.context.set_tensor_address( + input_name, self._input_buffers[i].data_ptr() + ) + else: + self.context.set_tensor_address( + input_name, tensor_to_bind.data_ptr() + ) + + def _profile_section(self, label: str) -> ContextManager[None]: + if self._profile_execution: + return cast( + ContextManager[None], + torch.autograd.profiler.record_function(label), + ) + return nullcontext() + + # --- execution --- + + def _execute_standard( + self, contiguous_inputs: List[torch.Tensor] + ) -> torch.Tensor | Tuple[torch.Tensor, ...]: + shape_changed = 
self.validate_input_shapes(contiguous_inputs) + ( + need_cudagraphs_record, + can_use_pre_allocated_outputs, + need_cudagraphs_reset, + ) = self.runtime_states.set_runtime_states( + torch_tensorrt.runtime.get_cudagraphs_mode(), + self.use_pre_allocated_outputs, + shape_changed, + ) + + if need_cudagraphs_reset: + self.reset_captured_graph() + + if need_cudagraphs_record: + self._input_buffers = [None] * len(self.in_binding_names) + self._output_buffers = [None] * len(self.out_binding_names) + + with self._profile_section("PythonTRTEngine:ProcessInputs"): + self.setup_input_tensors( + contiguous_inputs, + torch_tensorrt.runtime.get_cudagraphs_mode(), + need_cudagraphs_record, + ) + if shape_changed: + uninferred_input_names = self.context.infer_shapes() + if uninferred_input_names: + logger.warning( + f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior." + ) + + with self._profile_section("PythonTRTEngine:ProcessOutputs"): + if can_use_pre_allocated_outputs: + outputs = self.pre_allocated_outputs + else: + self.output_shapes = [ + tuple(self.context.get_tensor_shape(output_name)) + for output_name in self.out_binding_names + ] + if any(-1 in shape for shape in self.output_shapes): + raise ValueError( + "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported." 
+ ) + outputs = self.create_output_tensors() + + for o, output_name in enumerate(self.out_binding_names): + if need_cudagraphs_record: + self._output_buffers[o] = outputs[o].clone() + if torch_tensorrt.runtime.get_cudagraphs_mode(): + self.context.set_tensor_address( + output_name, self._output_buffers[o].data_ptr() + ) + else: + self.context.set_tensor_address(output_name, outputs[o].data_ptr()) + + with self._profile_section("PythonTRTEngine:TensorRTRuntime"): + self._caller_stream = torch.cuda.current_stream() + if ( + self._engine_stream == torch.cuda.default_stream() + or self._engine_stream is None + ): + self._engine_stream = torch.cuda.Stream() + + self._engine_stream.wait_stream(self._caller_stream) + with torch.cuda.stream(self._engine_stream): + if self.resource_allocation_strategy: + self._dynamic_workspace = torch.empty( + self.cuda_engine.device_memory_size_v2, + dtype=torch.uint8, + device=torch.cuda.current_device(), + ) + self.context.set_device_memory(self._dynamic_workspace.data_ptr()) + + if torch_tensorrt.runtime.get_cudagraphs_mode(): + if need_cudagraphs_record: + self.cudagraph = torch.cuda.CUDAGraph() + if self._profile_execution: + self.cudagraph.enable_debug_mode() + with torch.cuda.graph( + self.cudagraph, stream=self._engine_stream + ): + self.context.execute_async_v3( + self._engine_stream.cuda_stream + ) + if self._profile_execution: + self.cudagraph.debug_dump( + f"{DEBUG_LOGGING_DIR}/{self.name}_cudagraph.dot" + ) + self.cudagraph.replay() # type: ignore[union-attr] + else: + self.context.execute_async_v3(self._engine_stream.cuda_stream) + + self._caller_stream.wait_stream(self._engine_stream) + + if self.use_pre_allocated_outputs and ( + self.output_tensors_are_unowned + or not self.pre_allocated_outputs + or shape_changed + ): + self.pre_allocated_outputs = self.create_output_tensors() + + if torch_tensorrt.runtime.get_cudagraphs_mode(): + for idx, output in enumerate(outputs): + output.copy_(self._output_buffers[idx]) + + if 
len(outputs) == 1: + return outputs[0] + return tuple(outputs) + + def _execute_output_allocator( + self, contiguous_inputs: List[torch.Tensor] + ) -> torch.Tensor | Tuple[torch.Tensor, ...]: + if torch_tensorrt.runtime.get_cudagraphs_mode(): + raise RuntimeError( + "Both CUDA Graphs and dynamic output allocation are enabled, which are " + "incompatible runtime modes. Please disable one of the two." + ) + + with self._profile_section("PythonTRTEngine:ProcessInputs"): + self.setup_input_tensors(contiguous_inputs, False, False) + + with self._profile_section("PythonTRTEngine:SetupOutputAllocator"): + self.create_output_allocator() + for output_name in self.out_binding_names: + if not self.context.set_output_allocator( + output_name, self.output_allocator + ): + raise RuntimeError( + f"Failed to set output allocator for {output_name}" + ) + + with self._profile_section("PythonTRTEngine:TensorRTRuntime"): + self._caller_stream = torch.cuda.current_stream() + if ( + self._engine_stream == torch.cuda.default_stream() + or self._engine_stream is None + ): + self._engine_stream = torch.cuda.Stream() + + self._engine_stream.wait_stream(self._caller_stream) + with torch.cuda.stream(self._engine_stream): + self.context.execute_async_v3(self._engine_stream.cuda_stream) + self._caller_stream.wait_stream(self._engine_stream) + + outputs = [] + assert self.output_allocator is not None + for idx, output_name in enumerate(self.out_binding_names): + shape = self.output_allocator.shapes.get(output_name, None) + dtype_ = self.output_dtypes[idx] + buffer_tensor = self.output_allocator.buffers.get(output_name) + assert buffer_tensor is not None + output = buffer_tensor.clone().detach() + prod = int(torch.prod(torch.tensor(shape))) + output = output.reshape(-1).view(dtype_)[:prod].reshape(shape) + outputs.append(output) + + if len(outputs) == 1: + return outputs[0] + return tuple(outputs) + + def execute( + self, inputs: Sequence[torch.Tensor] + ) -> torch.Tensor | Tuple[torch.Tensor, 
...]: + contiguous_inputs = [tensor.contiguous() for tensor in inputs] + + if torch_tensorrt.runtime._multi_device_safe_mode._PY_RT_MULTI_DEVICE_SAFE_MODE: + curr_device_id = torch.cuda.current_device() + curr_device_properties = torch.cuda.get_device_properties(curr_device_id) + if _is_switch_required( + curr_device_id, + self.target_device_id, + curr_device_properties, + self.target_device_properties, + ): + device_id, _ = _select_rt_device( + curr_device_id, + self.target_device_id, + self.target_device_properties, + ) + device = torch.device(device_id) + torch.cuda.set_device(device_id) + contiguous_inputs = [tensor.to(device) for tensor in contiguous_inputs] + logger.warning(f"Moved all input Tensors to cuda:{device_id}") + + if self.requires_output_allocator or self.use_output_allocator_outputs: + logger.debug("Using the dynamic allocator runtime mode.") + return self._execute_output_allocator(contiguous_inputs) + + logger.debug( + f"Using the standard execution runtime mode with cudagraphs={torch_tensorrt.runtime.get_cudagraphs_mode()}." 
+ ) + return self._execute_standard(contiguous_inputs) + + +register_opaque_type(PythonTRTEngine, typ="reference") + + +@torch.library.custom_op( # type: ignore[misc] + "tensorrt::execute_engine_python", mutates_args=() +) +def execute_engine_python( + input_tensors: List[torch.Tensor], engine: PythonTRTEngine +) -> List[torch.Tensor]: + outputs = engine.execute(input_tensors) + return [outputs] if isinstance(outputs, torch.Tensor) else list(outputs) diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py deleted file mode 100644 index 31182bbe21..0000000000 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ /dev/null @@ -1,813 +0,0 @@ -from __future__ import annotations - -import logging -from contextlib import nullcontext -from typing import Any, Dict, List, Optional, Sequence, Tuple - -import torch -import torch_tensorrt -from torch.nn import Module -from torch_tensorrt._Device import Device -from torch_tensorrt._enums import Platform, dtype -from torch_tensorrt.dynamo._defaults import DEBUG_LOGGING_DIR -from torch_tensorrt.dynamo._settings import CompilationSettings -from torch_tensorrt.dynamo.debug._DebuggerConfig import DebuggerConfig -from torch_tensorrt.dynamo.debug._supports_debugger import cls_supports_debugger -from torch_tensorrt.dynamo.utils import DYNAMIC_DIM -from torch_tensorrt.logging import TRT_LOGGER -from torch_tensorrt.runtime._utils import ( - _is_switch_required, - _select_rt_device, - multi_gpu_device_check, -) - -import tensorrt as trt - -logger = logging.getLogger(__name__) - - -class DynamicOutputAllocator(trt.IOutputAllocator): # type: ignore[misc] - def __init__(self, output_dtypes: Dict[str, torch.dtype]) -> None: - trt.IOutputAllocator.__init__(self) - self.buffers: Dict[str, torch.Tensor] = {} - self.shapes: Dict[str, Tuple[int, ...]] = {} - self.dtypes: Dict[str, torch.dtype] = output_dtypes - - def reallocate_output_async( - 
self, - tensor_name: str, - memory: int, - size: int, - alignment: int, - stream: torch.cuda.Stream, - ) -> Any: - shape = (size,) - if tensor_name not in self.buffers: - self.buffers[tensor_name] = torch.empty( - shape, - dtype=self.dtypes[tensor_name], - device=torch.cuda.current_device(), - ) - else: - if self.buffers[tensor_name].shape != shape: - self.buffers[tensor_name] = torch.empty( - shape, - dtype=self.dtypes[tensor_name], - device=torch.cuda.current_device(), - ) - return self.buffers[tensor_name].data_ptr() - - def notify_shape(self, tensor_name: str, shape: Tuple[int, ...]) -> None: - self.shapes[tensor_name] = tuple(shape) - - -class TorchTRTRuntimeStates: - def __init__(self, new_cudagraphs: bool): - # Indicates whether CUDAGraphs were enabled in the previous execute_engine - self.old_cudagraphs = new_cudagraphs - # Indicates whether pre-allocated output was enabled in the previous execute_engine - self.old_pre_allocated_outputs = False - # Indicates whether context has changed - self.context_changed = False - - def set_runtime_states( - self, - new_cudagraphs: bool, - new_pre_allocated_output: bool, - shape_changed: bool, - ) -> Tuple[bool, bool, bool]: - # Evaluates whether certain conditions are met to enable CUDA Graph recording or to use pre-allocated outputs - # based on the current and previous states, as well as input shape has changed - need_cudagraphs_record = False - can_use_pre_allocated_outputs = False - need_cudagraphs_reset = False - - # CUDA Graph recording is needed if CUDA graphs is enabled and: - # - CUDA graphs were previously disabled - # - or the shape has changed - # - or the execution context has changed (e.g., weight streaming) - if new_cudagraphs and ( - not self.old_cudagraphs or shape_changed or self.context_changed - ): - need_cudagraphs_record = True - - # Pre-allocated output can be used when previous and current state are true without shape change - if ( - self.old_pre_allocated_outputs - and new_pre_allocated_output 
- and (not shape_changed) - ): - can_use_pre_allocated_outputs = True - - if not new_cudagraphs or shape_changed or self.context_changed: - need_cudagraphs_reset = True - - self.old_cudagraphs = new_cudagraphs - self.old_pre_allocated_outputs = new_pre_allocated_output - # reset flag - self.context_changed = False - - return ( - need_cudagraphs_record, - can_use_pre_allocated_outputs, - need_cudagraphs_reset, - ) - - -@cls_supports_debugger -class PythonTorchTensorRTModule(Module): # type: ignore[misc] - """PythonTorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine. - - This module is backed by the Torch-TensorRT runtime and is only compatible with - FX / Dynamo / Python deployments. This module cannot be serialized to torchscript via torch.jit.trace for C++ deployment. - """ - - def __init__( - self, - serialized_engine: Optional[bytes] = None, - input_binding_names: Optional[List[str]] = None, - output_binding_names: Optional[List[str]] = None, - *, - name: str = "", - settings: CompilationSettings = CompilationSettings(), - weight_name_map: Optional[dict[Any, Any]] = None, - requires_output_allocator: bool = False, - symbolic_shape_expressions: Optional[Dict[str, List[Dict[str, Any]]]] = None, - _debugger_config: Optional[DebuggerConfig] = None, - ): - """Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs - a PyTorch ``torch.nn.Module`` around it. 
Uses TensorRT Python APIs to run the engine - - Arguments: - serialized_engine (bytes): Serialized TensorRT engine in the form of a bytearray - input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules - output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned - - Keyword Arguments: - name (str): Name for module - settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed - weight_name_map (dict): Mapping of engine weight name to state_dict weight name - requires_output_allocator (bool): Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators) - symbolic_shape_expressions (List[str]): List of symbolic shape expressions for each output binding - - Example: - - .. code-block:: py - - trt_module = PythonTorchTensorRTModule( - engine_str, - input_binding_names=["x"], - output_binding_names=["output"], - name="my_module", - settings=CompilationSettings(device=torch.cuda.current_device) - ) - - """ - self.context: Any - self._debugger_config: Optional[DebuggerConfig] = _debugger_config - super(PythonTorchTensorRTModule, self).__init__() - self._register_state_dict_hook(PythonTorchTensorRTModule._on_state_dict) - - # Run multi-gpu device check to validate engine instantiation - multi_gpu_device_check() - - self.name = name - self._input_buffers: List[torch.Tensor] = [] - self._output_buffers: List[torch.Tensor] = [] - self.cudagraph: Optional[torch.cuda.CUDAGraph] = None - self._caller_stream: Optional[torch.cuda.Stream] = None - self._engine_stream: Optional[torch.cuda.Stream] = None - - # TODO: Make the below a Dictionary {shape: cudagraph} - self.shape_key: Optional[str] = None - - # See 
https://github.com/pytorch/pytorch/blob/acfe237a71af609e837a34bb38048aa8acb8eb4d/torch/cuda/graphs.py#L92-L98 - # Unused currently - to be used by Dynamic Shape support implementation - self.memory_pool = None - - self.serialized_engine = serialized_engine - self.input_names = ( - input_binding_names if input_binding_names is not None else [] - ) - self.output_names = ( - output_binding_names if output_binding_names is not None else [] - ) - self.initialized = False - self.target_device_id = ( - settings.device.gpu_id - if settings.device is not None - else Device._current_device().gpu_id - ) - self.target_device_properties = torch.cuda.get_device_properties( - self.target_device_id - ) - self.profiling_enabled = ( - _debugger_config.save_engine_profile - if _debugger_config is not None - else False - ) - self.settings = settings - self.engine = None - self.weight_name_map = weight_name_map - self.target_platform = Platform.current_platform() - self.runtime_states = TorchTRTRuntimeStates( - torch_tensorrt.runtime.get_cudagraphs_mode() - ) - - self.cudagraphs_enabled = False - self.pre_allocated_outputs: List[torch.Tensor] = [] - self.use_pre_allocated_outputs = False - - self.requires_output_allocator = requires_output_allocator - self.output_allocator: Optional[DynamicOutputAllocator] = None - self.use_output_allocator_outputs = False - self.device = torch.cuda.current_device() - self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() - # If the output tensor is not owned by the engine (output_tensors_are_unowned=True), we need to create a new output tensor in each forward pass - self.output_tensors_are_unowned = False - self.symbolic_shape_expressions = symbolic_shape_expressions - if self.serialized_engine is not None and not self.settings.lazy_engine_init: - self.setup_engine() - - def set_output_tensors_as_unowned(self, enabled: bool) -> None: - """ - Flag to set if the output tensors of this engine are solely owned by the Torch-TensorRT Runtime 
or if they might be shared with a user. - If the tensors are not owned by the runtime, then they must be recreated on every forward call which may have implications for performance. - Typically only the final engine in a graph requires output tensors to be unowned and there are performance gains to be had for intermediate engines to manage their own standing memory. - Therefore this should only be set to True for the final module in a graph and leave false for intermediate modules. - - Args: - enabled: bool - Whether to set the flag to True. - - """ - self.output_tensors_are_unowned = enabled - - def get_streamable_device_memory_budget(self) -> Any: - return self.engine.streamable_weights_size - - def get_automatic_device_memory_budget(self) -> Any: - return self.engine.get_weight_streaming_automatic_budget() - - def get_device_memory_budget(self) -> Any: - return self.engine.weight_streaming_budget_v2 - - def set_device_memory_budget(self, budget_bytes: int) -> int: - # Recreating the context because weight streaming budget cannot be modified while there are active context. 
- if self.context is not None: - del self.context - budget_bytes = self._set_device_memory_budget(budget_bytes) - self.context = self.engine.create_execution_context() - self.runtime_states.context_changed = True - return budget_bytes - - def _set_device_memory_budget(self, budget_bytes: int) -> int: - # Disable weight streaming for invalid budget size - if budget_bytes < 0: - budget_bytes = self.get_streamable_device_memory_budget() - self.engine.weight_streaming_budget_v2 = budget_bytes - if self.engine.weight_streaming_budget_v2 != budget_bytes: - logger.error(f"Failed to set weight streaming budget to {budget_bytes}") - budget_bytes = self.engine.weight_streaming_budget_v2 - if self.get_streamable_device_memory_budget() == budget_bytes: - logger.warning("Weight streaming is disabled") - - return budget_bytes - - def set_default_device_memory_budget(self) -> int: - budget_bytes = self.get_automatic_device_memory_budget() - # Set automatic weight streaming budget as default when context is created - logger.debug(f"Weight streaming budget set to {budget_bytes}B") - return self._set_device_memory_budget(budget_bytes) - - def setup_engine(self) -> None: - assert ( - self.target_platform == Platform.current_platform() - ), f"TensorRT engine was not built to target current platform (target: {self.target_platform}, current: {Platform.current_platform()})" - - self.initialized = True - runtime = trt.Runtime(TRT_LOGGER) - self.engine = runtime.deserialize_cuda_engine(self.serialized_engine) - if self.settings.enable_weight_streaming: - self.set_default_device_memory_budget() - self.context = self.engine.create_execution_context() - assert self.context is not None, "Failed to create execution context" - assert self.engine.num_io_tensors == ( - len(self.input_names) + len(self.output_names) - ) - - self.input_dtypes = [ - dtype._from(self.engine.get_tensor_dtype(input_name)) - for input_name in self.input_names - ] - - self.input_shapes = [ - 
self.engine.get_tensor_shape(input_name) for input_name in self.input_names - ] - self.output_dtypes = [ - dtype._from(self.engine.get_tensor_dtype(output_name)).to(torch.dtype) - for output_name in self.output_names - ] - self.output_shapes = [ - self.engine.get_tensor_shape(output_name) - for output_name in self.output_names - ] - - if self.requires_output_allocator: - self.create_output_allocator() - - if torch_tensorrt.runtime.get_cudagraphs_mode(): - self.cudagraph = torch.cuda.CUDAGraph() - - self.is_shape_inference_io = { - input_name: self.engine.is_shape_inference_io(input_name) - for input_name in self.input_names - } - - def _check_initialized(self) -> None: - if not self.initialized: - raise RuntimeError("PythonTorchTensorRTModule is not initialized.") - - def _on_state_dict(self, state_dict: Dict[str, Any], prefix: str, _: Any) -> None: - state_dict[prefix + "engine"] = self.serialized_engine - state_dict[prefix + "input_names"] = self.input_names - state_dict[prefix + "output_names"] = self.output_names - state_dict[prefix + "platform"] = self.target_platform - - def _load_from_state_dict( - self, - state_dict: Dict[str, Any], - prefix: str, - local_metadata: Any, - strict: Any, - missing_keys: Any, - unexpected_keys: Any, - error_msgs: Any, - ) -> None: - self.serialized_engine = state_dict[prefix + "engine"] - self.input_names = state_dict[prefix + "input_names"] - self.output_names = state_dict[prefix + "output_names"] - self.target_platform = state_dict[prefix + "platform"] - - # Run multi-gpu device check to validate engine instantiation - multi_gpu_device_check() - self.setup_engine() - - def __getstate__(self) -> Dict[str, Any]: - state = self.__dict__.copy() - state.pop("engine", None) - state.pop("context", None) - return state - - def __setstate__(self, state: Dict[str, Any]) -> None: - self.__dict__.update(state) - self.setup_engine() - - def __deepcopy__(self, memo: Any) -> PythonTorchTensorRTModule: - cls = self.__class__ - result = 
cls.__new__(cls) - memo[id(self)] = result - result.__setstate__(self.__getstate__()) - return result - - def _reset_captured_graph(self) -> None: - if self.cudagraph: - self.cudagraph.reset() - self.cudagraph = None - - def __del__(self) -> None: - self._reset_captured_graph() - - def setup_input_tensors( - self, - contiguous_inputs: List[torch.Tensor], - cudagraphs_enabled: bool, - need_cudagraphs_record: bool, - ) -> None: - for i, input_name in enumerate(self.input_names): - if not contiguous_inputs[i].is_cuda: - logger.warning( - f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. " - "This tensor is being moved by the runtime but for performance considerations, " - "ensure your inputs are all on GPU and open an issue here " - "(https://github.com/pytorch/TensorRT/issues) if this warning persists." - ) - contiguous_inputs = ( - contiguous_inputs[:i] - + [contiguous_inputs[i].cuda()] - + contiguous_inputs[i + 1 :] - ) - - assert ( - contiguous_inputs[i].dtype == self.input_dtypes[i] - ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}." 
- - if need_cudagraphs_record: - # If cudagraphs is enabled, this memory is reserved for future cudagraph runs - # Clone is required to avoid re-using user-provided GPU memory - self._input_buffers[i] = contiguous_inputs[i].clone() - - # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers - # as per TensorRT requirements - if self.is_shape_inference_io[input_name]: - # Shape tensor inputs are casted to int64 explicitly - # Currently Torch CPU pointers are not working; numpy pointers are used instead - # to refer to underlying memory - inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy() - self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data) - else: - self.context.set_input_shape( - input_name, tuple(contiguous_inputs[i].shape) - ) - tensor_to_bind = contiguous_inputs[i] - if tensor_to_bind.numel() == 0: - # Use a single persistent placeholder for empty tensors (allocated once, reused) - if not hasattr(self, "_empty_tensor_placeholder"): - self._empty_tensor_placeholder = torch.empty( - 1, - dtype=tensor_to_bind.dtype, - device=torch.cuda.current_device(), - ) - tensor_to_bind = self._empty_tensor_placeholder - if cudagraphs_enabled: - self._input_buffers[i].copy_(contiguous_inputs[i]) - self.context.set_tensor_address( - input_name, self._input_buffers[i].data_ptr() - ) - else: - self.context.set_tensor_address( - input_name, tensor_to_bind.data_ptr() - ) - - def create_output_tensors(self) -> List[torch.Tensor]: - # create output tensors - outputs: List[torch.Tensor] = [] - - for o, _ in enumerate(self.output_names): - output = torch.empty( - size=self.output_shapes[o], - dtype=self.output_dtypes[o], - device=self.device, - ) - outputs.append(output) - return outputs - - def set_pre_allocated_outputs(self, enable: bool) -> None: - self.use_pre_allocated_outputs = enable - - def set_use_output_allocator(self, enable: bool) -> None: - self.use_output_allocator_outputs = enable - - def 
create_output_allocator(self) -> None: - if self.output_allocator is None: - output_dtypes_dict = {} - for o, output_name in enumerate(self.output_names): - output_dtypes_dict[output_name] = self.output_dtypes[o] - self.output_allocator = DynamicOutputAllocator(output_dtypes_dict) - - def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]: - def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]: - shape_changed = self.validate_input_shapes(contiguous_inputs) - ( - need_cudagraphs_record, - can_use_pre_allocated_outputs, - need_cudagraphs_reset, - ) = self.runtime_states.set_runtime_states( - self.cudagraphs_enabled, self.use_pre_allocated_outputs, shape_changed - ) - - if need_cudagraphs_reset: - self._reset_captured_graph() - - if need_cudagraphs_record: - self._input_buffers = [None] * len(self.input_names) - self._output_buffers = [None] * len(self.output_names) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:ProcessInputs" - ) - if self.profiling_enabled - else nullcontext() - ): - assert len(contiguous_inputs) == len( - self.input_names - ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}." - - self.setup_input_tensors( - contiguous_inputs, self.cudagraphs_enabled, need_cudagraphs_record - ) - - if shape_changed: - # Check if input shapes can be inferred. - uninferred_input_names = self.context.infer_shapes() - if uninferred_input_names: - logger.warning( - f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior. 
\ - This could happen if the input tensor addresses/shapes haven't been configured correctly" - ) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:ProcessOutputs" - ) - if self.profiling_enabled - else nullcontext() - ): - if can_use_pre_allocated_outputs: - outputs = self.pre_allocated_outputs - else: - self.output_shapes = [ - tuple(self.context.get_tensor_shape(output_name)) - for output_name in self.output_names - ] - if DYNAMIC_DIM in self.output_shapes: - raise ValueError( - "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported." - ) - outputs = self.create_output_tensors() - - for o, output_name in enumerate(self.output_names): - if need_cudagraphs_record: - self._output_buffers[o] = outputs[o].clone() - - if self.cudagraphs_enabled: - self.context.set_tensor_address( - output_name, self._output_buffers[o].data_ptr() - ) - else: - self.context.set_tensor_address( - output_name, outputs[o].data_ptr() - ) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:TensorRTRuntime" - ) - if self.profiling_enabled - else nullcontext() - ): - self._caller_stream = torch.cuda.current_stream() - if ( - self._engine_stream == torch.cuda.default_stream() - or self._engine_stream is None - ): - self._engine_stream = torch.cuda.Stream() - - self._engine_stream.wait_stream(self._caller_stream) - - with torch.cuda.stream(self._engine_stream): - if self.cudagraphs_enabled: - if need_cudagraphs_record: - self.cudagraph = torch.cuda.CUDAGraph() - - if self.profiling_enabled: - self.cudagraph.enable_debug_mode() - - with torch.cuda.graph( - self.cudagraph, stream=self._engine_stream - ): - self.context.execute_async_v3( - self._engine_stream.cuda_stream - ) - - if self.profiling_enabled: - self.cudagraph.debug_dump( - f"{DEBUG_LOGGING_DIR}/{self.name}_cudagraph.dot" - ) - - self.cudagraph.replay() # type: ignore - - else: - 
self.context.execute_async_v3(self._engine_stream.cuda_stream) - - self._caller_stream.wait_stream(self._engine_stream) - - # When the pre-allocated output mode is turned on, for intermediate modules, we only create the output in the first execution or when shape is changed. - if self.use_pre_allocated_outputs and ( - self.output_tensors_are_unowned - or not self.pre_allocated_outputs - or shape_changed - ): - self.pre_allocated_outputs = self.create_output_tensors() - - if self.cudagraphs_enabled: - for idx, o in enumerate(outputs): - o.copy_(self._output_buffers[idx]) - - if len(outputs) == 1: - return outputs[0] - - return outputs - - def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]: - assert ( - not torch_tensorrt.runtime.get_cudagraphs_mode() - ), "CUDA Graphs are not compatible with OutputAllocator." - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:ProcessInputs" - ) - if self.profiling_enabled - else nullcontext() - ): - assert len(contiguous_inputs) == len( - self.input_names - ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}." 
- - self.setup_input_tensors(contiguous_inputs, False, False) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:SetupOutputAllocator" - ) - if self.profiling_enabled - else nullcontext() - ): - self.create_output_allocator() - # need to set output allocator every run - for output_name in self.output_names: - if not self.context.set_output_allocator( - output_name, self.output_allocator - ): - raise RuntimeError( - f"Failed to set output allocator for {output_name}" - ) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:TensorRTRuntime" - ) - if self.profiling_enabled - else nullcontext() - ): - self._caller_stream = torch.cuda.current_stream() - if ( - self._engine_stream == torch.cuda.default_stream() - or self._engine_stream is None - ): - self._engine_stream = torch.cuda.Stream() - - self._engine_stream.wait_stream(self._caller_stream) - - with torch.cuda.stream(self._engine_stream): - self.context.execute_async_v3( - self._engine_stream.cuda_stream - ) # The OutputAllocator is called by execute_async_v3() - - self._caller_stream.wait_stream(self._engine_stream) - - with ( - torch.autograd.profiler.record_function( - "PythonTorchTensorRTModule:ProcessOutputs" - ) - if self.profiling_enabled - else nullcontext() - ): - outputs = [] - assert self.output_allocator is not None - for o, output_name in enumerate(self.output_names): - shape = self.output_allocator.shapes.get(output_name, None) - dtype = self.output_dtypes[o] - output = ( - self.output_allocator.buffers.get(output_name, None) - .clone() - .detach() - ) - prod = int(torch.prod(torch.tensor(shape))) - # When using the OutputAllocator, the allocated buffer might be larger than the size of the output, - # so we need to reshape the buffer to the output shape - output = output.reshape(-1).view(dtype)[:prod].reshape(shape) - outputs.append(output) - - if len(outputs) == 1: - return outputs[0] - - return outputs - - self.cudagraphs_enabled = 
torch_tensorrt.runtime.get_cudagraphs_mode() - - # Run forward function - contiguous_inputs: List[torch.Tensor] = [ - (i.contiguous() if isinstance(i, torch.Tensor) else torch.tensor(i).cuda()) - for i in inputs - ] - with ( - torch.autograd.profiler.record_function("PythonTorchTensorRTModule:Forward") - if self.profiling_enabled - else nullcontext() - ): - self._check_initialized() - - # If in safe mode, check at each iteration for whether a switch is required - if ( - torch_tensorrt.runtime._multi_device_safe_mode._PY_RT_MULTI_DEVICE_SAFE_MODE - ): - curr_device_id = torch.cuda.current_device() - curr_device_properties = torch.cuda.get_device_properties( - curr_device_id - ) - logger.debug(f"Current Device: cuda:{curr_device_id}") - - # If a switch is required, move all inputs to new device and set as active device - if _is_switch_required( - curr_device_id, - self.target_device_id, - curr_device_properties, - self.target_device_properties, - ): - device_id, _ = _select_rt_device( - curr_device_id, - self.target_device_id, - self.target_device_properties, - ) - - # Update current device - device = torch.device(device_id) - torch.cuda.set_device(device_id) - - contiguous_inputs = [ - tensor.to(device) for tensor in contiguous_inputs - ] - logger.warning(f"Moved all input Tensors to cuda:{device_id}") - - if self.requires_output_allocator: # engine requires OA - if self.cudagraphs_enabled: - raise RuntimeError( - "The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs." - ) - logger.debug("Using the dynamic allocator runtime mode.") - return run_output_allocator() - else: - if self.use_output_allocator_outputs: # users call OA context manager - if self.cudagraphs_enabled: - raise RuntimeError( - "Both CUDA Graphs and dynamic output allocation are enabled, which are incompatible runtime modes. Please disable one of the two." 
- ) - logger.debug("Using the dynamic allocator runtime mode.") - return run_output_allocator() - else: - logger.debug( - f"Using the standard execution runtime mode with cudagraphs={self.cudagraphs_enabled}." - ) - return run_standard_execution() - - def enable_profiling(self, profiler: "trt.IProfiler" = None) -> None: - """ - Enable TensorRT profiling. After calling this function, TensorRT will report - time spent on each layer in stdout for each forward run. - """ - self._check_initialized() - - if not self.context.profiler: - self.context.profiler = trt.Profiler() if profiler is None else profiler - - self.profiling_enabled = True - - def disable_profiling(self) -> None: - """ - Disable TensorRT profiling. - """ - self._check_initialized() - torch.cuda.synchronize() - del self.context - self.context = self.engine.create_execution_context() - self.profiling_enabled = False - - def get_layer_info(self) -> str: - """ - Get layer info of the engine. Only support for TRT > 8.2. - """ - inspector = self.engine.create_engine_inspector() - engine_json: str = inspector.get_engine_information( - trt.LayerInformationFormat.JSON - ) - return engine_json - - def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: - """ - Validates the input shapes of the forward function has changed - """ - # Representation of input shapes to a given model - # Shapes are concatenated as so: - # x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5) - if not all(isinstance(t, torch.Tensor) for t in inputs): - return True - - new_shape_key = "".join( - str(tuple(t.shape)).replace(" ", "") - for t in inputs - if isinstance(t, torch.Tensor) - ) - - # If the new shape key differs from the existing one, - # invalidate the old shape key and remove the CUDAGraph - if new_shape_key != self.shape_key: - logger.debug(f"Input shape changed {self.shape_key} -> {new_shape_key}") - self.shape_key = new_shape_key - return True - - return False - - def are_output_tensors_unowned(self) -> bool: - return 
self.output_tensors_are_unowned diff --git a/py/torch_tensorrt/dynamo/runtime/_RuntimeBackendSelection.py b/py/torch_tensorrt/dynamo/runtime/_RuntimeBackendSelection.py new file mode 100644 index 0000000000..57a0849732 --- /dev/null +++ b/py/torch_tensorrt/dynamo/runtime/_RuntimeBackendSelection.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import logging +from enum import Enum +from typing import Union + +import torch_tensorrt + +logger = logging.getLogger(__name__) + + +class RuntimeBackend(str, Enum): + """Which Torch-TensorRT engine execution stack to use.""" + + CPP = "cpp" + PYTHON = "python" + + +_RuntimeBackendArg = Union[RuntimeBackend, str] + + +def _default_runtime_backend() -> RuntimeBackend: + return ( + RuntimeBackend.CPP + if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime + else RuntimeBackend.PYTHON + ) + + +_RUNTIME_BACKEND: RuntimeBackend = _default_runtime_backend() + + +def _normalize_runtime_backend(backend: _RuntimeBackendArg) -> RuntimeBackend: + if isinstance(backend, RuntimeBackend): + if ( + backend is RuntimeBackend.CPP + and not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime + ): + raise RuntimeError( + "C++ Torch-TensorRT runtime is not available in this build" + ) + return backend + + normalized = backend.lower() + if normalized not in ("cpp", "python"): + raise ValueError(f"Unsupported runtime backend: {backend}") + member = RuntimeBackend(normalized) + if ( + member is RuntimeBackend.CPP + and not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime + ): + raise RuntimeError("C++ Torch-TensorRT runtime is not available in this build") + return member + + +def get_runtime_backend() -> RuntimeBackend: + """Return the process-wide default backend (``cpp`` or ``python``).""" + return _RUNTIME_BACKEND + + +class _RuntimeBackendContextManager: + def __init__(self, old_backend: RuntimeBackend) -> None: + self.old_backend = old_backend + + def __enter__(self) -> "_RuntimeBackendContextManager": + return self + 
+ def __exit__(self, *args: object) -> None: + global _RUNTIME_BACKEND + _RUNTIME_BACKEND = self.old_backend + + +def set_runtime_backend(backend: _RuntimeBackendArg) -> _RuntimeBackendContextManager: + """Context manager: set global C++ vs Python engine path for unpinned modules. + + Use around compile and forward so :class:`~torch_tensorrt.runtime.TorchTensorRTModule` + picks up the intended backend when it is constructed: + + .. code-block:: python + + with torch_tensorrt.runtime.set_runtime_backend("python"): + trt_gm = torch_tensorrt.dynamo.compile(...) + + If the return value is not used with ``with``, the backend remains changed until you + call ``__exit__`` on the returned object (or enter another ``set_runtime_backend`` context). + """ + global _RUNTIME_BACKEND + old_backend = _RUNTIME_BACKEND + _RUNTIME_BACKEND = _normalize_runtime_backend(backend) + logger.info(f"Set Torch-TensorRT runtime backend to {_RUNTIME_BACKEND}") + return _RuntimeBackendContextManager(old_backend) diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index d77c0bf39f..ffb37e15ac 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -4,77 +4,71 @@ import copy import logging import pickle -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union, cast import torch from torch_tensorrt._Device import Device from torch_tensorrt._enums import Platform -from torch_tensorrt._features import ( - ENABLED_FEATURES, - for_all_methods, - needs_torch_tensorrt_runtime, -) +from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt.dynamo._settings import CompilationSettings +from torch_tensorrt.dynamo.runtime._PythonTRTEngine import PythonTRTEngine +from torch_tensorrt.dynamo.runtime._RuntimeBackendSelection import ( + RuntimeBackend, + _normalize_runtime_backend, 
+ get_runtime_backend, +) +from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ( + ABI_TARGET_IDX, + ABI_VERSION, + DEVICE_IDX, + ENGINE_IDX, + HW_COMPATIBLE_IDX, + INPUT_BINDING_NAMES_IDX, + NAME_IDX, + OUTPUT_BINDING_NAMES_IDX, + REQUIRES_OUTPUT_ALLOCATOR_IDX, + RESOURCE_ALLOCATION_STRATEGY_IDX, + SERIALIZATION_LEN, + SERIALIZED_METADATA_IDX, + TARGET_PLATFORM_IDX, + SerializedTensorRTEngineFmt, + serialize_binding_names, + serialize_device_info, +) logger = logging.getLogger(__name__) -SerializedTensorRTEngineFmt = List[ - Union[str, bytes] -] # Aligned with //core/runtime/register_jit_hooks.cpp SerializedTorchTensorRTModuleFmt = Tuple[ + str, + Optional[SerializedTensorRTEngineFmt], + List[str], + List[str], + Optional[str], +] +# Checkpoints written before the trailing ``runtime_backend`` slot used four elements. +_LegacyTorchTensorRTModuleExtraState = Tuple[ str, Optional[SerializedTensorRTEngineFmt], List[str], List[str] ] +TorchTensorRTModuleExtraState = Union[ + SerializedTorchTensorRTModuleFmt, + _LegacyTorchTensorRTModuleExtraState, +] -ABI_TARGET_IDX = -1 # Not implemented -NAME_IDX = -1 # Not implemented -DEVICE_IDX = -1 # Not implemented -ENGINE_IDX = -1 # Not implemented -INPUT_BINDING_NAMES_IDX = -1 # Not implemented -OUTPUT_BINDING_NAMES_IDX = -1 # Not implemented -HW_COMPATIBLE_IDX = -1 # Not implemented -SERIALIZED_METADATA_IDX = -1 # Not implemented -TARGET_PLATFORM_IDX = -1 # Not implemented -REQUIRES_OUTPUT_ALLOCATOR_IDX = -1 # Not implemented -SERIALIZATION_LEN = -1 # Not implemented - -if ENABLED_FEATURES.torch_tensorrt_runtime: - ABI_TARGET_IDX = torch.ops.tensorrt.ABI_TARGET_IDX() # 0 - NAME_IDX = torch.ops.tensorrt.NAME_IDX() # 1 - DEVICE_IDX = torch.ops.tensorrt.DEVICE_IDX() # 2 - ENGINE_IDX = torch.ops.tensorrt.ENGINE_IDX() # 3 - INPUT_BINDING_NAMES_IDX = torch.ops.tensorrt.INPUT_BINDING_NAMES_IDX() # 4 - OUTPUT_BINDING_NAMES_IDX = torch.ops.tensorrt.OUTPUT_BINDING_NAMES_IDX() # 5 - HW_COMPATIBLE_IDX = 
torch.ops.tensorrt.HW_COMPATIBLE_IDX() # 6 - SERIALIZED_METADATA_IDX = torch.ops.tensorrt.SERIALIZED_METADATA_IDX() # 7 - TARGET_PLATFORM_IDX = torch.ops.tensorrt.TARGET_PLATFORM_IDX() # 8 - REQUIRES_OUTPUT_ALLOCATOR_IDX = ( - torch.ops.tensorrt.REQUIRES_OUTPUT_ALLOCATOR_IDX() - ) # 9 - RESOURCE_ALLOCATION_STRATEGY_IDX = ( - torch.ops.tensorrt.RESOURCE_ALLOCATION_STRATEGY_IDX() - ) # 10 - SERIALIZATION_LEN = torch.ops.tensorrt.SERIALIZATION_LEN() # 11 - - -@for_all_methods(needs_torch_tensorrt_runtime) -class TorchTensorRTModule(torch.nn.Module): # type: ignore[misc] - """TorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine. - - This module is backed by the Torch-TensorRT runtime and is fully compatible with both - FX / Python deployments (just ``import torch_tensorrt`` as part of the application) as - well as TorchScript / C++ deployments since TorchTensorRTModule can be passed to ``torch.jit.trace`` - and then saved. - - The forward function is simpily forward(*args: torch.Tensor) -> Tuple[torch.Tensor] where - the internal implementation is ``return Tuple(torch.ops.tensorrt.execute_engine(list(inputs), self.engine))`` - - > Note: TorchTensorRTModule only supports engines built with explicit batch - Attributes: - name (str): Name of module (for easier debugging) - engine (torch.classes.tensorrt.Engine): Torch-TensorRT TensorRT Engine instance, manages [de]serialization, device configuration, profiling - input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules - output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned +class TorchTensorRTModule(torch.nn.Module): # type: ignore[misc] + """``nn.Module`` that runs a TensorRT engine inside PyTorch. 
+ + Execution uses either the C++ Torch-TensorRT runtime (``torch.classes.tensorrt.Engine``) + or the Python TRT stack (``tensorrt`` + ``execute_engine_python``), depending on + :func:`~torch_tensorrt.runtime.get_runtime_backend` (set via + :func:`~torch_tensorrt.runtime.set_runtime_backend` as a context manager for scoped + changes). The backend is read from :func:`~torch_tensorrt.runtime.get_runtime_backend` + when the module is constructed (and from checkpoint metadata on load). + + Supports ``torch.save`` / ``torch.load`` via ``get_extra_state`` / ``set_extra_state``. + Extra state is a 5-tuple; the last element is ``runtime_backend`` (enum value as + ``str``) when an engine is saved, or ``None`` when there is no engine. If the fifth + element is missing (legacy 4-tuple with an engine), the C++ backend is used. """ def __init__( @@ -84,52 +78,24 @@ def __init__( output_binding_names: Optional[List[str]] = None, *, name: str = "", - settings: CompilationSettings = CompilationSettings(), # Assumes engine was built with default compilation settings if object not passed + settings: CompilationSettings = CompilationSettings(), weight_name_map: Optional[dict[Any, Any]] = None, requires_output_allocator: bool = False, symbolic_shape_expressions: Optional[Dict[str, List[Dict[str, Any]]]] = None, - ): - """Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs - a PyTorch ``torch.nn.Module`` around it. Uses the Torch-TensorRT runtime extension to run the engines - - If binding names are not provided, it is assumed that the engine binding names follow the following convention: - - - [symbol].[index in input / output array] - - ex. 
[x.0, x.1, x.2] -> [y.0] - - Arguments: - serialized_engine (bytes): Serialized TensorRT engine in the form of a bytearray - input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules - output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned - - Keyword Arguments: - name (str): Name for module - settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed - weight_name_map (dict): Mapping of engine weight name to state_dict weight name - requires_output_allocator (bool): Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators) - symbolic_shape_expressions (List[Any]): List of symbolic shape expressions for each input binding - - Example: - - .. code-block:: py - - with io.BytesIO() as engine_bytes: - engine_bytes.write(trt_engine.serialize()) - engine_str = engine_bytes.getvalue() - - trt_module = TorchTensorRTModule( - engine_str, - input_binding_names=["x"], - output_binding_names=["output"], - name="my_module", - settings=CompilationSettings(device=torch.cuda.current_device) - ) + ) -> None: + """Build the module from serialized engine bytes and binding metadata. + Args: + serialized_engine: Raw TRT engine bytes (``None`` if restoring state only). + input_binding_names: Input tensor names in ``forward`` order. + output_binding_names: Output tensor names in return order. + name: Logical name for logging and serialization. + settings: Compilation/runtime settings (device, lazy init, cross-compile, etc.). + weight_name_map: Engine weight name to ``state_dict`` key mapping (refit). + requires_output_allocator: Engine needs TRT dynamic output allocation. + symbolic_shape_expressions: Optional symbolic shape metadata from compile. 
""" - super(TorchTensorRTModule, self).__init__() - - if not isinstance(serialized_engine, bytearray): - ValueError("Expected serialized engine as bytearray") + super().__init__() self.input_binding_names = ( input_binding_names if input_binding_names is not None else [] @@ -142,10 +108,17 @@ def __init__( self.settings = copy.deepcopy(settings) self.weight_name_map = weight_name_map self.serialized_engine = serialized_engine - self.engine = None + self.engine: Optional[Any] = None self.requires_output_allocator = requires_output_allocator self.dynamically_allocate_resources = settings.dynamically_allocate_resources self.symbolic_shape_expressions = symbolic_shape_expressions + self.target_platform = ( + Platform.current_platform() + if not self.settings.enable_cross_compile_for_windows + else Platform.WIN_X86_64 + ) + self._runtime_backend = get_runtime_backend() + self.profiling_enabled = False if ( serialized_engine @@ -154,6 +127,21 @@ def __init__( ): self.setup_engine() + def _require_engine(self) -> Any: + if self.engine is None: + raise RuntimeError("Engine has not been setup yet.") + return self.engine + + @property + def _is_python_runtime(self) -> bool: + return self._runtime_backend is RuntimeBackend.PYTHON + + def _cleanup_engine(self) -> None: + engine = getattr(self, "engine", None) + if engine is not None and hasattr(engine, "close"): + engine.close() + self.engine = None + def _pack_engine_info(self) -> List[str | bytes]: target_device = ( self.settings.device @@ -170,30 +158,36 @@ def _pack_engine_info(self) -> List[str | bytes]: else self.engine.are_output_tensors_unowned() ), } - target_platform = ( - Platform.current_platform() - if not self.settings.enable_cross_compile_for_windows - else Platform.WIN_X86_64 - ) # Change to match target for engine engine_info: List[str | bytes] = [""] * SERIALIZATION_LEN - engine_info[ABI_TARGET_IDX] = torch.ops.tensorrt.ABI_VERSION() + engine_info[ABI_TARGET_IDX] = ( + torch.ops.tensorrt.ABI_VERSION() + if 
ENABLED_FEATURES.torch_tensorrt_runtime + else ABI_VERSION + ) engine_info[NAME_IDX] = ( self.name + "_engine" if self.name != "" else "tensorrt_engine" ) - engine_info[DEVICE_IDX] = target_device._to_serialized_rt_device() - assert self.serialized_engine + engine_info[DEVICE_IDX] = ( + target_device._to_serialized_rt_device() + if ENABLED_FEATURES.torch_tensorrt_runtime + else serialize_device_info(target_device) + ) + assert self.serialized_engine is not None engine_info[ENGINE_IDX] = self.serialized_engine - - engine_info[INPUT_BINDING_NAMES_IDX] = TorchTensorRTModule._pack_binding_names( + engine_info[INPUT_BINDING_NAMES_IDX] = serialize_binding_names( self.input_binding_names ) - engine_info[OUTPUT_BINDING_NAMES_IDX] = TorchTensorRTModule._pack_binding_names( + engine_info[OUTPUT_BINDING_NAMES_IDX] = serialize_binding_names( self.output_binding_names ) engine_info[HW_COMPATIBLE_IDX] = str(int(self.hardware_compatible)) engine_info[SERIALIZED_METADATA_IDX] = self.encode_metadata(metadata) - engine_info[TARGET_PLATFORM_IDX] = target_platform._to_serialized_rt_platform() + engine_info[TARGET_PLATFORM_IDX] = ( + self.target_platform._to_serialized_rt_platform() + if ENABLED_FEATURES.torch_tensorrt_runtime + else str(self.target_platform) + ) engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX] = str( int(self.requires_output_allocator) ) @@ -203,39 +197,37 @@ def _pack_engine_info(self) -> List[str | bytes]: engine_info[RESOURCE_ALLOCATION_STRATEGY_IDX] = str( int(self.dynamically_allocate_resources) ) - return engine_info def get_streamable_device_memory_budget(self) -> Any: - return self.engine.streamable_device_memory_budget + return self._require_engine().streamable_device_memory_budget def get_automatic_device_memory_budget(self) -> Any: - return self.engine.automatic_device_memory_budget + return self._require_engine().automatic_device_memory_budget def get_device_memory_budget(self) -> Any: - return self.engine.device_memory_budget + return 
self._require_engine().device_memory_budget def set_device_memory_budget(self, budget_bytes: int) -> int: - # Disable weight streaming for invalid budget size + engine = self._require_engine() if budget_bytes < 0: budget_bytes = self.get_streamable_device_memory_budget() - self.engine.device_memory_budget = budget_bytes - if self.engine.device_memory_budget != budget_bytes: + engine.device_memory_budget = budget_bytes + if engine.device_memory_budget != budget_bytes: logger.error(f"Failed to set weight streaming budget to {budget_bytes}") - budget_bytes = self.engine.device_memory_budget + budget_bytes = engine.device_memory_budget if self.get_streamable_device_memory_budget() == budget_bytes: logger.warning("Weight streaming is disabled") - return budget_bytes def _reset_captured_graph(self) -> None: - self.engine.reset_captured_graph() + self._require_engine().reset_captured_graph() def use_dynamically_allocated_resources( self, dynamically_allocate_resources: bool = False ) -> None: self.dynamically_allocate_resources = dynamically_allocate_resources - self.engine.use_dynamically_allocated_resources( + self._require_engine().use_dynamically_allocated_resources( self.dynamically_allocate_resources ) @@ -250,6 +242,16 @@ def setup_engine(self) -> None: """ if self.engine is not None: return + + if self._is_python_runtime: + self.engine = PythonTRTEngine( + self._pack_engine_info(), + profile_execution=self.profiling_enabled, + ) + return + + if not ENABLED_FEATURES.torch_tensorrt_runtime: + raise NotImplementedError("Torch-TensorRT Runtime is not available") self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) def encode_metadata(self, metadata: Any) -> str: @@ -259,85 +261,123 @@ def encode_metadata(self, metadata: Any) -> str: return encoded_metadata @staticmethod - def decode_metadata(encoded_metadata: bytes) -> Any: - dumped_metadata = base64.b64decode(encoded_metadata.encode("utf-8")) - metadata = pickle.loads(dumped_metadata) - return 
metadata + def decode_metadata(encoded_metadata: bytes | str) -> Any: + if isinstance(encoded_metadata, str): + encoded_metadata = encoded_metadata.encode("utf-8") + return pickle.loads(base64.b64decode(encoded_metadata)) def get_extra_state(self) -> SerializedTorchTensorRTModuleFmt: - if self.engine: - return ( - self.name, - self.engine.__getstate__(), - self.input_binding_names, - self.output_binding_names, - ) - elif self.serialized_engine: + """Return payload for ``torch.save`` (engine blob base64-encoded in the packed list).""" + if self.engine or self.serialized_engine: engine_info = self._pack_engine_info() - assert isinstance(engine_info[3], bytes) - engine_info[ENGINE_IDX] = base64.b64encode(engine_info[3]) + raw_engine_blob = engine_info[ENGINE_IDX] + assert isinstance(raw_engine_blob, (bytes, bytearray)) + engine_info[ENGINE_IDX] = base64.b64encode(raw_engine_blob) return ( self.name, engine_info, self.input_binding_names, self.output_binding_names, + self._runtime_backend.value, ) - else: - return ( - self.name, - None, - self.input_binding_names, - self.output_binding_names, - ) + return ( + self.name, + None, + self.input_binding_names, + self.output_binding_names, + None, + ) - def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: + def set_extra_state(self, state: TorchTensorRTModuleExtraState) -> None: + """Restore module fields and engine from ``torch.load`` extra state.""" + self._cleanup_engine() self.name = state[0] - - if state[1] is not None: - serialized_engine_info: SerializedTensorRTEngineFmt = state[1] - serialized_engine_info[ENGINE_IDX] = base64.b64decode( - serialized_engine_info[ENGINE_IDX] - ) - self.engine = torch.classes.tensorrt.Engine(serialized_engine_info) - self.hardware_compatible = bool( - int(serialized_engine_info[HW_COMPATIBLE_IDX]) - ) - self.requires_output_allocator = bool( - int(serialized_engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]) + self.input_binding_names = state[2] + 
self.output_binding_names = state[3] + if len(state) not in (4, 5): + raise ValueError( + "Invalid TorchTensorRTModule extra_state: expected 4 (legacy) or 5 " + f"elements when engine_info is None, got {len(state)}" ) - serialized_metadata = serialized_engine_info[SERIALIZED_METADATA_IDX] - assert isinstance(serialized_metadata, bytes) - metadata = TorchTensorRTModule.decode_metadata(serialized_metadata) - self.settings = metadata["settings"] - self.weight_name_map = metadata["weight_name_map"] - self.output_tensors_are_unowned = metadata["output_tensors_are_unowned"] - self.symbolic_shape_expressions = metadata["inout_symexprs"] - self.engine.set_output_tensors_as_unowned(self.output_tensors_are_unowned) - - else: - self.engine = None + if state[1] is None: + self.serialized_engine = None self.settings = CompilationSettings() + self.weight_name_map = None self.hardware_compatible = False + self.requires_output_allocator = False + self.dynamically_allocate_resources = False + self.symbolic_shape_expressions = None + self.target_platform = Platform.current_platform() + self.profiling_enabled = False + return - self.input_binding_names = state[2] - self.output_binding_names = state[3] + serialized_engine_info: SerializedTensorRTEngineFmt = list(state[1]) + metadata = TorchTensorRTModule.decode_metadata( + serialized_engine_info[SERIALIZED_METADATA_IDX] + ) + raw_backend = state[4] if len(state) == 5 else None + if raw_backend is None: + raw_backend = RuntimeBackend.CPP + runtime_backend = _normalize_runtime_backend(raw_backend) + self._runtime_backend = runtime_backend + + encoded_engine = serialized_engine_info[ENGINE_IDX] + decoded_engine = base64.b64decode(encoded_engine) + serialized_engine_info[ENGINE_IDX] = decoded_engine + self.serialized_engine = decoded_engine + self.hardware_compatible = bool(int(serialized_engine_info[HW_COMPATIBLE_IDX])) + self.requires_output_allocator = bool( + int(serialized_engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]) + ) + 
self.dynamically_allocate_resources = bool( + int(serialized_engine_info[RESOURCE_ALLOCATION_STRATEGY_IDX]) + ) + self.settings = metadata["settings"] + self.weight_name_map = metadata["weight_name_map"] + self.symbolic_shape_expressions = metadata["inout_symexprs"] + self.target_platform = ( + Platform.WIN_X86_64 + if self.settings.enable_cross_compile_for_windows + else Platform.current_platform() + ) + self.profiling_enabled = False + + if runtime_backend is RuntimeBackend.PYTHON: + self.engine = PythonTRTEngine(serialized_engine_info) + else: + if not ENABLED_FEATURES.torch_tensorrt_runtime: + raise NotImplementedError("Torch-TensorRT Runtime is not available") + self.engine = torch.classes.tensorrt.Engine(serialized_engine_info) + + self.engine.set_output_tensors_as_unowned( + metadata["output_tensors_are_unowned"] + ) + + def __del__(self) -> None: + self._cleanup_engine() def set_pre_allocated_outputs(self, enable: bool) -> None: - self.engine.use_pre_allocated_outputs = enable + self._require_engine().use_pre_allocated_outputs = enable def set_use_output_allocator(self, enable: bool) -> None: - self.engine.use_output_allocator_outputs = enable + self._require_engine().use_output_allocator_outputs = enable + + def _execute_engine(self, input_tensors: List[torch.Tensor]) -> List[torch.Tensor]: + """Dispatch to ``execute_engine`` or ``execute_engine_python``.""" + engine = self._require_engine() + if self._is_python_runtime: + return cast( + List[torch.Tensor], + torch.ops.tensorrt.execute_engine_python(list(input_tensors), engine), + ) + return cast( + List[torch.Tensor], + torch.ops.tensorrt.execute_engine(list(input_tensors), engine), + ) def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: - """Implementation of the forward pass for a TensorRT engine - - Args: - *inputs (Union[torch.Tensor, int]): Inputs to the forward function - - Returns: - torch.Tensor or Tuple(torch.Tensor): Result of the engine computation - """ + """Run the 
TensorRT engine on GPU tensors (non-tensor args are cast to CUDA tensors).""" if self.engine is None: raise RuntimeError("Engine has not been setup yet.") @@ -345,23 +385,13 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: self.input_binding_names ), f"Wrong number of inputs, expected {len(self.input_binding_names)} got {len(inputs)}." - # If the inputs are not Torch Tensors, which can occur in scenarios such as shape tensors - # which are outputs of a preceding Torch subgraph (where the Dynamic input may be an integer) - # directly cast the input to a Torch Tensor. - # - # This also avoids the need for type-checking inputs, since they are now explicitly casted to Torch tensors input_tensors: List[torch.Tensor] = [ - (i if isinstance(i, torch.Tensor) else torch.tensor(i).cuda()) - for i in inputs + (value if isinstance(value, torch.Tensor) else torch.tensor(value).cuda()) + for value in inputs ] - - outputs: List[torch.Tensor] = torch.ops.tensorrt.execute_engine( - list(input_tensors), self.engine - ) - + outputs = self._execute_engine(input_tensors) if len(outputs) == 1: return outputs[0] - return tuple(outputs) def enable_profiling( @@ -369,57 +399,37 @@ def enable_profiling( profiling_results_dir: Optional[str] = None, profile_format: str = "perfetto", ) -> None: - """Enable the profiler to collect latency information about the execution of the engine - - Traces can be visualized using https://ui.perfetto.dev/ or compatible alternatives - - Keyword Arguments: - profiling_results_dir (str): Absolute path to the directory to sort results of profiling. 
- """ + """Enable engine profiling (C++: optional Perfetto/TREx path prefix on disk).""" if self.engine is None: raise RuntimeError("Engine has not been initialized yet.") - if profiling_results_dir is not None: + if not self._is_python_runtime and profiling_results_dir is not None: self.engine.profile_path_prefix = profiling_results_dir - assert profile_format in ["trex", "perfetto"] + self.engine.enable_profiling() - self.engine.set_profile_format(profile_format) + if hasattr(self.engine, "set_profile_format"): + self.engine.set_profile_format(profile_format) + self.profiling_enabled = True def set_output_tensors_as_unowned(self, enabled: bool) -> None: - self.engine.set_output_tensors_as_unowned(enabled) + self._require_engine().set_output_tensors_as_unowned(enabled) def are_output_tensors_unowned(self) -> bool: - return self.engine.are_output_tensors_unowned() # type: ignore[no-any-return] + return cast(bool, self._require_engine().are_output_tensors_unowned()) def disable_profiling(self) -> None: - """Disable the profiler""" + """Disable engine profiling and clear the profiling flag on this module.""" if self.engine is None: raise RuntimeError("Engine has not been initialized yet.") - self.engine.disable_profiling() + self.profiling_enabled = False def get_layer_info(self) -> str: - """Get a JSON string containing the layer information encoded by the TensorRT engine in this module - - Returns: - - str: A JSON string which contains the layer information of the engine incapsulated in this module - """ - if self.engine is None: - raise RuntimeError("Engine has not been initialized yet.") - - layer_info: str = self.engine.get_engine_layer_info() - return layer_info + """Return TRT layer information as a JSON string (TRT version dependent).""" + return cast(str, self._require_engine().get_engine_layer_info()) def dump_layer_info(self) -> None: - """Dump layer information encoded by the TensorRT engine in this module to STDOUT""" + """Print layer information for 
this engine to stdout.""" if self.engine is None: raise RuntimeError("Engine has not been initialized yet.") - self.engine.dump_engine_layer_info() - - @staticmethod - def _pack_binding_names(binding_names: List[str]) -> str: - delim = torch.ops.tensorrt.SERIALIZED_ENGINE_BINDING_DELIM()[0] - packed_bindings: str = delim.join(binding_names) - return packed_bindings diff --git a/py/torch_tensorrt/dynamo/runtime/__init__.py b/py/torch_tensorrt/dynamo/runtime/__init__.py index 0eb66b24b0..93576ec0dd 100644 --- a/py/torch_tensorrt/dynamo/runtime/__init__.py +++ b/py/torch_tensorrt/dynamo/runtime/__init__.py @@ -1,7 +1,4 @@ import torch_tensorrt -from torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule import ( # noqa: F401 - PythonTorchTensorRTModule, -) from torch_tensorrt.dynamo.runtime._ResourceAllocator import ( # noqa: F401 ResourceAllocationStrategy, ) diff --git a/py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py b/py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py new file mode 100644 index 0000000000..72cacf9d49 --- /dev/null +++ b/py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py @@ -0,0 +1,70 @@ +"""Serialized TensorRT engine blob layout shared by C++ and Python runtimes. + +Field order and indices must stay aligned with the Torch-TensorRT C++ engine +packing (e.g. ``TRTEngine`` / ``register_jit_hooks``). Python-only builds use +this module instead of ``torch.ops.tensorrt.*_IDX()`` helpers. 
+""" + +from __future__ import annotations + +from typing import Any, Dict, List, Union + +import tensorrt as trt +import torch +import torch_tensorrt + +ABI_VERSION = "8" +ABI_TARGET_IDX = 0 +NAME_IDX = 1 +DEVICE_IDX = 2 +ENGINE_IDX = 3 +INPUT_BINDING_NAMES_IDX = 4 +OUTPUT_BINDING_NAMES_IDX = 5 +HW_COMPATIBLE_IDX = 6 +SERIALIZED_METADATA_IDX = 7 +TARGET_PLATFORM_IDX = 8 +REQUIRES_OUTPUT_ALLOCATOR_IDX = 9 +RESOURCE_ALLOCATION_STRATEGY_IDX = 10 +SERIALIZATION_LEN = 11 + +SERIALIZED_ENGINE_BINDING_DELIM = "%" +SERIALIZED_RT_DEVICE_DELIM = "%" + +SerializedTensorRTEngineFmt = List[Union[str, bytes]] + + +def serialize_binding_names(binding_names: List[str]) -> str: + return SERIALIZED_ENGINE_BINDING_DELIM.join(binding_names) + + +def deserialize_binding_names(binding_names: str) -> List[str]: + return binding_names.split(SERIALIZED_ENGINE_BINDING_DELIM) if binding_names else [] + + +def serialize_device_info(device: torch_tensorrt.Device) -> str: + dev_info = torch.cuda.get_device_properties(device.gpu_id) + rt_info = [ + device.gpu_id, + dev_info.major, + dev_info.minor, + int(device.device_type.to(trt.DeviceType)), + dev_info.name, + ] + return SERIALIZED_RT_DEVICE_DELIM.join(str(value) for value in rt_info) + + +def parse_device_info(serialized_device_info: str) -> Dict[str, Any]: + tokens = serialized_device_info.split(SERIALIZED_RT_DEVICE_DELIM) + if len(tokens) != 5: + raise RuntimeError( + f"Unable to deserialize program target device information: {serialized_device_info}" + ) + + target_device_id = int(tokens[0]) + return { + "id": target_device_id, + "major": int(tokens[1]), + "minor": int(tokens[2]), + "device_type": int(tokens[3]), + "name": tokens[4], + } diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index e03c88153c..618d743a7e 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ 
b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -3,11 +3,57 @@ from typing import Any, Dict, List import torch +from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ( + SERIALIZED_METADATA_IDX, +) from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import TorchTensorRTModule logger = logging.getLogger(__name__) +def _trt_metadata_blob(engine: Any) -> Any: + """Base64 pickle blob from engine metadata (fake/meta execution). + + 1) Call ``get_serialized_metadata()`` on the wrapper, then on ``real_obj``. + 2) If missing (some C++ engines in tracing), read the same field from pickle state. + """ + unwrapped = getattr(engine, "real_obj", None) + + for obj in (engine, unwrapped): + if obj is None: + continue + get_meta = getattr(obj, "get_serialized_metadata", None) + if callable(get_meta): + return get_meta() + + # C++ torch.classes.tensorrt.Engine: metadata lives in __getstate__()[0][SERIALIZED_METADATA_IDX] + for obj in (unwrapped, engine): + if obj is None: + continue + getstate = getattr(obj, "__getstate__", None) + if not callable(getstate): + continue + try: + outer = getstate() + packed = outer[0] if outer else None + if ( + isinstance(packed, (list, tuple)) + and len(packed) > SERIALIZED_METADATA_IDX + ): + blob = packed[SERIALIZED_METADATA_IDX] + if blob: + return blob + except (TypeError, IndexError, AttributeError): + continue + + raise RuntimeError("TensorRT meta kernel: could not read engine metadata") + + +def _shape_info_from_trt_engine(engine: Any) -> Any: + metadata = TorchTensorRTModule.decode_metadata(_trt_metadata_blob(engine)) + return metadata.get("inout_symexprs") if metadata else None + + def _apply_symbolic_shape_expressions( inputs: List[torch.Tensor], shape_info: Dict[str, List[Dict[str, Any]]] ) -> List[torch.Tensor]: @@ -200,19 +246,7 @@ def fake_tensorrt_execute_engine( output shapes while preserving symbolic SymInt relationships. 
""" - metadata = None - if hasattr(fake_trt_engine, "real_obj"): - # Wrapped C++ engine with real_obj - trt_engine = fake_trt_engine.real_obj - metadata = TorchTensorRTModule.decode_metadata( - trt_engine.get_serialized_metadata() - ) - else: - metadata = TorchTensorRTModule.decode_metadata( - fake_trt_engine.get_serialized_metadata() - ) - - shape_info = metadata.get("inout_symexprs") if metadata else None + shape_info = _shape_info_from_trt_engine(fake_trt_engine) if shape_info: # Apply the symbolic shape expressions to create output fake tensors @@ -226,6 +260,31 @@ def fake_tensorrt_execute_engine( ) +@torch.library.register_fake("tensorrt::execute_engine_python") # type: ignore +def fake_tensorrt_execute_engine_python(inputs: List[torch.Tensor], engine: Any) -> Any: + shape_info = _shape_info_from_trt_engine(engine) + + if shape_info: + return _apply_symbolic_shape_expressions(inputs, shape_info) + + real = getattr(engine, "real_obj", None) + for o in (engine, real): + if o is None: + continue + shapes, dtypes = getattr(o, "output_shapes", None), getattr( + o, "output_dtypes", None + ) + if shapes and dtypes: + return [ + torch.empty(s, dtype=d, device=inputs[0].device) + for s, d in zip(shapes, dtypes) + ] + + raise RuntimeError( + "No output shape information found for tensorrt::execute_engine_python." 
+ ) + + @torch._library.register_fake_class("tensorrt::Engine") class FakeTRTEngine: def __init__(self, engine_info: List[str]) -> None: diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 0de257f7c6..f563d5fc44 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -138,36 +138,20 @@ def deallocate_module(module: torch.fx.GraphModule) -> None: gc.collect() -def use_python_runtime_parser(use_python_runtime: Optional[bool] = None) -> bool: - """Parses a user-provided input argument regarding Python runtime - - Automatically handles cases where the user has not specified a runtime (None) - - Returns True if the Python runtime should be used, False if the C++ runtime should be used - """ - using_python_runtime = use_python_runtime - reason = "" - - # Runtime was manually specified by the user - if using_python_runtime is not None: - reason = "as requested by user" - # Runtime was not manually specified by the user, automatically detect runtime - else: - try: - from torch_tensorrt.dynamo.runtime import TorchTensorRTModule # noqa: F401 - - using_python_runtime = False - reason = "since C++ dependency was detected as present" - except ImportError: - using_python_runtime = True - reason = "since import failed, C++ dependency not installed" +def _log_torch_compile_runtime_backend() -> None: + """Log which TRT runtime backend applies for a ``torch.compile`` / Dynamo compile.""" + from torch_tensorrt.dynamo.runtime._RuntimeBackendSelection import ( + RuntimeBackend, + get_runtime_backend, + ) + backend = get_runtime_backend() + using_python = backend is RuntimeBackend.PYTHON logger.info( - f"Using {'Python-only' if using_python_runtime else 'Default'} Torch-TRT Runtime ({reason})" + f"Using {'Python-only' if using_python else 'Default'} Torch-TRT Runtime " + f"(from runtime backend selection: {backend})" ) - return using_python_runtime - def cosine_similarity(gt_tensor: torch.Tensor, pred_tensor: 
torch.Tensor) -> float: gt_tensor = gt_tensor.flatten().to(torch.float32) @@ -590,6 +574,15 @@ def parse_dynamo_kwargs( if "options" in kwargs and len(kwargs) == 1: kwargs = kwargs["options"] + if "use_python_runtime" in kwargs: + warnings.warn( + 'torch.compile option "use_python_runtime" was removed; use ' + 'torch_tensorrt.runtime.set_runtime_backend("python"|"cpp") instead.', + DeprecationWarning, + stacklevel=2, + ) + kwargs = {k: v for k, v in kwargs.items() if k != "use_python_runtime"} + if "truncate_long_and_double" in kwargs: if ( "truncate_double" in kwargs @@ -627,8 +620,7 @@ def parse_dynamo_kwargs( settings.enabled_precisions = enabled_precisions - # Parse input runtime specification - settings.use_python_runtime = use_python_runtime_parser(settings.use_python_runtime) + _log_torch_compile_runtime_backend() # Ensure device is a torch_tensorrt Device settings.device = to_torch_tensorrt_device(settings.device) diff --git a/py/torch_tensorrt/runtime/__init__.py b/py/torch_tensorrt/runtime/__init__.py index cfc9b322b5..4b9e5146bb 100644 --- a/py/torch_tensorrt/runtime/__init__.py +++ b/py/torch_tensorrt/runtime/__init__.py @@ -1,7 +1,11 @@ from torch_tensorrt.dynamo.runtime import ( # noqa: F401 - PythonTorchTensorRTModule, TorchTensorRTModule, ) +from torch_tensorrt.dynamo.runtime._RuntimeBackendSelection import ( + RuntimeBackend, + get_runtime_backend, + set_runtime_backend, +) from torch_tensorrt.runtime._cudagraphs import ( enable_cudagraphs, get_cudagraphs_mode, diff --git a/py/torch_tensorrt/runtime/_output_allocator.py b/py/torch_tensorrt/runtime/_output_allocator.py index 163fc26306..6eb67b7218 100644 --- a/py/torch_tensorrt/runtime/_output_allocator.py +++ b/py/torch_tensorrt/runtime/_output_allocator.py @@ -2,7 +2,7 @@ from typing import Any, Union import torch -from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule +from torch_tensorrt.dynamo.runtime import TorchTensorRTModule from 
torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import ( CudaGraphsTorchTensorRTModule, ) @@ -24,9 +24,7 @@ def __init__( rt_mods = [] for name, rt_mod in module.named_children(): - if "_run_on_acc" in name and isinstance( - rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule) - ): + if "_run_on_acc" in name and isinstance(rt_mod, TorchTensorRTModule): rt_mods.append(rt_mod) self.rt_mods = rt_mods diff --git a/py/torch_tensorrt/runtime/_pre_allocated_outputs.py b/py/torch_tensorrt/runtime/_pre_allocated_outputs.py index c392c38838..7c3629b28a 100644 --- a/py/torch_tensorrt/runtime/_pre_allocated_outputs.py +++ b/py/torch_tensorrt/runtime/_pre_allocated_outputs.py @@ -2,7 +2,7 @@ from typing import Any import torch -from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule +from torch_tensorrt.dynamo.runtime import TorchTensorRTModule logger = logging.getLogger(__name__) @@ -15,9 +15,7 @@ class _PreAllocatedOutputContextManager(object): def __init__(self, module: torch.fx.GraphModule) -> None: rt_mods = [] for name, rt_mod in module.named_children(): - if "_run_on_acc" in name and isinstance( - rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule) - ): + if "_run_on_acc" in name and isinstance(rt_mod, TorchTensorRTModule): rt_mods.append(rt_mod) self.rt_mods = rt_mods diff --git a/py/torch_tensorrt/runtime/_utils.py b/py/torch_tensorrt/runtime/_utils.py index bc2e5a6a70..929d88f8af 100644 --- a/py/torch_tensorrt/runtime/_utils.py +++ b/py/torch_tensorrt/runtime/_utils.py @@ -1,9 +1,18 @@ import logging -from typing import List, Optional, Tuple +from typing import Optional, Protocol, Tuple import torch import torch_tensorrt + +class _ComparableDeviceProps(Protocol): + """Enough for multi-device checks; may be ``_CudaDeviceProperties`` or a simple namespace.""" + + major: int + minor: int + name: object + + logger = logging.getLogger(__name__) @@ -27,7 +36,7 @@ def _is_switch_required( curr_device_id: int, 
engine_device_id: int, curr_device_properties: torch._C._CudaDeviceProperties, - engine_device_properties: torch._C._CudaDeviceProperties, + engine_device_properties: _ComparableDeviceProps, ) -> bool: """Determines whether a device switch is required based on input device parameters""" # Device Capabilities disagree @@ -66,7 +75,7 @@ def _is_switch_required( def _select_rt_device( curr_device_id: int, engine_device_id: int, - engine_device_properties: torch._C._CudaDeviceProperties, + engine_device_properties: _ComparableDeviceProps, ) -> Tuple[int, torch._C._CudaDeviceProperties]: """Wraps compatible device check and raises error if none are found""" new_target_device_opt = _get_most_compatible_device( @@ -83,7 +92,7 @@ def _select_rt_device( def _get_most_compatible_device( curr_device_id: int, engine_device_id: int, - engine_device_properties: torch._C._CudaDeviceProperties, + engine_device_properties: _ComparableDeviceProps, ) -> Optional[Tuple[int, torch._C._CudaDeviceProperties]]: """Selects a runtime device based on compatibility checks""" all_devices = [ diff --git a/py/torch_tensorrt/runtime/_weight_streaming.py b/py/torch_tensorrt/runtime/_weight_streaming.py index 0874d31d11..d294da1731 100755 --- a/py/torch_tensorrt/runtime/_weight_streaming.py +++ b/py/torch_tensorrt/runtime/_weight_streaming.py @@ -2,7 +2,7 @@ from typing import Any, Union import torch -from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule +from torch_tensorrt.dynamo.runtime import TorchTensorRTModule from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import ( CudaGraphsTorchTensorRTModule, ) @@ -26,9 +26,7 @@ def __init__( self.cuda_graphs_module = module module = module.compiled_module for name, rt_mod in module.named_children(): - if "_run_on_acc" in name and isinstance( - rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule) - ): + if "_run_on_acc" in name and isinstance(rt_mod, TorchTensorRTModule): rt_mods.append((name, 
rt_mod)) self.current_device_budget += rt_mod.get_device_memory_budget() self.streamable_budget = [ diff --git a/tests/py/dynamo/backend/test_backend_compiler.py b/tests/py/dynamo/backend/test_backend_compiler.py index 6369d3805c..c57ef99ca1 100644 --- a/tests/py/dynamo/backend/test_backend_compiler.py +++ b/tests/py/dynamo/backend/test_backend_compiler.py @@ -49,7 +49,6 @@ def forward(self, x, y): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.add.Tensor"}, - use_python_runtime=False, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -129,7 +128,6 @@ def forward(self, x, y): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.add.Tensor"}, - use_python_runtime=False, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = model(*inputs).detach().cpu() @@ -170,7 +168,6 @@ def forward(self, x, y): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=False, optimization_level=4, version_compatible=True, max_aux_streams=5, diff --git a/tests/py/dynamo/conversion/test_index_put_aten.py b/tests/py/dynamo/conversion/test_index_put_aten.py index a7b58d0d3c..58fce5172f 100644 --- a/tests/py/dynamo/conversion/test_index_put_aten.py +++ b/tests/py/dynamo/conversion/test_index_put_aten.py @@ -353,7 +353,6 @@ def forward(self, source_tensor, indices_tensor, value_tensor): use_explicit_typing=False, use_fp32_acc=False, disable_tf32=True, - use_python_runtime=True, ) result = trt_engine(source_tensor, indices_tensor, value_tensor) diff --git a/tests/py/dynamo/distributed/test_distributed_simple_example.py b/tests/py/dynamo/distributed/test_distributed_simple_example.py index 202469e2ea..3aadfa75c3 100644 --- a/tests/py/dynamo/distributed/test_distributed_simple_example.py +++ b/tests/py/dynamo/distributed/test_distributed_simple_example.py @@ -68,7 +68,6 @@ def 
forward(self, x): options={ "truncate_long_and_double": True, "enabled_precisions": {torch.float32, torch.float16}, - "use_python_runtime": True, "min_block_size": 1, "use_distributed_mode_trace": True, }, diff --git a/tests/py/dynamo/lowering/test_aten_lowering_passes.py b/tests/py/dynamo/lowering/test_aten_lowering_passes.py index ccfbf06268..51b75dc98d 100644 --- a/tests/py/dynamo/lowering/test_aten_lowering_passes.py +++ b/tests/py/dynamo/lowering/test_aten_lowering_passes.py @@ -271,7 +271,7 @@ def forward(self, x: torch.Tensor): trt_module = torch.compile( model, backend="tensorrt", - options={"use_python_runtime": False, "min_block_size": 1}, + options={"min_block_size": 1}, ) out = trt_module(inputs) # if the model can be successfully compiled, we regard the test as passed diff --git a/tests/py/dynamo/models/test_autocast.py b/tests/py/dynamo/models/test_autocast.py index f3677935ff..eabefa75be 100644 --- a/tests/py/dynamo/models/test_autocast.py +++ b/tests/py/dynamo/models/test_autocast.py @@ -52,7 +52,6 @@ def forward(self, x): ep.module(), arg_inputs=inputs, min_block_size=1, - use_python_runtime=True, use_explicit_typing=True, enable_autocast=True, autocast_low_precision_type=torch.float16, @@ -143,7 +142,6 @@ def forward(self, x): ep, arg_inputs=inputs, min_block_size=1, - use_python_runtime=True, use_explicit_typing=True, # Torch-TensorRT's autocast doesn't affect layers inside Pytorch autocast enable_autocast=True, @@ -223,7 +221,6 @@ def forward(self, x): ep.module(), arg_inputs=inputs, min_block_size=1, - use_python_runtime=False, use_explicit_typing=True, # Torch-TensorRT's autocast doesn't affect layers inside Pytorch autocast enable_autocast=True, @@ -331,7 +328,6 @@ def forward(self, x, y): ep, arg_inputs=inputs, min_block_size=1, - use_python_runtime=False, use_explicit_typing=True, # Torch-TensorRT's autocast doesn't affect layers inside Pytorch autocast enable_autocast=True, diff --git a/tests/py/dynamo/models/test_dtype_support.py 
b/tests/py/dynamo/models/test_dtype_support.py index 6c02db6b68..f638447b8e 100644 --- a/tests/py/dynamo/models/test_dtype_support.py +++ b/tests/py/dynamo/models/test_dtype_support.py @@ -41,7 +41,6 @@ def forward(self, x): pass_through_build_failures=True, truncate_double=True, min_block_size=1, - use_python_runtime=False, cache_built_engines=False, reuse_cached_engines=False, use_explicit_typing=True, @@ -82,7 +81,6 @@ def forward(self, x): pass_through_build_failures=True, truncate_double=True, min_block_size=1, - use_python_runtime=True, cache_built_engines=False, reuse_cached_engines=False, use_explicit_typing=True, @@ -129,7 +127,6 @@ def forward(self, x): pass_through_build_failures=True, truncate_double=False, min_block_size=1, - use_python_runtime=False, cache_built_engines=False, reuse_cached_engines=False, use_explicit_typing=True, @@ -171,7 +168,6 @@ def forward(self, x): pass_through_build_failures=True, truncate_double=False, min_block_size=1, - use_python_runtime=True, cache_built_engines=False, reuse_cached_engines=False, use_explicit_typing=True, @@ -230,7 +226,6 @@ def forward(self, x): inputs=[in_tensor], pass_through_build_failures=True, min_block_size=1, - use_python_runtime=False, cache_built_engines=False, reuse_cached_engines=False, use_explicit_typing=True, @@ -270,7 +265,6 @@ def forward(self, x): inputs=[in_tensor], pass_through_build_failures=True, min_block_size=1, - use_python_runtime=True, cache_built_engines=False, reuse_cached_engines=False, use_explicit_typing=True, diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index f17c375489..4ee09617d2 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -234,7 +234,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=True, enabled_precisions={torch.float}, min_block_size=1, 
immutable_weights=False, @@ -309,7 +308,6 @@ def test_dynamo_compile_with_custom_engine_cache(self): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=True, use_explicit_typing=False, enabled_precisions={torch.float}, min_block_size=1, @@ -368,7 +366,6 @@ def test_dynamo_compile_change_input_shape(self): trt_gm = torch_trt.dynamo.compile( torch.export.export(model, args=inputs), inputs=inputs, - use_python_runtime=False, enabled_precisions={torch.float}, min_block_size=1, immutable_weights=False, @@ -429,7 +426,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): model, backend="tensorrt", options={ - "use_python_runtime": False, "use_explicit_typing": False, "enabled_precisions": {torch.float}, "min_block_size": 1, @@ -495,7 +491,6 @@ def test_torch_compile_with_custom_engine_cache(self): model, backend="tensorrt", options={ - "use_python_runtime": False, "use_explicit_typing": False, "enabled_precisions": {torch.float}, "min_block_size": 1, @@ -552,7 +547,6 @@ def test_torch_trt_compile_change_input_shape(self): model, inputs=inputs, **{ - "use_python_runtime": True, "use_explicit_typing": False, "enabled_precisions": {torch.float}, "min_block_size": 1, @@ -595,7 +589,6 @@ def forward(self, x): model, backend="tensorrt", options={ - "use_python_runtime": True, "use_explicit_typing": False, "enabled_precisions": {torch.float}, "min_block_size": 1, @@ -696,7 +689,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): trt_gm = torch_trt.dynamo.compile( exp_program, inputs, - use_python_runtime=True, enabled_precisions={torch.float}, min_block_size=1, immutable_weights=False, @@ -748,7 +740,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=True, enabled_precisions={torch.float}, min_block_size=1, cache_built_engines=cache_built_engines, @@ -925,7 +916,6 @@ def remove_timing_cache(path=timing_cache_path): trt_gm = torch_trt.dynamo.compile( 
llama2_ep, inputs=[input_ids], - use_python_runtime=True, enabled_precisions={torch.float32}, min_block_size=1, immutable_weights=False, @@ -978,7 +968,6 @@ def remove_timing_cache(path=timing_cache_path): trt_gm = torch_trt.dynamo.compile( llama2_ep, inputs=[input_ids], - use_python_runtime=True, enabled_precisions={torch.float32}, min_block_size=1, truncate_double=True, diff --git a/tests/py/dynamo/models/test_model_refit.py b/tests/py/dynamo/models/test_model_refit.py index 1bdbd2dc60..627acd352e 100644 --- a/tests/py/dynamo/models/test_model_refit.py +++ b/tests/py/dynamo/models/test_model_refit.py @@ -3,6 +3,7 @@ import unittest import pytest +import tensorrt as trt import torch import torch.nn.functional as F import torch_tensorrt as torchtrt @@ -20,8 +21,6 @@ ) from torch_tensorrt.logging import TRT_LOGGER -import tensorrt as trt - assertions = unittest.TestCase() if importlib.util.find_spec("torchvision"): @@ -51,19 +50,17 @@ def test_mapping(): ] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("cpp"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) settings = trt_gm._run_on_acc_0.settings runtime = trt.Runtime(TRT_LOGGER) @@ -117,19 +114,17 @@ def forward(self, x): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = 
torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("python"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -182,19 +177,17 @@ def forward(self, x): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("python"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -247,19 +240,17 @@ def forward(self, x): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("python"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + 
immutable_weights=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -303,19 +294,17 @@ def test_refit_one_engine_with_weightmap(): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("python"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -359,19 +348,17 @@ def test_refit_one_engine_no_map_with_weightmap(): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("cpp"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) trt_gm._run_on_acc_0.weight_name_map = None @@ -416,19 +403,17 @@ def test_refit_one_engine_with_wrong_weightmap(): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = 
torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("python"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) # Manually Deleted all batch norm layer. This suppose to fail the fast refit trt_gm._run_on_acc_0.weight_name_map = { k: v @@ -482,19 +467,17 @@ def test_refit_one_engine_bert_with_weightmap(): nn.init.xavier_normal_(model2.embeddings.word_embeddings.weight) enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("python"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -543,19 +526,17 @@ def test_refit_one_engine_inline_runtime_with_weightmap(tmpdir): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs), strict=False) exp_program2 = torch.export.export(model2, tuple(inputs), strict=False) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - 
immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("cpp"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) torchtrt.save(trt_gm, trt_ep_path, arg_inputs=inputs, retrace=True) trt_gm = torch.export.load(trt_ep_path) @@ -597,19 +578,17 @@ def test_refit_one_engine_python_runtime_with_weightmap(): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("python"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -668,22 +647,20 @@ def forward(self, x): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) torch_executed_ops = {"torch.ops.aten.convolution.default"} - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - torch_executed_ops=torch_executed_ops, - reuse_cached_engines=False, - ) + with torchtrt.runtime.set_runtime_backend("cpp"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + 
min_block_size=min_block_size, + immutable_weights=False, + torch_executed_ops=torch_executed_ops, + reuse_cached_engines=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -734,23 +711,21 @@ def forward(self, x): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) torch_executed_ops = {"torch.ops.aten.convolution.default"} - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - torch_executed_ops=torch_executed_ops, - reuse_cached_engines=False, - offload_module_to_cpu=True, - ) + with torchtrt.runtime.set_runtime_backend("cpp"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + torch_executed_ops=torch_executed_ops, + reuse_cached_engines=False, + offload_module_to_cpu=True, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -792,19 +767,17 @@ def test_refit_one_engine_without_weightmap(): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("cpp"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + 
immutable_weights=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -852,19 +825,17 @@ def test_refit_one_engine_bert_without_weightmap(): nn.init.xavier_normal_(model2.embeddings.word_embeddings.weight) enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("cpp"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -912,19 +883,17 @@ def test_refit_one_engine_inline_runtime_without_weightmap(tmpdir): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("cpp"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) torchtrt.save(trt_gm, trt_ep_path, arg_inputs=inputs) trt_gm = torch.export.load(trt_ep_path) new_trt_gm = refit_module_weights( @@ -964,19 +933,17 @@ def test_refit_one_engine_python_runtime_without_weightmap(): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = 
{torch.float} min_block_size = 1 - use_python_runtime = True - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - ) + with torchtrt.runtime.set_runtime_backend("python"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, @@ -1035,22 +1002,20 @@ def forward(self, x): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} min_block_size = 1 - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) torch_executed_ops = {"torch.ops.aten.convolution.default"} - trt_gm = torchtrt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - enabled_precisions=enabled_precisions, - min_block_size=min_block_size, - immutable_weights=False, - torch_executed_ops=torch_executed_ops, - reuse_cached_engines=False, - ) + with torchtrt.runtime.set_runtime_backend("cpp"): + trt_gm = torchtrt.dynamo.compile( + exp_program, + tuple(inputs), + enabled_precisions=enabled_precisions, + min_block_size=min_block_size, + immutable_weights=False, + torch_executed_ops=torch_executed_ops, + reuse_cached_engines=False, + ) new_trt_gm = refit_module_weights( compiled_module=trt_gm, diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index b1435540e0..74b4dd8287 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -508,18 +508,18 @@ def forward( dynamic_shapes=({2: num_latent_frames},), # Make dimension 2 dynamic 
strict=False, ) - trt_model = torchtrt.dynamo.compile( - ep, - inputs=(hidden_states,), - enabled_precisions={torch.bfloat16}, - use_explicit_typing=False, - use_fp32_acc=False, - device="cuda:0", - disable_tf32=True, - use_python_runtime=True, - min_block_size=1, - ) - trt_output = trt_model(hidden_states) + with torchtrt.runtime.set_runtime_backend("python"): + trt_model = torchtrt.dynamo.compile( + ep, + inputs=(hidden_states,), + enabled_precisions={torch.bfloat16}, + use_explicit_typing=False, + use_fp32_acc=False, + device="cuda:0", + disable_tf32=True, + min_block_size=1, + ) + trt_output = trt_model(hidden_states) cos_sim = cosine_similarity(pyt_output, trt_output) assertions.assertTrue( diff --git a/tests/py/dynamo/models/test_symint_scalar_input.py b/tests/py/dynamo/models/test_symint_scalar_input.py index e6861b193f..7d2b52dda7 100644 --- a/tests/py/dynamo/models/test_symint_scalar_input.py +++ b/tests/py/dynamo/models/test_symint_scalar_input.py @@ -20,8 +20,8 @@ @pytest.mark.unit -@pytest.mark.parametrize("use_python_runtime", [True, False]) -def test_symint_from_size_used_in_reshape(use_python_runtime): +@pytest.mark.parametrize("runtime_backend", ["python", "cpp"]) +def test_symint_from_size_used_in_reshape(runtime_backend): """ Test that a SymInt derived from tensor.size(0) can be used in reshape when it becomes a scalar placeholder input to the TRT subgraph. 
@@ -49,10 +49,10 @@ def forward(self, x, targets): "enabled_precisions": {torch.float}, "min_block_size": 1, "pass_through_build_failures": True, - "use_python_runtime": use_python_runtime, } - trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) + with torchtrt.runtime.set_runtime_backend(runtime_backend): + trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) output_ref = model(x, targets) output_trt = trt_model(x, targets) @@ -60,15 +60,15 @@ def forward(self, x, targets): cos_sim = cosine_similarity(output_ref, output_trt) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"SymInt reshape test (python_runtime={use_python_runtime}) failed. Cosine sim: {cos_sim}", + msg=f"SymInt reshape test (runtime_backend={runtime_backend}) failed. Cosine sim: {cos_sim}", ) torch._dynamo.reset() @pytest.mark.unit -@pytest.mark.parametrize("use_python_runtime", [True, False]) -def test_scalar_tensor_input(use_python_runtime): +@pytest.mark.parametrize("runtime_backend", ["python", "cpp"]) +def test_scalar_tensor_input(runtime_backend): """ Test that a 0-dim scalar tensor input (e.g., cache_length) is handled correctly during symbolic shape extraction and TRT compilation. @@ -87,10 +87,10 @@ def forward(self, x, offset): "enabled_precisions": {torch.float}, "min_block_size": 1, "pass_through_build_failures": True, - "use_python_runtime": use_python_runtime, } - trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) + with torchtrt.runtime.set_runtime_backend(runtime_backend): + trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) output_ref = model(x, offset) output_trt = trt_model(x, offset) @@ -98,15 +98,15 @@ def forward(self, x, offset): cos_sim = cosine_similarity(output_ref, output_trt) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"Scalar tensor input test (python_runtime={use_python_runtime}) failed. 
Cosine sim: {cos_sim}", + msg=f"Scalar tensor input test (runtime_backend={runtime_backend}) failed. Cosine sim: {cos_sim}", ) torch._dynamo.reset() @pytest.mark.unit -@pytest.mark.parametrize("use_python_runtime", [True, False]) -def test_symint_with_index_and_reshape(use_python_runtime): +@pytest.mark.parametrize("runtime_backend", ["python", "cpp"]) +def test_symint_with_index_and_reshape(runtime_backend): """ Full reproduction of issue #4107 pattern: symbolic size from int64 tensor, used with index operation and reshape. @@ -141,10 +141,10 @@ def forward(self, x, targets, cache_length): "min_block_size": 1, "truncate_double": True, "pass_through_build_failures": True, - "use_python_runtime": use_python_runtime, } - trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) + with torchtrt.runtime.set_runtime_backend(runtime_backend): + trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) output_ref = model(x, targets, cache_length) output_trt = trt_model(x, targets, cache_length) @@ -152,15 +152,15 @@ def forward(self, x, targets, cache_length): cos_sim = cosine_similarity(output_ref, output_trt) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"Issue 4107 repro test (python_runtime={use_python_runtime}) failed. Cosine sim: {cos_sim}", + msg=f"Issue 4107 repro test (runtime_backend={runtime_backend}) failed. Cosine sim: {cos_sim}", ) torch._dynamo.reset() @pytest.mark.unit -@pytest.mark.parametrize("use_python_runtime", [True, False]) -def test_symint_with_different_batch_sizes(use_python_runtime): +@pytest.mark.parametrize("runtime_backend", ["python", "cpp"]) +def test_symint_with_different_batch_sizes(runtime_backend): """ Test that after compilation with a SymInt scalar input, the model produces correct results with different batch sizes. 
@@ -183,10 +183,10 @@ def forward(self, x, targets): "enabled_precisions": {torch.float}, "min_block_size": 1, "pass_through_build_failures": True, - "use_python_runtime": use_python_runtime, } - trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) + with torchtrt.runtime.set_runtime_backend(runtime_backend): + trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) for batch_size in [4, 8, 16]: x_test = torch.randn(batch_size, 64).cuda() @@ -198,7 +198,7 @@ def forward(self, x, targets): cos_sim = cosine_similarity(output_ref, output_trt) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"Varying batch size test (python_runtime={use_python_runtime}) failed at B={batch_size}. Cosine sim: {cos_sim}", + msg=f"Varying batch size test (runtime_backend={runtime_backend}) failed at B={batch_size}. Cosine sim: {cos_sim}", ) torch._dynamo.reset() diff --git a/tests/py/dynamo/models/test_weight_stripped_engine.py b/tests/py/dynamo/models/test_weight_stripped_engine.py index 6bf1b58f71..42c5235cf6 100644 --- a/tests/py/dynamo/models/test_weight_stripped_engine.py +++ b/tests/py/dynamo/models/test_weight_stripped_engine.py @@ -34,7 +34,6 @@ def test_three_ways_to_compile(self): exp_program = torch.export.export(pyt_model, example_inputs) settings = { - "use_python_runtime": False, "enabled_precisions": {torch.float}, "min_block_size": 1, "immutable_weights": False, @@ -84,7 +83,6 @@ def test_compile_weight_stripped_engine(self): example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) settings = { - "use_python_runtime": False, "enabled_precisions": {torch.float}, "min_block_size": 1, "immutable_weights": False, @@ -171,7 +169,6 @@ def test_weight_stripped_engine_results(self): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=True, enabled_precisions={torch.float}, min_block_size=1, immutable_weights=False, @@ -196,7 +193,6 @@ def test_weight_stripped_engine_results(self): pyt_model, 
backend="tensorrt", options={ - "use_python_runtime": False, "enabled_precisions": {torch.float}, "min_block_size": 1, "immutable_weights": False, @@ -243,7 +239,6 @@ def test_engine_caching_saves_weight_stripped_engine(self): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(example_inputs), - use_python_runtime=True, enabled_precisions={torch.float}, min_block_size=1, immutable_weights=False, @@ -321,7 +316,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), - use_python_runtime=True, enabled_precisions={torch.float}, min_block_size=1, immutable_weights=False, @@ -405,7 +399,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): pyt_model, backend="tensorrt", options={ - "use_python_runtime": False, "enabled_precisions": {torch.float}, "min_block_size": 1, "immutable_weights": False, @@ -482,7 +475,6 @@ def forward(self, x): pyt_model, backend="tensorrt", options={ - "use_python_runtime": True, "enabled_precisions": {torch.float}, "min_block_size": 1, "immutable_weights": False, @@ -525,7 +517,6 @@ def forward(self, x): inputs=tuple(inputs), min_block_size=1, immutable_weights=False, - use_python_runtime=True, strip_engine_weights=True, refit_identical_engine_weights=False, ) @@ -566,15 +557,16 @@ def test_two_TRTRuntime_in_refitting(self): else: use_python_runtime = False - trt_gm = torch_trt.dynamo.compile( - exp_program, - tuple(inputs), - use_python_runtime=use_python_runtime, - min_block_size=1, - immutable_weights=False, - strip_engine_weights=True, - refit_identical_engine_weights=False, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_trt.runtime.set_runtime_backend(backend): + trt_gm = torch_trt.dynamo.compile( + exp_program, + tuple(inputs), + min_block_size=1, + immutable_weights=False, + strip_engine_weights=True, + refit_identical_engine_weights=False, + ) output = trt_gm(*inputs) assertions.assertEqual(output.sum(), 0, msg="results should be all zeros") @@ 
-608,7 +600,6 @@ def test_refit_identical_engine_weights(self): trt_gm = torch_trt.dynamo.compile( exp_program, tuple(example_inputs), - use_python_runtime=True, enabled_precisions={torch.float}, min_block_size=1, immutable_weights=False, @@ -661,7 +652,6 @@ def test_refit_weight_stripped_engine_multiple_times(self): trt_gm = torch_trt.dynamo.compile( exp_program, inputs, - use_python_runtime=True, enabled_precisions={torch.float}, min_block_size=1, immutable_weights=False, @@ -702,7 +692,6 @@ def test_refit_weight_stripped_engine_multiple_times(self): pyt_model, backend="tensorrt", options={ - "use_python_runtime": False, "enabled_precisions": {torch.float}, "min_block_size": 1, "immutable_weights": False, diff --git a/tests/py/dynamo/partitioning/test_000_resource_partitioning.py b/tests/py/dynamo/partitioning/test_000_resource_partitioning.py index e7d7f4a390..bfa01c8e41 100644 --- a/tests/py/dynamo/partitioning/test_000_resource_partitioning.py +++ b/tests/py/dynamo/partitioning/test_000_resource_partitioning.py @@ -45,12 +45,10 @@ def forward(self, x): inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] enabled_precisions = {torch.float} - use_python_runtime = False exp_program = torch.export.export(model, tuple(inputs)) compilation_options = { - "use_python_runtime": use_python_runtime, "enabled_precisions": enabled_precisions, "min_block_size": 1, "immutable_weights": True, diff --git a/tests/py/dynamo/partitioning/test_001_resource_partitioning.py b/tests/py/dynamo/partitioning/test_001_resource_partitioning.py index e7d62c7a8d..c76a17aded 100644 --- a/tests/py/dynamo/partitioning/test_001_resource_partitioning.py +++ b/tests/py/dynamo/partitioning/test_001_resource_partitioning.py @@ -58,12 +58,9 @@ def forward(self, x): inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")] enabled_precisions = {torch.float} - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) compilation_options = { - "use_python_runtime": 
use_python_runtime, "enabled_precisions": enabled_precisions, "min_block_size": 1, "immutable_weights": True, @@ -144,12 +141,9 @@ def forward(self, x): inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")] enabled_precisions = {torch.float} - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) compilation_options = { - "use_python_runtime": use_python_runtime, "enabled_precisions": enabled_precisions, "min_block_size": 1, "immutable_weights": True, @@ -281,12 +275,9 @@ def forward(self, x): inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")] enabled_precisions = {torch.float} - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) compilation_options = { - "use_python_runtime": use_python_runtime, "enabled_precisions": enabled_precisions, "min_block_size": 1, "immutable_weights": True, @@ -385,12 +376,9 @@ def forward(self, x): inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")] enabled_precisions = {torch.float} - use_python_runtime = False - exp_program = torch.export.export(model, tuple(inputs)) compilation_options = { - "use_python_runtime": use_python_runtime, "enabled_precisions": enabled_precisions, "min_block_size": 1, "immutable_weights": True, diff --git a/tests/py/dynamo/runtime/test_000_python_runtime.py b/tests/py/dynamo/runtime/test_000_python_runtime.py index 0f94f6a704..0d0ace20aa 100644 --- a/tests/py/dynamo/runtime/test_000_python_runtime.py +++ b/tests/py/dynamo/runtime/test_000_python_runtime.py @@ -26,7 +26,6 @@ def forward(self, x): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -57,7 +56,6 @@ def forward(self, x, y): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = 
fx_graph(*inputs).detach().cpu() @@ -92,7 +90,6 @@ def forward(self, x, y): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, offload_module_to_cpu=True, ) fx_graph.cuda() diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py b/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py index 622c928885..6b73d8c7e5 100644 --- a/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py +++ b/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py @@ -61,7 +61,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=False, ) result_samples = [] @@ -103,7 +102,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=False, offload_module_to_cpu=True, ) optimized_model.cuda() @@ -148,7 +146,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=False, ) result_samples = [] @@ -191,7 +188,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=False, offload_module_to_cpu=True, ) optimized_model.cuda() @@ -244,7 +240,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=False, ) result_samples = [] diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py index 2c7806c310..3d128dda00 100644 --- a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py +++ b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py @@ -37,7 +37,6 @@ def forward(self, input): "torch_compile", inputs, min_block_size=1, - use_python_runtime=True, ) with torch_tensorrt.runtime.enable_cudagraphs(optimized_model) as _: self.assertTrue(torch_tensorrt.runtime.get_cudagraphs_mode()) @@ -60,7 +59,6 @@ def forward(self, x): inputs[0], 
min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, ) result_samples = [] @@ -104,7 +102,6 @@ def forward(self, x): inputs[0], min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, offload_module_to_cpu=True, ) optimized_model.cuda() @@ -150,7 +147,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=True, ) result_samples = [] @@ -194,7 +190,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=True, offload_module_to_cpu=True, ) optimized_model.cuda() @@ -246,7 +241,6 @@ def forward(self, x): min_block_size=1, pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=True, ) result_samples = [] diff --git a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py index 539c11a303..8b0ad9cc14 100644 --- a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py +++ b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py @@ -226,7 +226,6 @@ def test_lazy_engine_init_py_e2e(self): "min_block_size": 1, "ir": "dynamo", "lazy_engine_init": True, - "use_python_runtime": True, "cache_built_engines": False, "reuse_cached_engines": False, } @@ -266,7 +265,6 @@ def test_lazy_engine_init_cpp_e2e(self): "min_block_size": 1, "ir": "dynamo", "lazy_engine_init": True, - "use_python_runtime": False, "cache_built_engines": False, "reuse_cached_engines": False, } @@ -306,7 +304,6 @@ def test_lazy_engine_init_cpp_serialization(self): "min_block_size": 1, "ir": "dynamo", "lazy_engine_init": True, - "use_python_runtime": False, "cache_built_engines": False, "reuse_cached_engines": False, } @@ -355,7 +352,6 @@ def forward(self, a, b): "min_block_size": 1, "ir": "dynamo", "lazy_engine_init": True, - "use_python_runtime": True, "torch_executed_ops": 
{"torch.ops.aten.sub.Tensor"}, "cache_built_engines": False, "reuse_cached_engines": False, @@ -399,7 +395,6 @@ def forward(self, a, b): "min_block_size": 1, "ir": "dynamo", "lazy_engine_init": True, - "use_python_runtime": False, "torch_executed_ops": {"torch.ops.aten.sub.Tensor"}, "cache_built_engines": False, "reuse_cached_engines": False, diff --git a/tests/py/dynamo/runtime/test_003_safe_mode.py b/tests/py/dynamo/runtime/test_003_safe_mode.py index 0fde0773ed..d144725081 100644 --- a/tests/py/dynamo/runtime/test_003_safe_mode.py +++ b/tests/py/dynamo/runtime/test_003_safe_mode.py @@ -46,7 +46,6 @@ def forward(self, x): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -90,7 +89,6 @@ def forward(self, x): inputs, min_block_size=1, pass_through_build_failures=True, - use_python_runtime=False, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() diff --git a/tests/py/dynamo/runtime/test_004_weight_streaming.py b/tests/py/dynamo/runtime/test_004_weight_streaming.py index 5954a7d4d4..146043ce2f 100644 --- a/tests/py/dynamo/runtime/test_004_weight_streaming.py +++ b/tests/py/dynamo/runtime/test_004_weight_streaming.py @@ -43,16 +43,17 @@ def test_weight_streaming_default(self, _, use_python_runtime): model = SampleModel().eval().cuda() input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()] exp_program = torch.export.export(model, tuple(input)) - optimized_model = torchtrt.dynamo.compile( - exp_program, - inputs=input, - min_block_size=1, - cache_built_engines=False, - reuse_cached_engines=False, - use_python_runtime=use_python_runtime, - use_explicit_typing=True, - enable_weight_streaming=True, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + optimized_model = 
torchtrt.dynamo.compile( + exp_program, + inputs=input, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, + use_explicit_typing=True, + enable_weight_streaming=True, + ) # Checking if default weight streaming budget(automatic) is applied when compiler option was provided weight_streaming_ctx = torchtrt.runtime.weight_streaming(optimized_model) assert weight_streaming_ctx.device_budget > 0 @@ -94,16 +95,17 @@ def test_weight_streaming_manual(self, _, use_python_runtime): model = SampleModel().eval().cuda() input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()] exp_program = torch.export.export(model, tuple(input)) - optimized_model = torchtrt.dynamo.compile( - exp_program, - inputs=input, - min_block_size=1, - cache_built_engines=False, - reuse_cached_engines=False, - use_python_runtime=use_python_runtime, - use_explicit_typing=True, - enable_weight_streaming=True, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + optimized_model = torchtrt.dynamo.compile( + exp_program, + inputs=input, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, + use_explicit_typing=True, + enable_weight_streaming=True, + ) # Weight streaming budget is applied manually. 
with torchtrt.runtime.weight_streaming(optimized_model) as weight_streaming_ctx: streamable_budget = weight_streaming_ctx.total_device_budget @@ -154,19 +156,20 @@ def test_weight_streaming_invalid_usage(self, _, use_python_runtime, multi_rt): model = SampleModel().eval().cuda() input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()] exp_program = torch.export.export(model, tuple(input)) - optimized_model = torchtrt.dynamo.compile( - exp_program, - inputs=input, - min_block_size=1, - cache_built_engines=False, - reuse_cached_engines=False, - torch_executed_ops=( - {"torch.ops.aten.convolution.default"} if multi_rt else {} - ), - use_python_runtime=use_python_runtime, - use_explicit_typing=True, - enable_weight_streaming=True, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + optimized_model = torchtrt.dynamo.compile( + exp_program, + inputs=input, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, + torch_executed_ops=( + {"torch.ops.aten.convolution.default"} if multi_rt else {} + ), + use_explicit_typing=True, + enable_weight_streaming=True, + ) # Setting weight streaming context to unsupported module with torchtrt.runtime.weight_streaming(model) as weight_streaming_ctx: @@ -202,17 +205,18 @@ def test_weight_streaming_multi_rt(self, _, use_python_runtime): input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()] exp_program = torch.export.export(model, tuple(input)) - optimized_model = torchtrt.dynamo.compile( - exp_program, - inputs=input, - min_block_size=1, - cache_built_engines=False, - reuse_cached_engines=False, - torch_executed_ops={"torch.ops.aten.convolution.default"}, - use_python_runtime=use_python_runtime, - use_explicit_typing=True, - enable_weight_streaming=True, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + optimized_model = torchtrt.dynamo.compile( + exp_program, + 
inputs=input, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, + torch_executed_ops={"torch.ops.aten.convolution.default"}, + use_explicit_typing=True, + enable_weight_streaming=True, + ) with torchtrt.runtime.weight_streaming(optimized_model) as weight_streaming_ctx: streamable_budget = weight_streaming_ctx.total_device_budget @@ -247,17 +251,18 @@ def test_weight_streaming_cudagraphs(self, _, use_python_runtime): input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()] exp_program = torch.export.export(model, tuple(input)) - optimized_model = torchtrt.dynamo.compile( - exp_program, - inputs=input, - min_block_size=1, - cache_built_engines=False, - reuse_cached_engines=False, - torch_executed_ops={"torch.ops.aten.convolution.default"}, - use_python_runtime=use_python_runtime, - use_explicit_typing=True, - enable_weight_streaming=True, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + optimized_model = torchtrt.dynamo.compile( + exp_program, + inputs=input, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, + torch_executed_ops={"torch.ops.aten.convolution.default"}, + use_explicit_typing=True, + enable_weight_streaming=True, + ) with torchtrt.runtime.enable_cudagraphs(optimized_model) as cudagraphs_module: with torchtrt.runtime.weight_streaming( @@ -356,13 +361,14 @@ def forward(self, x, b=None, c=None, d=None, e=[]): "use_explicit_typing": True, "enable_weight_streaming": True, "torch_executed_ops": {"torch.ops.aten.mul.Tensor"}, - "use_python_runtime": use_python_runtime, } - exp_program = torchtrt.dynamo.trace(model, **compile_spec) - optimized_model = torchtrt.dynamo.compile( - exp_program, - **compile_spec, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + exp_program = torchtrt.dynamo.trace(model, **compile_spec) + optimized_model = torchtrt.dynamo.compile( + 
exp_program, + **compile_spec, + ) # List of tuples representing different configurations for three features: # Cuda graphs, pre-allocated output buffer, weight streaming change diff --git a/tests/py/dynamo/runtime/test_005_dynamic_allocation.py b/tests/py/dynamo/runtime/test_005_dynamic_allocation.py index efdc13c284..0fa560a047 100644 --- a/tests/py/dynamo/runtime/test_005_dynamic_allocation.py +++ b/tests/py/dynamo/runtime/test_005_dynamic_allocation.py @@ -29,7 +29,6 @@ def forward(self, x): settings = { "ir": "dynamo", - "use_python_runtime": False, "enabled_precisions": {torch.float32}, "immutable_weights": False, "lazy_engine_init": True, diff --git a/tests/py/dynamo/runtime/test_empty_input.py b/tests/py/dynamo/runtime/test_empty_input.py index 793eafb82c..00168d0c21 100644 --- a/tests/py/dynamo/runtime/test_empty_input.py +++ b/tests/py/dynamo/runtime/test_empty_input.py @@ -122,13 +122,14 @@ def test_concat_empty_with_nonempty( inputs = [non_empty_input] # Compile with Torch-TensorRT - compiled_model = torchtrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + compiled_model = torchtrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) # Run reference model ref_out = model(*inputs) @@ -162,13 +163,14 @@ def test_concat_nonempty_with_empty(self, _, use_python_runtime, empty_shape): empty_input = torch.empty(empty_shape, dtype=torch.float).cuda() inputs = [non_empty_input, empty_input] - compiled_model = torchtrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + compiled_model = torchtrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) ref_out = model(*inputs) trt_out = compiled_model(*inputs) @@ -206,13 
+208,14 @@ def test_repeated_empty_tensor_no_leak_and_correct(self, _, use_python_runtime): non_empty_input = torch.randn((3, 4), dtype=torch.float).cuda() inputs = [empty_input, non_empty_input] - compiled_model = torchtrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + compiled_model = torchtrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) # Record initial GPU memory torch.cuda.synchronize() diff --git a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py index b2caa2551b..2a448c57b7 100644 --- a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py +++ b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py @@ -221,7 +221,6 @@ def test_resnet18(): inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] compile_spec = { - "use_python_runtime": False, "enabled_precisions": {torch.float32}, "immutable_weights": False, } @@ -268,7 +267,6 @@ def test_save(): # Compile the module for the first time and save it. 
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ compile_spec = { - "use_python_runtime": False, "enabled_precisions": {torch.float32}, "immutable_weights": False, } @@ -308,7 +306,6 @@ def test_resnet18_modify_attribute(): inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] compile_spec = { - "use_python_runtime": False, "enabled_precisions": {torch.float32}, "immutable_weights": False, } @@ -353,7 +350,6 @@ def test_resnet18_modify_attribute_no_refit(): inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] compile_spec = { - "use_python_runtime": False, "enabled_precisions": {torch.float32}, "immutable_weights": False, } diff --git a/tests/py/dynamo/runtime/test_output_allocator.py b/tests/py/dynamo/runtime/test_output_allocator.py index c94020705c..d1755b4017 100644 --- a/tests/py/dynamo/runtime/test_output_allocator.py +++ b/tests/py/dynamo/runtime/test_output_allocator.py @@ -57,13 +57,14 @@ class TestOutputAllocatorStaticModel(TestCase): def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): model = StaticModel().eval().cuda() inputs = [torch.randn((2, 3), dtype=torch.float).cuda()] - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) ref_out = model(*inputs) @@ -101,13 +102,14 @@ def test_default(self, _, use_python_runtime): """ model = StaticModel().eval().cuda() inputs = [torch.randn((2, 3), dtype=torch.float).cuda()] - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) 
standard_out = compiled_model(*inputs) ref_out = model(*inputs) @@ -127,13 +129,14 @@ def test_default(self, _, use_python_runtime): def test_combination_of_cg_and_oa(self, _, use_python_runtime): model = StaticModel().eval().cuda() inputs = [torch.randn((2, 3), dtype=torch.float).cuda()] - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) with pytest.raises( RuntimeError, @@ -170,13 +173,14 @@ class TestOutputAllocatorDDSModel(TestCase): def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): model = DDSModel().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) with pytest.raises( RuntimeError, @@ -211,13 +215,14 @@ def test_default(self, _, use_python_runtime): """ model = DDSModel().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) oa_out = compiled_model(*inputs) ref_out = model(*inputs) @@ -237,13 +242,14 @@ def test_default(self, _, use_python_runtime): def 
test_combination_of_cg_and_oa(self, _, use_python_runtime): model = DDSModel().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) with pytest.raises( RuntimeError, @@ -284,13 +290,14 @@ class TestOutputAllocatorDDSOpWithReductionOpModel(TestCase): def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): model = DDSOpWithReductionOpModel().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) with pytest.raises( RuntimeError, @@ -325,13 +332,14 @@ def test_default(self, _, use_python_runtime): """ model = DDSOpWithReductionOpModel().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) oa_out = compiled_model(*inputs) ref_out = model(*inputs) @@ -351,13 +359,14 @@ def test_default(self, _, use_python_runtime): def test_combination_of_cg_and_oa(self, _, use_python_runtime): model = 
DDSOpWithReductionOpModel().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + ) with pytest.raises( RuntimeError, @@ -394,14 +403,15 @@ class TestOutputAllocatorDDSModelWithGraphBreak(TestCase): def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): model = DDSModel2().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - torch_executed_ops={"torch.ops.aten.abs.default"}, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + torch_executed_ops={"torch.ops.aten.abs.default"}, + ) with pytest.raises( RuntimeError, @@ -436,14 +446,15 @@ def test_default(self, _, use_python_runtime): """ model = DDSModel2().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - torch_executed_ops={"torch.ops.aten.abs.default"}, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + torch_executed_ops={"torch.ops.aten.abs.default"}, + ) oa_out = compiled_model(*inputs) ref_out = model(*inputs) @@ -463,14 +474,15 @@ def 
test_default(self, _, use_python_runtime): def test_combination_of_cg_and_oa(self, _, use_python_runtime): model = DDSModel2().eval().cuda() inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) - compiled_model = torch_tensorrt.compile( - model, - "dynamo", - inputs, - min_block_size=1, - use_python_runtime=use_python_runtime, - torch_executed_ops={"torch.ops.aten.abs.default"}, - ) + backend = "python" if use_python_runtime else "cpp" + with torch_tensorrt.runtime.set_runtime_backend(backend): + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + torch_executed_ops={"torch.ops.aten.abs.default"}, + ) with pytest.raises( RuntimeError, diff --git a/tests/py/dynamo/runtime/test_pre_allocated_outputs.py b/tests/py/dynamo/runtime/test_pre_allocated_outputs.py index a9f8cfbbe5..88a85101af 100644 --- a/tests/py/dynamo/runtime/test_pre_allocated_outputs.py +++ b/tests/py/dynamo/runtime/test_pre_allocated_outputs.py @@ -24,14 +24,15 @@ def forward(self, x): fx_graph = torch.fx.symbolic_trace(model) # Validate that the results between Torch and Torch-TRT are similar - optimized_model = torchtrt.compile( - fx_graph, - "torch_compile", - inputs[0], - min_block_size=1, - pass_through_build_failures=True, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + optimized_model = torchtrt.compile( + fx_graph, + "torch_compile", + inputs[0], + min_block_size=1, + pass_through_build_failures=True, + ) ref_out_list = [] trt_out_list = [] @@ -74,15 +75,16 @@ def forward(self, x): ) fx_graph = torch.fx.symbolic_trace(SampleModel()) - optimized_model = torchtrt.compile( - fx_graph, - "dynamo", - inputs, - min_block_size=1, - pass_through_build_failures=True, - torch_executed_ops={"torch.ops.aten.mul.Tensor"}, - use_python_runtime=use_python_runtime, - ) + backend = "python" if use_python_runtime else "cpp" + with 
torchtrt.runtime.set_runtime_backend(backend): + optimized_model = torchtrt.compile( + fx_graph, + "dynamo", + inputs, + min_block_size=1, + pass_through_build_failures=True, + torch_executed_ops={"torch.ops.aten.mul.Tensor"}, + ) input_list = [] ref_out_list = [] @@ -135,15 +137,15 @@ def forward(self, x): fx_graph = torch.fx.symbolic_trace(model) # Validate that the results between Torch and Torch-TRT are similar - optimized_model = torchtrt.compile( - fx_graph, - "dynamo", - inputs[0], - min_block_size=1, - pass_through_build_failures=True, - use_python_runtime=True, - torch_executed_ops={torch.ops.aten.add.Tensor}, - ) + with torchtrt.runtime.set_runtime_backend("python"): + optimized_model = torchtrt.compile( + fx_graph, + "dynamo", + inputs[0], + min_block_size=1, + pass_through_build_failures=True, + torch_executed_ops={torch.ops.aten.add.Tensor}, + ) with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model): _ = optimized_model(inputs[0]) @@ -182,15 +184,16 @@ def forward(self, x): fx_graph = torch.fx.symbolic_trace(model) # Validate that the results between Torch and Torch-TRT are similar - optimized_model = torchtrt.compile( - fx_graph, - "dynamo", - inputs[0], - min_block_size=1, - pass_through_build_failures=True, - use_python_runtime=use_python_runtime, - torch_executed_ops={torch.ops.aten.add.Tensor}, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + optimized_model = torchtrt.compile( + fx_graph, + "dynamo", + inputs[0], + min_block_size=1, + pass_through_build_failures=True, + torch_executed_ops={torch.ops.aten.add.Tensor}, + ) with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model): _ = optimized_model(inputs[0]) @@ -223,15 +226,16 @@ def forward(self, x): fx_graph = torch.fx.symbolic_trace(model) # Validate that the results between Torch and Torch-TRT are similar - optimized_model = torchtrt.compile( - fx_graph, - "dynamo", - inputs[0], - min_block_size=1, - 
pass_through_build_failures=True, - use_python_runtime=use_python_runtime, - torch_executed_ops={torch.ops.aten.add.Tensor}, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + optimized_model = torchtrt.compile( + fx_graph, + "dynamo", + inputs[0], + min_block_size=1, + pass_through_build_failures=True, + torch_executed_ops={torch.ops.aten.add.Tensor}, + ) torch_res = model(inputs[0]) @@ -276,15 +280,15 @@ def forward(self, x): fx_graph = torch.fx.symbolic_trace(model) # Validate that the results between Torch and Torch-TRT are similar - optimized_model = torchtrt.compile( - fx_graph, - "dynamo", - inputs[0], - min_block_size=1, - pass_through_build_failures=True, - use_python_runtime=True, - torch_executed_ops={torch.ops.aten.add.Tensor}, - ) + with torchtrt.runtime.set_runtime_backend("python"): + optimized_model = torchtrt.compile( + fx_graph, + "dynamo", + inputs[0], + min_block_size=1, + pass_through_build_failures=True, + torch_executed_ops={torch.ops.aten.add.Tensor}, + ) with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model): res1 = optimized_model(inputs[0]) @@ -327,15 +331,16 @@ def forward(self, x): fx_graph = torch.fx.symbolic_trace(model) # Validate that the results between Torch and Torch-TRT are similar - optimized_model = torchtrt.compile( - fx_graph, - "dynamo", - inputs[0], - min_block_size=1, - pass_through_build_failures=True, - use_python_runtime=use_python_runtime, - torch_executed_ops={torch.ops.aten.add.Tensor}, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + optimized_model = torchtrt.compile( + fx_graph, + "dynamo", + inputs[0], + min_block_size=1, + pass_through_build_failures=True, + torch_executed_ops={torch.ops.aten.add.Tensor}, + ) with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model): _ = optimized_model(inputs[0]) @@ -373,15 +378,16 @@ def forward(self, x): fx_graph = 
torch.fx.symbolic_trace(model) # Validate that the results between Torch and Torch-TRT are similar - optimized_model = torchtrt.compile( - fx_graph, - "dynamo", - inputs[0], - min_block_size=1, - pass_through_build_failures=True, - use_python_runtime=use_python_runtime, - torch_executed_ops={torch.ops.aten.add.Tensor}, - ) + backend = "python" if use_python_runtime else "cpp" + with torchtrt.runtime.set_runtime_backend(backend): + optimized_model = torchtrt.compile( + fx_graph, + "dynamo", + inputs[0], + min_block_size=1, + pass_through_build_failures=True, + torch_executed_ops={torch.ops.aten.add.Tensor}, + ) with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model): res_1 = optimized_model(inputs[0]) From 5b1bde79d64bdf9ad0f153ff91a6f2e18306cadb Mon Sep 17 00:00:00 2001 From: cehongwang Date: Mon, 6 Apr 2026 20:24:03 +0000 Subject: [PATCH 2/4] Merged two operator and fixed some comments --- .../tutorials/runtime_opt/python_runtime.rst | 6 +- py/torch_tensorrt/dynamo/_compiler.py | 8 +- py/torch_tensorrt/dynamo/_refit.py | 8 +- .../runtime/_MutableTorchTensorRTModule.py | 14 +- .../dynamo/runtime/_PythonTRTEngine.py | 87 ++++-- .../runtime/_RuntimeBackendSelection.py | 91 ------ .../dynamo/runtime/_TorchTensorRTModule.py | 258 +++++++----------- .../runtime/meta_ops/register_meta_ops.py | 25 -- py/torch_tensorrt/dynamo/utils.py | 13 +- py/torch_tensorrt/runtime/__init__.py | 5 - 10 files changed, 181 insertions(+), 334 deletions(-) delete mode 100644 py/torch_tensorrt/dynamo/runtime/_RuntimeBackendSelection.py diff --git a/docsrc/tutorials/runtime_opt/python_runtime.rst b/docsrc/tutorials/runtime_opt/python_runtime.rst index f07989971f..1aa0fae534 100644 --- a/docsrc/tutorials/runtime_opt/python_runtime.rst +++ b/docsrc/tutorials/runtime_opt/python_runtime.rst @@ -7,11 +7,11 @@ Torch-TensorRT uses a single module type, :class:`~torch_tensorrt.runtime.TorchT to run TensorRT engines inside PyTorch. 
The **execution path** (which code actually drives ``execute_async``) is selected at runtime: -* **C++ path** — ``torch.classes.tensorrt.Engine`` and ``torch.ops.tensorrt.execute_engine``. +* **C++ path (default)** — ``torch.classes.tensorrt.Engine`` and ``torch.ops.tensorrt.execute_engine``. Preferred for production when the Torch-TensorRT C++ extension is available: TorchScript-friendly, and integrates with the full C++ runtime stack. -* **Python path** — internal ``PythonTRTEngine`` plus - ``torch.ops.tensorrt.execute_engine_python``. Useful when the C++ extension is absent, or when +* **Python path** — When the C++ runtime is absent, use the internal ``TRTEngine`` plus + ``torch.ops.tensorrt.execute_engine`` (registered from Python when the C++ runtime is absent). Useful when the C++ extension is absent, or when you want easier Python-level debugging and instrumentation. :class:`~torch_tensorrt.runtime.PythonTorchTensorRTModule` is a **thin subclass** of diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 846724ddef..d4e5492465 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -12,7 +12,7 @@ from torch.fx.node import Target from torch_tensorrt._Device import Device from torch_tensorrt._enums import EngineCapability, dtype -from torch_tensorrt._features import needs_cross_compile +from torch_tensorrt._features import ENABLED_FEATURES, needs_cross_compile from torch_tensorrt._Input import Input from torch_tensorrt.dynamo import _defaults, partitioning from torch_tensorrt.dynamo._DryRunTracker import ( @@ -43,10 +43,6 @@ from torch_tensorrt.dynamo.partitioning._resource_partitioner import ( resource_partition, ) -from torch_tensorrt.dynamo.runtime._RuntimeBackendSelection import ( - RuntimeBackend, - get_runtime_backend, -) from torch_tensorrt.dynamo.utils import ( deallocate_module, get_cpu_memory_usage, @@ -1054,7 +1050,7 @@ def preserve_module_specs( if 
_debugger_config: if _debugger_config.save_engine_profile: - if get_runtime_backend() is RuntimeBackend.PYTHON: + if not ENABLED_FEATURES.torch_tensorrt_runtime: if _debugger_config.profile_format != "cudagraph": raise ValueError( "Profiling with TREX can only be enabled when using the C++ runtime. Python runtime profiling only support cudagraph visualization." diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py index 1eed6edc80..f7bbc34c9d 100644 --- a/py/torch_tensorrt/dynamo/_refit.py +++ b/py/torch_tensorrt/dynamo/_refit.py @@ -32,7 +32,7 @@ post_lowering, pre_export_lowering, ) -from torch_tensorrt.dynamo.runtime._PythonTRTEngine import PythonTRTEngine +from torch_tensorrt.dynamo.runtime._PythonTRTEngine import TRTEngine from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ( ENGINE_IDX, SERIALIZED_METADATA_IDX, @@ -477,7 +477,7 @@ def refit_module_weights( # Torch retrace module assert not isinstance( compiled_submodule.engine, - PythonTRTEngine, + TRTEngine, ), ( "Refitting a torch retraced module is only supported when " "the engine uses the C++ Torch-TensorRT runtime" @@ -505,7 +505,7 @@ def refit_module_weights( # Rexporting the TRT compiled graph module and loading it back doesn't preserve # the instance type; choose the engine handle based on the actual engine object. - if isinstance(compiled_submodule.engine, PythonTRTEngine): + if isinstance(compiled_submodule.engine, TRTEngine): engine = compiled_submodule.engine.cuda_engine else: engine_info = compiled_submodule.engine.__getstate__()[0] @@ -568,7 +568,7 @@ def refit_module_weights( if compiled_submodule._is_python_runtime: # Refit already updated ``cuda_engine`` in place; avoid deserialize (slow). 
py_eng = compiled_submodule.engine - if isinstance(py_eng, PythonTRTEngine): + if isinstance(py_eng, TRTEngine): py_eng.serialized_info[ENGINE_IDX] = new_serialized_engine py_eng.serialized_engine = new_serialized_engine else: diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py index 374c02cc8c..b0c5c2f325 100644 --- a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py @@ -14,7 +14,6 @@ from torch_tensorrt.dynamo import _defaults from torch_tensorrt.dynamo._compiler import compile as dynamo_compile from torch_tensorrt.dynamo._refit import refit_module_weights -from torch_tensorrt.dynamo.runtime._RuntimeBackendSelection import RuntimeBackend from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import TorchTensorRTModule from torch_tensorrt.dynamo.utils import ( check_output_equal, @@ -699,13 +698,12 @@ def resursivly_deserialize_dynamic_shape(obj: Any) -> None: @staticmethod def _compiled_graph_uses_python_runtime(gm: Any) -> bool: - for m in gm.modules(): - if ( - isinstance(m, TorchTensorRTModule) - and m._runtime_backend is RuntimeBackend.PYTHON - ): - return True - return False + from torch_tensorrt._features import ENABLED_FEATURES + + return bool( + any(isinstance(m, TorchTensorRTModule) for m in gm.modules()) + and not ENABLED_FEATURES.torch_tensorrt_runtime + ) @staticmethod def save(module: Any, path: str) -> None: diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py b/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py index 5d53e0d394..15f8047bc1 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py @@ -1,8 +1,9 @@ -"""Python-side TensorRT engine: deserialize, execute, and drive ``execute_engine_python``. +"""Python-side TensorRT engine: deserialize and execute TRT engines without the C++ runtime. 
Serialization layout lives in :mod:`torch_tensorrt.dynamo.runtime._serialized_engine_layout`. -The engine is passed into ``tensorrt::execute_engine_python`` as an opaque reference (see -``register_opaque_type``), analogous to ``tensorrt::Engine`` for the C++ ``execute_engine`` op. +When the C++ Torch-TensorRT runtime is unavailable, :class:`TRTEngine` is registered as an +opaque type and ``tensorrt::execute_engine`` is registered as a Python custom op so that the +same compiled graph can run on either the C++ or Python runtime transparently. """ from __future__ import annotations @@ -127,12 +128,19 @@ def set_runtime_states( # --------------------------------------------------------------------------- -# PythonTRTEngine +# TRTEngine (Python implementation) # --------------------------------------------------------------------------- -class PythonTRTEngine: - """TensorRT engine + execution context, driven from Python TRT APIs.""" +class TRTEngine: + """TensorRT engine + execution context, driven from Python TRT APIs. + + Exposes the same surface as the C++ ``torch.classes.tensorrt.Engine`` TorchBind + class so that :class:`~torch_tensorrt.dynamo.runtime.TorchTensorRTModule` can use + either implementation without branching. When the C++ runtime is unavailable this + class is registered as an opaque type and ``tensorrt::execute_engine`` is registered + as a Python custom op pointing to :func:`execute_engine`. 
+ """ # --- construction / teardown --- @@ -166,7 +174,10 @@ def __init__( self._load_serialized_info(serialized_info) self._setup_engine() - def __deepcopy__(self, memo: dict[int, Any]) -> PythonTRTEngine: + def __del__(self) -> None: + self.reset_captured_graph() + + def __deepcopy__(self, memo: dict[int, Any]) -> "TRTEngine": """Rebuild from serialized layout so ``copy.deepcopy`` skips unpickleable TRT handles.""" if id(self) in memo: return memo[id(self)] # type: ignore @@ -175,6 +186,21 @@ def __deepcopy__(self, memo: dict[int, Any]) -> PythonTRTEngine: memo[id(self)] = dup return dup + def __str__(self) -> str: + return f"TRTEngine(name={self.name}, device={self.serialized_device_info})" + + def __repr__(self) -> str: + return self.__str__() + + def __getstate__(self) -> List[Any]: + """Return serialized engine info list for ``torch.save`` / pickling.""" + return list(self.serialized_info) + + def __setstate__(self, serialized_info: List[Any]) -> None: + """Restore engine from serialized info list (``torch.load`` / unpickling).""" + self._load_serialized_info(serialized_info) + self._setup_engine() + def tracing_mode(self) -> str: """Return ``"real"`` so FakeTensor/export pass the real engine into meta kernels. 
@@ -243,6 +269,7 @@ def get_serialized_metadata(self) -> str: return self.serialized_metadata def close(self) -> None: + """Release CUDA graph resources (called explicitly or via __del__).""" self.reset_captured_graph() def _create_execution_context(self) -> trt.IExecutionContext: @@ -369,6 +396,20 @@ def get_engine_layer_info(self) -> str: def dump_engine_layer_info(self) -> None: print(self.get_engine_layer_info()) + def dump_engine_layer_info_to_file(self, path: str) -> None: + with open(path, "w") as f: + f.write(self.get_engine_layer_info()) + + def infer_outputs(self, input_shapes: List[Any]) -> List[Any]: + """Return output shapes inferred for the given input shapes.""" + results = [] + for i, input_name in enumerate(self.in_binding_names): + if i < len(input_shapes): + self.context.set_input_shape(input_name, tuple(input_shapes[i])) + for output_name in self.out_binding_names: + results.append(tuple(self.context.get_tensor_shape(output_name))) + return results + # --- tensor binding helpers --- def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: @@ -479,7 +520,7 @@ def _execute_standard( self._input_buffers = [None] * len(self.in_binding_names) self._output_buffers = [None] * len(self.out_binding_names) - with self._profile_section("PythonTRTEngine:ProcessInputs"): + with self._profile_section("TRTEngine:ProcessInputs"): self.setup_input_tensors( contiguous_inputs, torch_tensorrt.runtime.get_cudagraphs_mode(), @@ -492,7 +533,7 @@ def _execute_standard( f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior." 
) - with self._profile_section("PythonTRTEngine:ProcessOutputs"): + with self._profile_section("TRTEngine:ProcessOutputs"): if can_use_pre_allocated_outputs: outputs = self.pre_allocated_outputs else: @@ -516,7 +557,7 @@ def _execute_standard( else: self.context.set_tensor_address(output_name, outputs[o].data_ptr()) - with self._profile_section("PythonTRTEngine:TensorRTRuntime"): + with self._profile_section("TRTEngine:TensorRTRuntime"): self._caller_stream = torch.cuda.current_stream() if ( self._engine_stream == torch.cuda.default_stream() @@ -579,10 +620,10 @@ def _execute_output_allocator( "incompatible runtime modes. Please disable one of the two." ) - with self._profile_section("PythonTRTEngine:ProcessInputs"): + with self._profile_section("TRTEngine:ProcessInputs"): self.setup_input_tensors(contiguous_inputs, False, False) - with self._profile_section("PythonTRTEngine:SetupOutputAllocator"): + with self._profile_section("TRTEngine:SetupOutputAllocator"): self.create_output_allocator() for output_name in self.out_binding_names: if not self.context.set_output_allocator( @@ -592,7 +633,7 @@ def _execute_output_allocator( f"Failed to set output allocator for {output_name}" ) - with self._profile_section("PythonTRTEngine:TensorRTRuntime"): + with self._profile_section("TRTEngine:TensorRTRuntime"): self._caller_stream = torch.cuda.current_stream() if ( self._engine_stream == torch.cuda.default_stream() @@ -655,14 +696,14 @@ def execute( return self._execute_standard(contiguous_inputs) -register_opaque_type(PythonTRTEngine, typ="reference") - +if not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime: + register_opaque_type(TRTEngine, typ="reference") -@torch.library.custom_op( # type: ignore[misc] - "tensorrt::execute_engine_python", mutates_args=() -) -def execute_engine_python( - input_tensors: List[torch.Tensor], engine: PythonTRTEngine -) -> List[torch.Tensor]: - outputs = engine.execute(input_tensors) - return [outputs] if isinstance(outputs, 
torch.Tensor) else list(outputs) + @torch.library.custom_op( # type: ignore[misc] + "tensorrt::execute_engine", mutates_args=() + ) + def execute_engine( + input_tensors: List[torch.Tensor], engine: TRTEngine + ) -> List[torch.Tensor]: + outputs = engine.execute(input_tensors) + return [outputs] if isinstance(outputs, torch.Tensor) else list(outputs) diff --git a/py/torch_tensorrt/dynamo/runtime/_RuntimeBackendSelection.py b/py/torch_tensorrt/dynamo/runtime/_RuntimeBackendSelection.py deleted file mode 100644 index 57a0849732..0000000000 --- a/py/torch_tensorrt/dynamo/runtime/_RuntimeBackendSelection.py +++ /dev/null @@ -1,91 +0,0 @@ -from __future__ import annotations - -import logging -from enum import Enum -from typing import Union - -import torch_tensorrt - -logger = logging.getLogger(__name__) - - -class RuntimeBackend(str, Enum): - """Which Torch-TensorRT engine execution stack to use.""" - - CPP = "cpp" - PYTHON = "python" - - -_RuntimeBackendArg = Union[RuntimeBackend, str] - - -def _default_runtime_backend() -> RuntimeBackend: - return ( - RuntimeBackend.CPP - if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime - else RuntimeBackend.PYTHON - ) - - -_RUNTIME_BACKEND: RuntimeBackend = _default_runtime_backend() - - -def _normalize_runtime_backend(backend: _RuntimeBackendArg) -> RuntimeBackend: - if isinstance(backend, RuntimeBackend): - if ( - backend is RuntimeBackend.CPP - and not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime - ): - raise RuntimeError( - "C++ Torch-TensorRT runtime is not available in this build" - ) - return backend - - normalized = backend.lower() - if normalized not in ("cpp", "python"): - raise ValueError(f"Unsupported runtime backend: {backend}") - member = RuntimeBackend(normalized) - if ( - member is RuntimeBackend.CPP - and not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime - ): - raise RuntimeError("C++ Torch-TensorRT runtime is not available in this build") - return member - - -def get_runtime_backend() -> 
RuntimeBackend: - """Return the process-wide default backend (``cpp`` or ``python``).""" - return _RUNTIME_BACKEND - - -class _RuntimeBackendContextManager: - def __init__(self, old_backend: RuntimeBackend) -> None: - self.old_backend = old_backend - - def __enter__(self) -> "_RuntimeBackendContextManager": - return self - - def __exit__(self, *args: object) -> None: - global _RUNTIME_BACKEND - _RUNTIME_BACKEND = self.old_backend - - -def set_runtime_backend(backend: _RuntimeBackendArg) -> _RuntimeBackendContextManager: - """Context manager: set global C++ vs Python engine path for unpinned modules. - - Use around compile and forward so :class:`~torch_tensorrt.runtime.TorchTensorRTModule` - picks up the intended backend when it is constructed: - - .. code-block:: python - - with torch_tensorrt.runtime.set_runtime_backend("python"): - trt_gm = torch_tensorrt.dynamo.compile(...) - - If the return value is not used with ``with``, the backend remains changed until you - call ``__exit__`` on the returned object (or enter another ``set_runtime_backend`` context). 
- """ - global _RUNTIME_BACKEND - old_backend = _RUNTIME_BACKEND - _RUNTIME_BACKEND = _normalize_runtime_backend(backend) - logger.info(f"Set Torch-TensorRT runtime backend to {_RUNTIME_BACKEND}") - return _RuntimeBackendContextManager(old_backend) diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index ffb37e15ac..8789d690a6 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -4,19 +4,13 @@ import copy import logging import pickle -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Tuple import torch from torch_tensorrt._Device import Device from torch_tensorrt._enums import Platform from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt.dynamo._settings import CompilationSettings -from torch_tensorrt.dynamo.runtime._PythonTRTEngine import PythonTRTEngine -from torch_tensorrt.dynamo.runtime._RuntimeBackendSelection import ( - RuntimeBackend, - _normalize_runtime_backend, - get_runtime_backend, -) from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ( ABI_TARGET_IDX, ABI_VERSION, @@ -43,32 +37,19 @@ Optional[SerializedTensorRTEngineFmt], List[str], List[str], - Optional[str], -] -# Checkpoints written before the trailing ``runtime_backend`` slot used four elements. -_LegacyTorchTensorRTModuleExtraState = Tuple[ - str, Optional[SerializedTensorRTEngineFmt], List[str], List[str] -] -TorchTensorRTModuleExtraState = Union[ - SerializedTorchTensorRTModuleFmt, - _LegacyTorchTensorRTModuleExtraState, ] class TorchTensorRTModule(torch.nn.Module): # type: ignore[misc] """``nn.Module`` that runs a TensorRT engine inside PyTorch. 
- Execution uses either the C++ Torch-TensorRT runtime (``torch.classes.tensorrt.Engine``) - or the Python TRT stack (``tensorrt`` + ``execute_engine_python``), depending on - :func:`~torch_tensorrt.runtime.get_runtime_backend` (set via - :func:`~torch_tensorrt.runtime.set_runtime_backend` as a context manager for scoped - changes). The backend is read from :func:`~torch_tensorrt.runtime.get_runtime_backend` - when the module is constructed (and from checkpoint metadata on load). + When the C++ Torch-TensorRT runtime is available, execution uses + ``torch.classes.tensorrt.Engine`` and ``torch.ops.tensorrt.execute_engine``. + When only the Python runtime is available, a Python :class:`TRTEngine` is + registered under the same ``tensorrt::execute_engine`` op so that the same + compiled graph works with either runtime transparently. Supports ``torch.save`` / ``torch.load`` via ``get_extra_state`` / ``set_extra_state``. - Extra state is a 5-tuple; the last element is ``runtime_backend`` (enum value as - ``str``) when an engine is saved, or ``None`` when there is no engine. If the fifth - element is missing (legacy 4-tuple with an engine), the C++ backend is used. 
""" def __init__( @@ -108,7 +89,7 @@ def __init__( self.settings = copy.deepcopy(settings) self.weight_name_map = weight_name_map self.serialized_engine = serialized_engine - self.engine: Optional[Any] = None + self.engine = None self.requires_output_allocator = requires_output_allocator self.dynamically_allocate_resources = settings.dynamically_allocate_resources self.symbolic_shape_expressions = symbolic_shape_expressions @@ -117,7 +98,6 @@ def __init__( if not self.settings.enable_cross_compile_for_windows else Platform.WIN_X86_64 ) - self._runtime_backend = get_runtime_backend() self.profiling_enabled = False if ( @@ -127,21 +107,6 @@ def __init__( ): self.setup_engine() - def _require_engine(self) -> Any: - if self.engine is None: - raise RuntimeError("Engine has not been setup yet.") - return self.engine - - @property - def _is_python_runtime(self) -> bool: - return self._runtime_backend is RuntimeBackend.PYTHON - - def _cleanup_engine(self) -> None: - engine = getattr(self, "engine", None) - if engine is not None and hasattr(engine, "close"): - engine.close() - self.engine = None - def _pack_engine_info(self) -> List[str | bytes]: target_device = ( self.settings.device @@ -200,34 +165,33 @@ def _pack_engine_info(self) -> List[str | bytes]: return engine_info def get_streamable_device_memory_budget(self) -> Any: - return self._require_engine().streamable_device_memory_budget + return self.engine.streamable_device_memory_budget def get_automatic_device_memory_budget(self) -> Any: - return self._require_engine().automatic_device_memory_budget + return self.engine.automatic_device_memory_budget def get_device_memory_budget(self) -> Any: - return self._require_engine().device_memory_budget + return self.engine.device_memory_budget def set_device_memory_budget(self, budget_bytes: int) -> int: - engine = self._require_engine() if budget_bytes < 0: budget_bytes = self.get_streamable_device_memory_budget() - engine.device_memory_budget = budget_bytes - if 
engine.device_memory_budget != budget_bytes: + self.engine.device_memory_budget = budget_bytes + if self.engine.device_memory_budget != budget_bytes: logger.error(f"Failed to set weight streaming budget to {budget_bytes}") - budget_bytes = engine.device_memory_budget + budget_bytes = self.engine.device_memory_budget if self.get_streamable_device_memory_budget() == budget_bytes: logger.warning("Weight streaming is disabled") return budget_bytes def _reset_captured_graph(self) -> None: - self._require_engine().reset_captured_graph() + self.engine.reset_captured_graph() def use_dynamically_allocated_resources( self, dynamically_allocate_resources: bool = False ) -> None: self.dynamically_allocate_resources = dynamically_allocate_resources - self._require_engine().use_dynamically_allocated_resources( + self.engine.use_dynamically_allocated_resources( self.dynamically_allocate_resources ) @@ -243,16 +207,15 @@ def setup_engine(self) -> None: if self.engine is not None: return - if self._is_python_runtime: - self.engine = PythonTRTEngine( + if ENABLED_FEATURES.torch_tensorrt_runtime: + self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) + else: + from torch_tensorrt.dynamo.runtime._PythonTRTEngine import TRTEngine + + self.engine = TRTEngine( self._pack_engine_info(), profile_execution=self.profiling_enabled, - ) - return - - if not ENABLED_FEATURES.torch_tensorrt_runtime: - raise NotImplementedError("Torch-TensorRT Runtime is not available") - self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) + ) # type: ignore[assignment] def encode_metadata(self, metadata: Any) -> str: metadata = copy.deepcopy(metadata) @@ -261,120 +224,85 @@ def encode_metadata(self, metadata: Any) -> str: return encoded_metadata @staticmethod - def decode_metadata(encoded_metadata: bytes | str) -> Any: - if isinstance(encoded_metadata, str): - encoded_metadata = encoded_metadata.encode("utf-8") - return pickle.loads(base64.b64decode(encoded_metadata)) + def 
decode_metadata(encoded_metadata: bytes) -> Any: + dumped_metadata = base64.b64decode(encoded_metadata.encode("utf-8")) + metadata = pickle.loads(dumped_metadata) + return metadata def get_extra_state(self) -> SerializedTorchTensorRTModuleFmt: - """Return payload for ``torch.save`` (engine blob base64-encoded in the packed list).""" - if self.engine or self.serialized_engine: + if self.engine: engine_info = self._pack_engine_info() - raw_engine_blob = engine_info[ENGINE_IDX] - assert isinstance(raw_engine_blob, (bytes, bytearray)) - engine_info[ENGINE_IDX] = base64.b64encode(raw_engine_blob) + assert isinstance(engine_info[ENGINE_IDX], (bytes, bytearray)) + engine_info[ENGINE_IDX] = base64.b64encode(engine_info[ENGINE_IDX]) return ( self.name, engine_info, self.input_binding_names, self.output_binding_names, - self._runtime_backend.value, ) - return ( - self.name, - None, - self.input_binding_names, - self.output_binding_names, - None, - ) + elif self.serialized_engine: + engine_info = self._pack_engine_info() + assert isinstance(engine_info[ENGINE_IDX], bytes) + engine_info[ENGINE_IDX] = base64.b64encode(engine_info[ENGINE_IDX]) # type: ignore[arg-type] + return ( + self.name, + engine_info, + self.input_binding_names, + self.output_binding_names, + ) + else: + return ( + self.name, + None, + self.input_binding_names, + self.output_binding_names, + ) - def set_extra_state(self, state: TorchTensorRTModuleExtraState) -> None: - """Restore module fields and engine from ``torch.load`` extra state.""" - self._cleanup_engine() + def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: self.name = state[0] - self.input_binding_names = state[2] - self.output_binding_names = state[3] - if len(state) not in (4, 5): - raise ValueError( - "Invalid TorchTensorRTModule extra_state: expected 4 (legacy) or 5 " - f"elements when engine_info is None, got {len(state)}" + + if state[1] is not None: + serialized_engine_info: SerializedTensorRTEngineFmt = 
list(state[1]) + serialized_engine_info[ENGINE_IDX] = base64.b64decode( + serialized_engine_info[ENGINE_IDX] + ) + self.hardware_compatible = bool( + int(serialized_engine_info[HW_COMPATIBLE_IDX]) + ) + self.requires_output_allocator = bool( + int(serialized_engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]) ) - if state[1] is None: - self.serialized_engine = None - self.settings = CompilationSettings() - self.weight_name_map = None - self.hardware_compatible = False - self.requires_output_allocator = False - self.dynamically_allocate_resources = False - self.symbolic_shape_expressions = None - self.target_platform = Platform.current_platform() - self.profiling_enabled = False - return + serialized_metadata = serialized_engine_info[SERIALIZED_METADATA_IDX] + assert isinstance(serialized_metadata, bytes) + metadata = TorchTensorRTModule.decode_metadata(serialized_metadata) + self.settings = metadata["settings"] + self.weight_name_map = metadata["weight_name_map"] + self.symbolic_shape_expressions = metadata["inout_symexprs"] - serialized_engine_info: SerializedTensorRTEngineFmt = list(state[1]) - metadata = TorchTensorRTModule.decode_metadata( - serialized_engine_info[SERIALIZED_METADATA_IDX] - ) - raw_backend = state[4] if len(state) == 5 else None - if raw_backend is None: - raw_backend = RuntimeBackend.CPP - runtime_backend = _normalize_runtime_backend(raw_backend) - self._runtime_backend = runtime_backend - - encoded_engine = serialized_engine_info[ENGINE_IDX] - decoded_engine = base64.b64decode(encoded_engine) - serialized_engine_info[ENGINE_IDX] = decoded_engine - self.serialized_engine = decoded_engine - self.hardware_compatible = bool(int(serialized_engine_info[HW_COMPATIBLE_IDX])) - self.requires_output_allocator = bool( - int(serialized_engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]) - ) - self.dynamically_allocate_resources = bool( - int(serialized_engine_info[RESOURCE_ALLOCATION_STRATEGY_IDX]) - ) - self.settings = metadata["settings"] - self.weight_name_map = 
metadata["weight_name_map"] - self.symbolic_shape_expressions = metadata["inout_symexprs"] - self.target_platform = ( - Platform.WIN_X86_64 - if self.settings.enable_cross_compile_for_windows - else Platform.current_platform() - ) - self.profiling_enabled = False + if ENABLED_FEATURES.torch_tensorrt_runtime: + self.engine = torch.classes.tensorrt.Engine(serialized_engine_info) + else: + from torch_tensorrt.dynamo.runtime._PythonTRTEngine import TRTEngine - if runtime_backend is RuntimeBackend.PYTHON: - self.engine = PythonTRTEngine(serialized_engine_info) - else: - if not ENABLED_FEATURES.torch_tensorrt_runtime: - raise NotImplementedError("Torch-TensorRT Runtime is not available") - self.engine = torch.classes.tensorrt.Engine(serialized_engine_info) + self.engine = TRTEngine(serialized_engine_info) # type: ignore[assignment] - self.engine.set_output_tensors_as_unowned( - metadata["output_tensors_are_unowned"] - ) + self.engine.set_output_tensors_as_unowned( + metadata["output_tensors_are_unowned"] + ) + else: + self.engine = None + self.settings = CompilationSettings() + self.hardware_compatible = False - def __del__(self) -> None: - self._cleanup_engine() + self.input_binding_names = state[2] + self.output_binding_names = state[3] def set_pre_allocated_outputs(self, enable: bool) -> None: - self._require_engine().use_pre_allocated_outputs = enable + self.engine.use_pre_allocated_outputs = enable def set_use_output_allocator(self, enable: bool) -> None: - self._require_engine().use_output_allocator_outputs = enable - - def _execute_engine(self, input_tensors: List[torch.Tensor]) -> List[torch.Tensor]: - """Dispatch to ``execute_engine`` or ``execute_engine_python``.""" - engine = self._require_engine() - if self._is_python_runtime: - return cast( - List[torch.Tensor], - torch.ops.tensorrt.execute_engine_python(list(input_tensors), engine), - ) - return cast( - List[torch.Tensor], - torch.ops.tensorrt.execute_engine(list(input_tensors), engine), - ) + 
self.engine.use_output_allocator_outputs = enable def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: """Run the TensorRT engine on GPU tensors (non-tensor args are cast to CUDA tensors).""" @@ -389,7 +317,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: (value if isinstance(value, torch.Tensor) else torch.tensor(value).cuda()) for value in inputs ] - outputs = self._execute_engine(input_tensors) + outputs = list(torch.ops.tensorrt.execute_engine(input_tensors, self.engine)) if len(outputs) == 1: return outputs[0] return tuple(outputs) @@ -399,11 +327,11 @@ def enable_profiling( profiling_results_dir: Optional[str] = None, profile_format: str = "perfetto", ) -> None: - """Enable engine profiling (C++: optional Perfetto/TREx path prefix on disk).""" + """Enable engine profiling (optional path prefix and format for tracing output).""" if self.engine is None: raise RuntimeError("Engine has not been initialized yet.") - if not self._is_python_runtime and profiling_results_dir is not None: + if profiling_results_dir is not None: self.engine.profile_path_prefix = profiling_results_dir self.engine.enable_profiling() @@ -412,10 +340,10 @@ def enable_profiling( self.profiling_enabled = True def set_output_tensors_as_unowned(self, enabled: bool) -> None: - self._require_engine().set_output_tensors_as_unowned(enabled) + self.engine.set_output_tensors_as_unowned(enabled) def are_output_tensors_unowned(self) -> bool: - return cast(bool, self._require_engine().are_output_tensors_unowned()) + return bool(self.engine.are_output_tensors_unowned()) def disable_profiling(self) -> None: """Disable engine profiling and clear the profiling flag on this module.""" @@ -425,11 +353,21 @@ def disable_profiling(self) -> None: self.profiling_enabled = False def get_layer_info(self) -> str: - """Return TRT layer information as a JSON string (TRT version dependent).""" - return cast(str, self._require_engine().get_engine_layer_info()) + 
"""Get a JSON string containing the layer information encoded by the TensorRT engine in this module + + Returns: + + str: A JSON string which contains the layer information of the engine incapsulated in this module + """ + if self.engine is None: + raise RuntimeError("Engine has not been initialized yet.") + + layer_info: str = self.engine.get_engine_layer_info() + return layer_info def dump_layer_info(self) -> None: - """Print layer information for this engine to stdout.""" + """Dump layer information encoded by the TensorRT engine in this module to STDOUT""" if self.engine is None: raise RuntimeError("Engine has not been initialized yet.") + self.engine.dump_engine_layer_info() diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index 618d743a7e..2364b83dd9 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -260,31 +260,6 @@ def fake_tensorrt_execute_engine( ) -@torch.library.register_fake("tensorrt::execute_engine_python") # type: ignore -def fake_tensorrt_execute_engine_python(inputs: List[torch.Tensor], engine: Any) -> Any: - shape_info = _shape_info_from_trt_engine(engine) - - if shape_info: - return _apply_symbolic_shape_expressions(inputs, shape_info) - - real = getattr(engine, "real_obj", None) - for o in (engine, real): - if o is None: - continue - shapes, dtypes = getattr(o, "output_shapes", None), getattr( - o, "output_dtypes", None - ) - if shapes and dtypes: - return [ - torch.empty(s, dtype=d, device=inputs[0].device) - for s, d in zip(shapes, dtypes) - ] - - raise RuntimeError( - "No output shape information found for tensorrt::execute_engine_python." 
- ) - - @torch._library.register_fake_class("tensorrt::Engine") class FakeTRTEngine: def __init__(self, engine_info: List[str]) -> None: diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index f563d5fc44..403e4544b6 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -140,16 +140,11 @@ def deallocate_module(module: torch.fx.GraphModule) -> None: def _log_torch_compile_runtime_backend() -> None: """Log which TRT runtime backend applies for a ``torch.compile`` / Dynamo compile.""" - from torch_tensorrt.dynamo.runtime._RuntimeBackendSelection import ( - RuntimeBackend, - get_runtime_backend, - ) + from torch_tensorrt._features import ENABLED_FEATURES - backend = get_runtime_backend() - using_python = backend is RuntimeBackend.PYTHON + using_python = not ENABLED_FEATURES.torch_tensorrt_runtime logger.info( - f"Using {'Python-only' if using_python else 'Default'} Torch-TRT Runtime " - f"(from runtime backend selection: {backend})" + f"Using {'Python-only' if using_python else 'Default'} Torch-TRT Runtime" ) @@ -577,7 +572,7 @@ def parse_dynamo_kwargs( if "use_python_runtime" in kwargs: warnings.warn( 'torch.compile option "use_python_runtime" was removed; use ' - 'torch_tensorrt.runtime.set_runtime_backend("python"|"cpp") instead.', + "the Python runtime is now selected automatically when the C++ extension is unavailable.", DeprecationWarning, stacklevel=2, ) diff --git a/py/torch_tensorrt/runtime/__init__.py b/py/torch_tensorrt/runtime/__init__.py index 4b9e5146bb..7283ca0f33 100644 --- a/py/torch_tensorrt/runtime/__init__.py +++ b/py/torch_tensorrt/runtime/__init__.py @@ -1,11 +1,6 @@ from torch_tensorrt.dynamo.runtime import ( # noqa: F401 TorchTensorRTModule, ) -from torch_tensorrt.dynamo.runtime._RuntimeBackendSelection import ( - RuntimeBackend, - get_runtime_backend, - set_runtime_backend, -) from torch_tensorrt.runtime._cudagraphs import ( enable_cudagraphs, get_cudagraphs_mode, From 
17550014e85960c510860ceb4608ecc1474e9a53 Mon Sep 17 00:00:00 2001 From: cehongwang Date: Wed, 8 Apr 2026 19:35:47 +0000 Subject: [PATCH 3/4] Enabled op agnostic serialization for both runtime --- py/torch_tensorrt/_compile.py | 32 ++++++++----- py/torch_tensorrt/_enums.py | 3 +- py/torch_tensorrt/dynamo/_exporter.py | 48 +++++++------------ .../dynamo/runtime/_PythonTRTEngine.py | 22 +++++++++ .../runtime/meta_ops/register_meta_ops.py | 4 +- 5 files changed, 65 insertions(+), 44 deletions(-) diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index c4dbb1c148..b77291798b 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -545,7 +545,7 @@ def convert_method_to_trt_engine( module, torchtrt_arg_inputs, kwarg_inputs=torchtrt_kwarg_inputs, **kwargs ) - return dynamo_convert_exported_program_to_serialized_trt_engine( + return dynamo_convert_exported_program_to_serialized_trt_engine( # type: ignore[no-any-return] exp_program, arg_inputs=tuple(arg_inputs), kwarg_inputs=torchtrt_kwarg_inputs, @@ -594,35 +594,43 @@ def load( Raises: ValueError: If there is no file or the file is not either a TorchScript file or ExportedProgram file """ + from torch_tensorrt.dynamo._exporter import replace_execute_engine_no_op_node try: - logger.debug(f"Loading the provided file {file_path} using torch.jit.load()") - ts_module = function_overload_with_kwargs( + logger.debug(f"Loading the provided file {file_path} using torch.export.load()") + exp_program = function_overload_with_kwargs( torch.export.load, file_path, extra_files=extra_files, **kwargs, ) - return ts_module + gm = exp_program.graph_module + if any( + "no_op_placeholder_for_execute_engine" in n.name for n in gm.graph.nodes + ): + return replace_execute_engine_no_op_node(exp_program) + return exp_program except Exception: + import traceback + + traceback.print_exc() logger.info( f"Loading the provided file {file_path} via torch.export.load() failed with the following error", 
exc_info=True, ) - pass try: - logger.debug(f"Loading the provided file {file_path} using torch.export.load()") - exp_program = function_overload_with_kwargs( + logger.debug(f"Loading the provided file {file_path} using torch.jit.load()") + ts_module = function_overload_with_kwargs( torch.jit.load, file_path, _extra_files=extra_files, **kwargs, ) - return exp_program - except Exception: + return ts_module + except Exception as e: logger.info( - f"Loading the provided file {file_path} via torch.jit.load() (after failing to load with torch.export.load()) failed with the following error", + f"Loading the provided file {file_path} via torch.jit.load() (after failing to load with torch.export.load()) failed with the following error: {e}", exc_info=True, ) raise ValueError( @@ -805,8 +813,8 @@ def _all_are_input_objects(obj: Any) -> bool: f"Inferred dynamic_shapes from torch_tensorrt.Input objects with min/opt/max specifications: {dynamic_shapes}" ) - arg_tensors = tuple(get_torch_inputs(arg_inputs, default_device())) # type: ignore - kwarg_tensors = get_torch_inputs(kwarg_inputs, default_device()) # type: ignore + arg_tensors = tuple(get_torch_inputs(arg_inputs, default_device())) + kwarg_tensors = get_torch_inputs(kwarg_inputs, default_device()) else: # Mixed case: some inputs are Tensors, some are Input objects diff --git a/py/torch_tensorrt/_enums.py b/py/torch_tensorrt/_enums.py index 637843eaeb..26746ddfff 100644 --- a/py/torch_tensorrt/_enums.py +++ b/py/torch_tensorrt/_enums.py @@ -1402,7 +1402,8 @@ def current_platform(cls) -> Platform: return Platform.UNKNOWN def __str__(self) -> str: - return str(self.name) + # Make it compatible with C++ runtime + return self.name.lower() @needs_torch_tensorrt_runtime # type: ignore def _to_serialized_rt_platform(self) -> str: diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py index b482a8c839..ba026cd248 100644 --- a/py/torch_tensorrt/dynamo/_exporter.py +++ 
b/py/torch_tensorrt/dynamo/_exporter.py @@ -19,6 +19,7 @@ OutputSpec, TensorArgument, ) +from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ENGINE_IDX, NAME_IDX @@ -483,36 +484,18 @@ def inline_trt_modules( f"trt_module_node: {trt_module_node.name} does not have the metadata which should be set during dynamo compile_module step." ) num_outputs = len(trt_module_node.meta["val"]) - # Insert a call_function node to perform inference on TRT engine with gm.graph.inserting_before(trt_module_node): - if cross_compile_module: - engine_info = trt_module._pack_engine_info() - engine_bytes = engine_info[ENGINE_IDX] - engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8") - # insert the no_placeholder node in the graph which should be replaced to the actual execute_engine node while load in the windows - trt_node = gm.graph.call_function( - torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default, - (trt_module_node.args, *engine_info), - ) - else: - # for the normal workflow: use the execute_engine node - engine_name = f"{name}_engine" - # TODO: THROWS SOME WARNING ABOUT A LACK OF UNDERLYING REFERENCE TO THE OWNING GRAPH MODULE - # SAYS THERES 3 OPTIONS, SUBMODULE, PARAMETER, OR BUFFER, BUFFER SEEMS THE BEST BUT I THINK ITS KEYED TO TENSORS - setattr(gm, engine_name, trt_module.engine) - engine_node = gm.graph.get_attr(engine_name) - - trt_node = gm.graph.call_function( - torch.ops.tensorrt.execute_engine.default, - (trt_module_node.args, engine_node), - ) - # meta["val"] should be a lighter version of a tensor. For eg: it should be a FakeTensor (with output shape and dtype properties) - # Lighter version of a custom_obj is not defined clearly. 
meta["val"] does not have any type expectations but - # for custom object nodes, it should be CustomObjArgument - engine_node.meta["val"] = CustomObjArgument( - name=engine_node.name, class_fqn="" - ) - # set trt_node.meta with trt_module_node.meta + # Always embed engine data as primitive string args via no_op_placeholder + # so torch.export does not pickle torch.classes.tensorrt.Engine (which + # requires the C++ TorchBind class at load time). + # torch_tensorrt.load() lowers placeholders → execute_engine. + engine_info = trt_module._pack_engine_info() + engine_bytes = engine_info[ENGINE_IDX] + engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8") + trt_node = gm.graph.call_function( + torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default, + (trt_module_node.args, *engine_info), + ) assert num_outputs > 0 trt_node.meta["val"] = trt_module_node.meta["val"] @@ -557,7 +540,12 @@ def replace_execute_engine_no_op_node( packed_engine_info[ENGINE_IDX] = base64.b64decode( engine_bytes.encode("utf-8") ) - trt_engine = torch.classes.tensorrt.Engine(tuple(packed_engine_info)) + if ENABLED_FEATURES.torch_tensorrt_runtime: + trt_engine = torch.classes.tensorrt.Engine(tuple(packed_engine_info)) + else: + from torch_tensorrt.dynamo.runtime._PythonTRTEngine import TRTEngine + + trt_engine = TRTEngine(packed_engine_info) setattr(gm, engine_name, trt_engine) engine_node = gm.graph.get_attr(engine_name) diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py b/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py index 15f8047bc1..acc4cb7bcd 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py @@ -707,3 +707,25 @@ def execute_engine( ) -> List[torch.Tensor]: outputs = engine.execute(input_tensors) return [outputs] if isinstance(outputs, torch.Tensor) else list(outputs) + + @torch.library.custom_op( # type: ignore[misc] + "tensorrt::no_op_placeholder_for_execute_engine", 
mutates_args=() + ) + def no_op_placeholder_for_execute_engine( + inputs: List[torch.Tensor], + abi_version: str, + name: str, + serialized_device_info: str, + serialized_engine: str, + serialized_in_binding_names: str, + serialized_out_binding_names: str, + serialized_hardware_compatible: str, + serialized_metadata: str, + serialized_target_platform: str, + serialized_require_output_allocator: str, + serialized_resource_allocation_strategy: str, + ) -> List[torch.Tensor]: + raise RuntimeError( + "TensorRT engine placeholder reached eager execution; load this artifact with " + "torch_tensorrt.load() so placeholders are lowered to execute_engine." + ) diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index 2364b83dd9..2cd550fe09 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -354,7 +354,9 @@ def no_op_placeholder_for_execute_engine( serialized_metadata: str, serialized_target_platform: str, serialized_require_output_allocator: str, + serialized_resource_allocation_strategy: str, ) -> List[torch.Tensor]: raise RuntimeError( - "The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api." + "TensorRT engine placeholder reached eager execution; load this artifact with " + "torch_tensorrt.load() so placeholders are lowered to execute_engine." 
) From 65d604a7db44e9048d81d757550e15e269d05bb3 Mon Sep 17 00:00:00 2001 From: cehongwang Date: Fri, 10 Apr 2026 02:06:04 +0000 Subject: [PATCH 4/4] First scratch of cross-serialization --- core/runtime/BUILD | 13 +- core/runtime/register_jit_hooks.cpp | 28 +++++ py/torch_tensorrt/_compile.py | 35 +++++- py/torch_tensorrt/dynamo/_exporter.py | 32 +++-- .../dynamo/runtime/_PythonTRTEngine.py | 112 +++++++++++++++++- .../runtime/meta_ops/register_meta_ops.py | 3 - third_party/libtorch/BUILD | 16 +++ toolchains/local_torch.bzl | 10 +- 8 files changed, 227 insertions(+), 22 deletions(-) diff --git a/core/runtime/BUILD b/core/runtime/BUILD index 19260149ae..5f5fc7fe65 100644 --- a/core/runtime/BUILD +++ b/core/runtime/BUILD @@ -1,6 +1,7 @@ load("@rules_cc//cc:defs.bzl", "cc_library") load("@rules_pkg//:pkg.bzl", "pkg_tar") load("@rules_pkg//pkg:mappings.bzl", "pkg_files") + package(default_visibility = ["//visibility:public"]) config_setting( @@ -58,6 +59,13 @@ config_setting( ], ) +config_setting( + name = "python_core", + values = { + "define": "target_lang=python", + }, +) + cc_library( name = "runtime", srcs = [ @@ -96,6 +104,9 @@ cc_library( ":use_torch_whl": ["@torch_whl//:libtorch"], ":windows": ["@libtorch_win//:libtorch"], "//conditions:default": ["@libtorch"], + }) + select({ + ":python_core": ["@libtorch//:pybind11"], + "//conditions:default": [], }), alwayslink = True, ) @@ -121,6 +132,6 @@ pkg_tar( pkg_files( name = "include_pkg_files", srcs = [":include_files"], - visibility = ["//visibility:public"], prefix = "include/torch_tensorrt/core/runtime/", + visibility = ["//visibility:public"], ) diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index e8f6217a21..fc5fbe2d25 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -1,9 +1,19 @@ #include +#include #include "core/runtime/Platform.h" #include "core/runtime/runtime.h" #include "core/util/macros.h" +namespace py = pybind11; + 
+namespace torch::jit { +struct OpaqueObject : public CustomClassHolder { + OpaqueObject(py::object payload) : payload_(std::move(payload)) {} + py::object payload_; +}; +} // namespace torch::jit + namespace torch_tensorrt { namespace core { namespace runtime { @@ -122,6 +132,8 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = TORCH_LIBRARY(tensorrt, m) { m.def("execute_engine(Tensor[] input_tensors, __torch__.torch.classes.tensorrt.Engine engine) -> Tensor[]"); + m.def("execute_engine.opaque(Tensor[] input_tensors, __torch__.torch.classes.aten.OpaqueObject engine) -> Tensor[]"); + m.def("_wrap_engine(__torch__.torch.classes.tensorrt.Engine engine) -> __torch__.torch.classes.aten.OpaqueObject"); m.def("SERIALIZED_ENGINE_BINDING_DELIM", []() -> std::string { return std::string(1, TRTEngine::BINDING_DELIM); }); m.def("SERIALIZED_RT_DEVICE_DELIM", []() -> std::string { return DEVICE_INFO_DELIM; }); m.def("ABI_VERSION", []() -> std::string { return ABI_VERSION; }); @@ -174,6 +186,22 @@ TORCH_LIBRARY(tensorrt, m) { TORCH_LIBRARY_IMPL(tensorrt, CompositeExplicitAutograd, m) { m.impl("execute_engine", execute_engine); + m.impl( + "execute_engine.opaque", + [](std::vector inputs, c10::intrusive_ptr opaque_engine) { + py::gil_scoped_acquire gil; + auto capsule = py::cast(opaque_engine->payload_); + auto* engine_ptr = static_cast*>(capsule.get_pointer()); + return execute_engine(std::move(inputs), *engine_ptr); + }); + m.impl("_wrap_engine", [](c10::intrusive_ptr engine) -> c10::intrusive_ptr { + py::gil_scoped_acquire gil; + auto* holder = new c10::intrusive_ptr(std::move(engine)); + py::capsule capsule(holder, "TRTEngine", [](PyObject* o) { + delete static_cast*>(PyCapsule_GetPointer(o, "TRTEngine")); + }); + return c10::make_intrusive(py::object(std::move(capsule))); + }); } } // namespace diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index b77291798b..cea4e84449 100644 --- a/py/torch_tensorrt/_compile.py +++ 
b/py/torch_tensorrt/_compile.py @@ -594,7 +594,10 @@ def load( Raises: ValueError: If there is no file or the file is not either a TorchScript file or ExportedProgram file """ - from torch_tensorrt.dynamo._exporter import replace_execute_engine_no_op_node + # Ensure Python TRT engine ops are registered so torch.export.load can + # resolve tensorrt::execute_engine when the C++ runtime is absent. + if not ENABLED_FEATURES.torch_tensorrt_runtime: + import torch_tensorrt.dynamo.runtime._PythonTRTEngine # noqa: F401 try: logger.debug(f"Loading the provided file {file_path} using torch.export.load()") @@ -604,11 +607,17 @@ def load( extra_files=extra_files, **kwargs, ) + # Handle legacy cross-compiled archives that use no_op_placeholder nodes gm = exp_program.graph_module if any( "no_op_placeholder_for_execute_engine" in n.name for n in gm.graph.nodes ): + from torch_tensorrt.dynamo._exporter import ( + replace_execute_engine_no_op_node, + ) + return replace_execute_engine_no_op_node(exp_program) + return exp_program except Exception: import traceback @@ -895,6 +904,7 @@ def _extract_tensor(obj: Any) -> Any: logger.warning( "Provided model is a torch.export.ExportedProgram, inputs or arg_inputs is not necessary during save, it uses the inputs or arg_inputs provided during export and compile" ) + _normalize_engine_constants_to_python(module) if output_format == "exported_program": function_overload_with_kwargs( torch.export.save, @@ -952,6 +962,7 @@ def _extract_tensor(obj: Any) -> Any: dynamic_shapes=dynamic_shapes, use_legacy_exporter=_use_legacy, ) + _normalize_engine_constants_to_python(exp_program) if output_format == "exported_program": function_overload_with_kwargs( torch.export.save, @@ -1031,6 +1042,7 @@ def _extract_tensor(obj: Any) -> Any: strict=False, ) + _normalize_engine_constants_to_python(exp_program) if output_format == "exported_program": function_overload_with_kwargs( torch.export.save, @@ -1056,6 +1068,27 @@ def _extract_tensor(obj: Any) -> Any: ) 
+def _normalize_engine_constants_to_python(exp_program: "ExportedProgram") -> None: + """Convert C++ ``torch.classes.tensorrt.Engine`` constants to Python ``TRTEngine``. + + The C++ runtime stores engine constants as ``torch._C.ScriptObject`` + (``torch.classes.tensorrt.Engine``). Python ``TRTEngine`` is registered as + an opaque type so ``torch.export`` can serialise it with ``pickle``. By + converting before save the artifact is portable across both runtimes. + """ + import base64 + + from torch_tensorrt.dynamo.runtime._PythonTRTEngine import TRTEngine + from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ENGINE_IDX + + for fqn, constant in list(exp_program.constants.items()): + if isinstance(constant, torch._C.ScriptObject): + state = constant.__getstate__() + serialized_info = list(state[0]) + serialized_info[ENGINE_IDX] = base64.b64decode(serialized_info[ENGINE_IDX]) + exp_program.constants[fqn] = TRTEngine(serialized_info) + + def function_overload_with_kwargs( fn: Callable[..., Any], *args: Any, **kwargs: Any ) -> Any: diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py index ba026cd248..f3cdefebe4 100644 --- a/py/torch_tensorrt/dynamo/_exporter.py +++ b/py/torch_tensorrt/dynamo/_exporter.py @@ -484,18 +484,28 @@ def inline_trt_modules( f"trt_module_node: {trt_module_node.name} does not have the metadata which should be set during dynamo compile_module step." ) num_outputs = len(trt_module_node.meta["val"]) + # Insert a call_function node to perform inference on TRT engine with gm.graph.inserting_before(trt_module_node): - # Always embed engine data as primitive string args via no_op_placeholder - # so torch.export does not pickle torch.classes.tensorrt.Engine (which - # requires the C++ TorchBind class at load time). - # torch_tensorrt.load() lowers placeholders → execute_engine. 
- engine_info = trt_module._pack_engine_info() - engine_bytes = engine_info[ENGINE_IDX] - engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8") - trt_node = gm.graph.call_function( - torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default, - (trt_module_node.args, *engine_info), - ) + if cross_compile_module: + engine_info = trt_module._pack_engine_info() + engine_bytes = engine_info[ENGINE_IDX] + engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8") + trt_node = gm.graph.call_function( + torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default, + (trt_module_node.args, *engine_info), + ) + else: + engine_name = f"{name}_engine" + setattr(gm, engine_name, trt_module.engine) + engine_node = gm.graph.get_attr(engine_name) + + trt_node = gm.graph.call_function( + torch.ops.tensorrt.execute_engine.default, + (trt_module_node.args, engine_node), + ) + engine_node.meta["val"] = CustomObjArgument( + name=engine_node.name, class_fqn="" + ) assert num_outputs > 0 trt_node.meta["val"] = trt_module_node.meta["val"] diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py b/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py index acc4cb7bcd..fc1e1bc789 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py @@ -21,6 +21,7 @@ import torch import torch_tensorrt from torch._library.opaque_object import register_opaque_type +from torch._opaque_base import OpaqueBase from torch_tensorrt._enums import dtype from torch_tensorrt.dynamo._defaults import DEBUG_LOGGING_DIR from torch_tensorrt.dynamo._settings import CompilationSettings @@ -127,12 +128,38 @@ def set_runtime_states( ) +# --------------------------------------------------------------------------- +# Pickle reconstruction — returns the right engine type for the current runtime +# --------------------------------------------------------------------------- + + +def 
_reconstruct_trt_engine(serialized_info: List[Any]) -> Any: + """Reconstruct a TRT engine from its serialized info list. + + Called by pickle when deserializing a ``TRTEngine``. Checks which runtime + is available and returns either a C++ ``torch.classes.tensorrt.Engine`` or + a Python ``TRTEngine``, so a single ``.pt2`` artifact is portable across + runtimes. + """ + serialized_info = list(serialized_info) + engine_field = serialized_info[ENGINE_IDX] + if isinstance(engine_field, str): + serialized_info[ENGINE_IDX] = base64.b64decode(engine_field.encode("utf-8")) + elif isinstance(engine_field, bytes) and not engine_field.startswith(b"ftrt"): + serialized_info[ENGINE_IDX] = base64.b64decode(engine_field) + + if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime: + return torch.classes.tensorrt.Engine(tuple(serialized_info)) + + return TRTEngine(serialized_info) + + # --------------------------------------------------------------------------- # TRTEngine (Python implementation) # --------------------------------------------------------------------------- -class TRTEngine: +class TRTEngine(OpaqueBase): """TensorRT engine + execution context, driven from Python TRT APIs. Exposes the same surface as the C++ ``torch.classes.tensorrt.Engine`` TorchBind @@ -192,12 +219,59 @@ def __str__(self) -> str: def __repr__(self) -> str: return self.__str__() + def __reduce__(self) -> Tuple[Any, Tuple[List[Any]]]: + """Pickle protocol: delegates to :func:`_reconstruct_trt_engine`. + + The reconstruction function checks which runtime is available at + load time and returns either a C++ ``torch.classes.tensorrt.Engine`` + or a Python ``TRTEngine``, so a single saved artifact works on both. 
+ """ + state = list(self.serialized_info) + state[ENGINE_IDX] = base64.b64encode(state[ENGINE_IDX]).decode("utf-8") + return (_reconstruct_trt_engine, (state,)) + def __getstate__(self) -> List[Any]: - """Return serialized engine info list for ``torch.save`` / pickling.""" - return list(self.serialized_info) + """Return serialized engine info list for ``torch.save`` / pickling. + + The engine blob at ENGINE_IDX is base64-encoded so the format matches + the C++ ``TRTEngine::serialize()`` / ``def_pickle`` contract. + """ + state = list(self.serialized_info) + state[ENGINE_IDX] = base64.b64encode(state[ENGINE_IDX]).decode("utf-8") + return state def __setstate__(self, serialized_info: List[Any]) -> None: - """Restore engine from serialized info list (``torch.load`` / unpickling).""" + """Restore engine from serialized info list (``torch.load`` / unpickling). + + Mirrors the C++ ``def_pickle`` __setstate__ which base64-decodes ENGINE_IDX. + """ + self._profile_execution = False + self.profile_path_prefix = tempfile.gettempdir() + self.use_pre_allocated_outputs = False + self.use_output_allocator_outputs = False + self.output_tensors_are_unowned = False + self.output_allocator: Optional[DynamicOutputAllocator] = None + self.pre_allocated_outputs: List[torch.Tensor] = [] + self._input_buffers: List[torch.Tensor] = [] + self._output_buffers: List[torch.Tensor] = [] + self._caller_stream: Optional[torch.cuda.Stream] = None + self._engine_stream: Optional[torch.cuda.Stream] = None + self.cudagraph: Optional[torch.cuda.CUDAGraph] = None + self.shape_key: Optional[str] = None + self._empty_tensor_placeholder: Optional[torch.Tensor] = None + self._dynamic_workspace: Optional[torch.Tensor] = None + self.runtime_states = TorchTRTRuntimeStates( + torch_tensorrt.runtime.get_cudagraphs_mode() + ) + self.resource_allocation_strategy = 0 + self._runtime_config = None + + serialized_info = list(serialized_info) + engine_field = serialized_info[ENGINE_IDX] + if 
isinstance(engine_field, str): + serialized_info[ENGINE_IDX] = base64.b64decode(engine_field.encode("utf-8")) + elif isinstance(engine_field, bytes) and not engine_field.startswith(b"ftrt"): + serialized_info[ENGINE_IDX] = base64.b64decode(engine_field) self._load_serialized_info(serialized_info) self._setup_engine() @@ -696,8 +770,9 @@ def execute( return self._execute_standard(contiguous_inputs) +register_opaque_type(TRTEngine, typ="reference") + if not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime: - register_opaque_type(TRTEngine, typ="reference") @torch.library.custom_op( # type: ignore[misc] "tensorrt::execute_engine", mutates_args=() @@ -708,6 +783,33 @@ def execute_engine( outputs = engine.execute(input_tensors) return [outputs] if isinstance(outputs, torch.Tensor) else list(outputs) + @execute_engine.register_fake # type: ignore[misc] + def execute_engine_fake( + input_tensors: List[torch.Tensor], engine: TRTEngine + ) -> List[torch.Tensor]: + """Abstract/fake kernel for ``tensorrt::execute_engine``. + + Called by FakeTensor propagation and ``torch.export`` to infer output + shapes and dtypes without executing the real TRT engine. Output shapes + are obtained by asking the engine's execution context to propagate the + concrete input shapes symbolically; dtypes come from the engine's + pre-parsed output dtype list. + """ + input_shapes = [list(t.shape) for t in input_tensors] + try: + output_shapes = engine.infer_outputs(input_shapes) + except Exception: + # Fall back to the statically-stored shapes when shape inference is + # unavailable (e.g. engine context not yet initialised in meta mode). 
+ output_shapes = [list(s) for s in engine.output_shapes] + + return [ + torch.empty( + shape, dtype=engine.output_dtypes[i], device=input_tensors[0].device + ) + for i, shape in enumerate(output_shapes) + ] + @torch.library.custom_op( # type: ignore[misc] "tensorrt::no_op_placeholder_for_execute_engine", mutates_args=() ) diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index 2cd550fe09..e855fb2a1d 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -245,12 +245,9 @@ def fake_tensorrt_execute_engine( Uses symbolic shape expressions captured at compile time to correctly infer output shapes while preserving symbolic SymInt relationships. """ - shape_info = _shape_info_from_trt_engine(fake_trt_engine) if shape_info: - # Apply the symbolic shape expressions to create output fake tensors - # shape_info now contains both 'inputs' and 'outputs' keys return _apply_symbolic_shape_expressions(inputs, shape_info) else: raise RuntimeError( diff --git a/third_party/libtorch/BUILD b/third_party/libtorch/BUILD index 37309f7209..72ad612015 100644 --- a/third_party/libtorch/BUILD +++ b/third_party/libtorch/BUILD @@ -87,6 +87,22 @@ cc_library( strip_include_prefix = "include", ) +cc_library( + name = "pybind11", + hdrs = glob( + ["include/pybind11/**/*.h"], + allow_empty = True, + ) + glob( + ["python_include/**/*.h"], + allow_empty = True, + ), + includes = [ + "include", + "python_include", + ], + deps = [":torch"], +) + cc_library( name = "caffe2", srcs = select({ diff --git a/toolchains/local_torch.bzl b/toolchains/local_torch.bzl index 52eb641c93..88d98438cb 100644 --- a/toolchains/local_torch.bzl +++ b/toolchains/local_torch.bzl @@ -55,9 +55,9 @@ def _find_python(ctx): def _local_torch_impl(ctx): # 1. 
Env-var override (takes priority over auto-detection) torch_dir = ctx.os.environ.get("TORCH_PATH", "").strip() + python = _find_python(ctx) if not torch_dir: - python = _find_python(ctx) if not python: fail( "Cannot locate a Python interpreter that has torch installed. " + @@ -82,6 +82,14 @@ def _local_torch_impl(ctx): if child.exists: ctx.symlink(child, sub) + # Symlink Python headers so pybind11 can find Python.h + if python: + result = ctx.execute([python, "-c", "import sysconfig; print(sysconfig.get_path('include'))"]) + if result.return_code == 0: + python_inc = ctx.path(result.stdout.strip()) + if python_inc.exists: + ctx.symlink(python_inc, "python_include") + ctx.file("BUILD", ctx.read(Label("@//third_party/libtorch:BUILD"))) local_torch = repository_rule(