diff --git a/MODULE.bazel b/MODULE.bazel index 50381160aa..bcbb1efa1c 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -26,6 +26,13 @@ new_local_repository = use_repo_rule("@bazel_tools//tools/build_defs/repo:local. local_torch = use_repo_rule("//toolchains:local_torch.bzl", "local_torch") +local_executorch = use_repo_rule("//toolchains:local_executorch.bzl", "local_executorch") + +# Detect the locally installed ExecuTorch source tree at build time. +# Set EXECUTORCH_PATH to the directory containing runtime/, extension/, cmake-out/. +# Requires cmake-out/libexecutorch_core.a to be built first. +local_executorch(name = "executorch") + # External dependency for torch_tensorrt if you already have precompiled binaries. new_local_repository( name = "torch_tensorrt", @@ -77,6 +84,15 @@ local_torch(name = "libtorch") # build_file = "third_party/libtorch/BUILD" #) +# ExecuTorch source tree. The repository root is the *parent* of the +# executorch/ directory so that headers resolve as . +# Requires a cmake-out/ build inside the executorch source tree. +#new_local_repository( +# name = "executorch", +# build_file = "@//third_party/executorch:BUILD", +# path = "/home/lanl/git/executorch", +#) + #new_local_repository( # name = "tensorrt", # path = "/usr/", diff --git a/core/runtime/BUILD b/core/runtime/BUILD index 19260149ae..ef6ae94e00 100644 --- a/core/runtime/BUILD +++ b/core/runtime/BUILD @@ -1,6 +1,7 @@ load("@rules_cc//cc:defs.bzl", "cc_library") load("@rules_pkg//:pkg.bzl", "pkg_tar") load("@rules_pkg//pkg:mappings.bzl", "pkg_files") + package(default_visibility = ["//visibility:public"]) config_setting( @@ -58,8 +59,13 @@ config_setting( ], ) +# runtime_base: TRTEngine + device management + serialization utilities. 
+# Does NOT include register_jit_hooks.cpp (TorchScript torch::class_ / +# TORCH_LIBRARY registrations), so it can be linked into +# libtrt_executorch_backend.so without causing a duplicate-registration +# crash when libtorchtrt.so is also loaded in the same process. cc_library( - name = "runtime", + name = "runtime_base", srcs = [ "DeviceList.cpp", "Platform.cpp", @@ -67,8 +73,8 @@ cc_library( "TRTEngine.cpp", "TRTEngineProfiler.cpp", "execute_engine.cpp", - "register_jit_hooks.cpp", "runtime.cpp", + "runtime_utils.cpp", ], hdrs = [ "Platform.h", @@ -100,6 +106,26 @@ cc_library( alwayslink = True, ) +# runtime: full runtime including TorchScript torch::class_ / TORCH_LIBRARY +# registrations. Used by the main libtorchtrt.so. +cc_library( + name = "runtime", + srcs = [ + "register_jit_hooks.cpp", + ], + hdrs = [ + "Platform.h", + "RTDevice.h", + "TRTEngine.h", + "TRTEngineProfiler.h", + "runtime.h", + ], + deps = [ + ":runtime_base", + ], + alwayslink = True, +) + filegroup( name = "include_files", srcs = [ @@ -121,6 +147,6 @@ pkg_tar( pkg_files( name = "include_pkg_files", srcs = [":include_files"], - visibility = ["//visibility:public"], prefix = "include/torch_tensorrt/core/runtime/", + visibility = ["//visibility:public"], ) diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index d29daa112b..bc96a7a8ca 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -447,6 +447,29 @@ TRTEngine& TRTEngine::operator=(const TRTEngine& other) { } void TRTEngine::verify_serialization_fmt(const std::vector& serialized_info) { + static const char* kIndexNames[] = { + "ABI_TARGET_IDX", + "NAME_IDX", + "DEVICE_IDX", + "ENGINE_IDX", + "INPUT_BINDING_NAMES_IDX", + "OUTPUT_BINDING_NAMES_IDX", + "HW_COMPATIBLE_IDX", + "SERIALIZED_METADATA_IDX", + "TARGET_PLATFORM_IDX", + "REQUIRES_OUTPUT_ALLOCATOR_IDX", + "RESOURCE_ALLOCATION_STRATEGY_IDX", + }; + fprintf(stderr, "[verify_serialization_fmt] %zu entries (expected %d):\n", serialized_info.size(), 
SERIALIZATION_LEN); + for (size_t i = 0; i < serialized_info.size(); ++i) { + const char* name = (i < sizeof(kIndexNames) / sizeof(kIndexNames[0])) ? kIndexNames[i] : "?"; + if (i == ENGINE_IDX) { + fprintf(stderr, " [%zu] %-35s = \n", i, name, serialized_info[i].size()); + } else { + fprintf(stderr, " [%zu] %-35s = \"%s\"\n", i, name, serialized_info[i].c_str()); + } + } + TORCHTRT_CHECK( serialized_info.size() == SERIALIZATION_LEN, "Program to be deserialized targets an incompatible Torch-TensorRT ABI"); diff --git a/core/runtime/executorch/BUILD b/core/runtime/executorch/BUILD new file mode 100644 index 0000000000..93b5cf9b2e --- /dev/null +++ b/core/runtime/executorch/BUILD @@ -0,0 +1,82 @@ +load("@rules_cc//cc:defs.bzl", "cc_library") + +package(default_visibility = ["//visibility:public"]) + +config_setting( + name = "use_torch_whl", + flag_values = { + "//toolchains/dep_src:torch": "whl", + }, +) + +config_setting( + name = "rtx_x86_64", + constraint_values = [ + "@platforms//cpu:x86_64", + "@platforms//os:linux", + ], + flag_values = { + "//toolchains/dep_collection:compute_libs": "rtx", + }, +) + +config_setting( + name = "rtx_win", + constraint_values = [ + "@platforms//os:windows", + ], + flag_values = { + "//toolchains/dep_collection:compute_libs": "rtx", + }, +) + +config_setting( + name = "sbsa", + constraint_values = [ + "@platforms//cpu:aarch64", + ], + flag_values = { + "//toolchains/dep_collection:compute_libs": "default", + }, +) + +config_setting( + name = "jetpack", + constraint_values = [ + "@platforms//cpu:aarch64", + ], + flag_values = { + "//toolchains/dep_collection:compute_libs": "jetpack", + }, +) + +config_setting( + name = "windows", + constraint_values = [ + "@platforms//os:windows", + ], +) + +cc_library( + name = "tensorrt_executorch_backend", + srcs = ["TensorRTBackend.cpp"], + hdrs = ["TensorRTBackend.h"], + # Use executorch_headers (no static link) so that register_backend / + # find_backend / FreeableBuffer etc. 
remain undefined symbols in the + # final libtrt_executorch_backend.so. At runtime they are resolved from + # libqnn_executorch_backend.so, which _portable_lib.so loads with + # RTLD_GLOBAL, ensuring both share the same registry instance. + deps = [ + "//core/runtime:runtime_base", + "//core/util:prelude", + "@executorch//:executorch_headers", + ] + select({ + ":jetpack": ["@tensorrt_l4t//:nvinfer"], + ":rtx_win": ["@tensorrt_rtx_win//:nvinfer"], + ":rtx_x86_64": ["@tensorrt_rtx//:nvinfer"], + ":sbsa": ["@tensorrt_sbsa//:nvinfer"], + ":windows": ["@tensorrt_win//:nvinfer"], + "//conditions:default": ["@tensorrt//:nvinfer"], + }), + alwayslink = True, +) diff --git a/core/runtime/executorch/TensorRTBackend.cpp b/core/runtime/executorch/TensorRTBackend.cpp index 93f97dc8ce..ea8e76787f 100644 --- a/core/runtime/executorch/TensorRTBackend.cpp +++ b/core/runtime/executorch/TensorRTBackend.cpp @@ -19,6 +19,11 @@ #include #include +// RTDevice and Platform must be included before TRTEngine.h because TRTEngine.h +// references them without including their headers directly (Bazel handles this +// via transitive deps, but a standalone compile needs them explicit). +#include "core/runtime/Platform.h" +#include "core/runtime/RTDevice.h" #include "core/runtime/TRTEngine.h" #include "core/util/prelude.h" @@ -116,7 +121,18 @@ bool TensorRTBackend::is_available() const { // provided by the ExecuTorch MemoryAllocator so that ExecuTorch owns the // lifetime; destroy() calls the destructor explicitly. 
// --------------------------------------------------------------------------- -Result TensorRTBackend::init(BackendInitContext& context, FreeableBuffer* processed) const { +Result TensorRTBackend::init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const { + (void)compile_specs; + ET_LOG(Info, "TensorRTBackend::init: enter"); + + if (!is_available()) { + ET_LOG(Error, "TensorRT backend is not available"); + return Error::NotSupported; + } + if (processed == nullptr || processed->data() == nullptr) { ET_LOG(Error, "TensorRTBackend::init: null processed buffer"); return Error::InvalidArgument; @@ -125,19 +141,31 @@ Result TensorRTBackend::init(BackendInitContext& context, Freea auto serialized_info = deserialize_engine_info(processed->data(), processed->size()); if (serialized_info.empty()) { + fprintf(stderr, "[TensorRTBackend::init] FAIL: deserialize_engine_info returned empty\n"); ET_LOG(Error, "TensorRTBackend::init: failed to deserialize engine blob"); return Error::InvalidArgument; } + ET_LOG(Info, "TensorRTBackend::init: deserialized %zu entries", serialized_info.size()); // Validate the vector length before handing to TRTEngine // (verify_serialization_fmt throws on mismatch) - core::runtime::TRTEngine::verify_serialization_fmt(serialized_info); + ET_LOG(Info, "TensorRTBackend::init: calling verify_serialization_fmt"); + try { + core::runtime::TRTEngine::verify_serialization_fmt(serialized_info); + } catch (const std::exception& e) { + ET_LOG(Error, "TensorRTBackend::init: verify_serialization_fmt threw: %s", e.what()); + return Error::InvalidArgument; + } catch (...) 
{ + ET_LOG(Error, "TensorRTBackend::init: verify_serialization_fmt threw unknown exception"); + return Error::InvalidArgument; + } MemoryAllocator* allocator = context.get_runtime_allocator(); if (allocator == nullptr) { ET_LOG(Error, "TensorRTBackend::init: null runtime allocator"); return Error::InvalidState; } + ET_LOG(Info, "TensorRTBackend::init: got allocator"); // Allocate raw storage for TRTEngine from ExecuTorch's arena core::runtime::TRTEngine* engine = allocator->allocateInstance(); @@ -145,11 +173,23 @@ Result TensorRTBackend::init(BackendInitContext& context, Freea ET_LOG(Error, "TensorRTBackend::init: allocateInstance failed"); return Error::MemoryAllocationFailed; } + ET_LOG(Info, "TensorRTBackend::init: allocated engine storage at %p", (void*)engine); // Construct in-place; TRTEngine(std::vector) deserializes the // engine bytes, builds the IRuntime/ICudaEngine/IExecutionContext, and // populates in_binding_names / out_binding_names / num_io. - new (engine) core::runtime::TRTEngine(std::move(serialized_info)); + ET_LOG(Info, "TensorRTBackend::init: constructing TRTEngine in-place"); + try { + new (engine) core::runtime::TRTEngine(std::move(serialized_info)); + } catch (const std::exception& e) { + fprintf(stderr, "[TensorRTBackend::init] FAIL: TRTEngine constructor threw: %s\n", e.what()); + ET_LOG(Error, "TensorRTBackend::init: TRTEngine constructor threw: %s", e.what()); + return Error::InvalidArgument; + } catch (...) 
{ + fprintf(stderr, "[TensorRTBackend::init] FAIL: TRTEngine constructor threw unknown exception\n"); + ET_LOG(Error, "TensorRTBackend::init: TRTEngine constructor threw unknown exception"); + return Error::InvalidArgument; + } // Release the blob; we no longer need it processed->Free(); @@ -178,28 +218,58 @@ Result TensorRTBackend::init(BackendInitContext& context, Freea // --------------------------------------------------------------------------- Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* handle, Span args) const { (void)context; + fprintf(stderr, "[TensorRTBackend::execute] enter: handle=%p args.size()=%zu\n", (void*)handle, args.size()); + ET_LOG(Info, "TensorRTBackend::execute: enter"); if (handle == nullptr) { ET_LOG(Error, "TensorRTBackend::execute: null delegate handle"); return Error::InvalidArgument; } - + ET_LOG(Info, "TensorRTBackend::execute: got delegate handle"); auto* engine = static_cast(handle); const size_t num_inputs = engine->num_io.first; const size_t num_outputs = engine->num_io.second; + ET_LOG(Info, "TensorRTBackend::execute: got num_inputs %zu and num_outputs %zu", num_inputs, num_outputs); if (args.size() < num_inputs + num_outputs) { ET_LOG( Error, "TensorRTBackend::execute: expected at least %zu args, got %zu", num_inputs + num_outputs, args.size()); return Error::InvalidArgument; } - + ET_LOG(Info, "TensorRTBackend::execute: got engine"); // IExecutionContext::enqueueV3 is not thread-safe; use the engine mutex std::unique_lock lock(engine->mu); nvinfer1::IExecutionContext* ctx = engine->exec_ctx.get(); + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast(engine->device_info.id)); + + // ExecuTorch's portable runtime pre-allocates output tensors as CPU buffers. + // TRT requires CUDA device pointers for all bindings. We use + // cudaPointerGetAttributes to detect CPU pointers and stage them through + // temporary CUDA allocations, copying back after inference. 
+ auto is_cuda_ptr = [](const void* ptr) -> bool { + if (ptr == nullptr) + return false; + cudaPointerAttributes attrs{}; + cudaError_t err = cudaPointerGetAttributes(&attrs, ptr); + return err == cudaSuccess && attrs.type == cudaMemoryTypeDevice; + }; + + std::vector temp_input_bufs(num_inputs, nullptr); + std::vector temp_output_bufs(num_outputs, nullptr); + + // Cleanup helper – called on every return path. + auto free_temp = [&]() { + for (void* p : temp_input_bufs) + if (p) + cudaFree(p); + for (void* p : temp_output_bufs) + if (p) + cudaFree(p); + }; + // ------------------------------------------------------------------ // 1. Bind input shapes and addresses // ------------------------------------------------------------------ @@ -207,6 +277,7 @@ Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* EValue* arg = args[i]; if (arg == nullptr || !arg->isTensor()) { ET_LOG(Error, "TensorRTBackend::execute: input %zu is not a tensor", i); + free_temp(); return Error::InvalidArgument; } @@ -216,18 +287,31 @@ Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* if (!ctx->setInputShape(name.c_str(), dims)) { ET_LOG(Error, "TensorRTBackend::execute: setInputShape failed for '%s'", name.c_str()); + free_temp(); return Error::InvalidState; } - void* ptr = et_in.mutable_data_ptr(); - // TRT requires a non-null address even for 0-element tensors + void* src_ptr = et_in.mutable_data_ptr(); + void* trt_ptr = src_ptr; + static char placeholder[16] = {}; - if (ptr == nullptr || et_in.numel() == 0) { - ptr = placeholder; + if (src_ptr == nullptr || et_in.numel() == 0) { + trt_ptr = placeholder; + } else if (!is_cuda_ptr(src_ptr)) { + // CPU input: stage to a temporary CUDA buffer + size_t nbytes = et_in.nbytes(); + if (cudaMalloc(&temp_input_bufs[i], nbytes) != cudaSuccess) { + ET_LOG(Error, "TensorRTBackend::execute: cudaMalloc failed for input %zu", i); + free_temp(); + return Error::MemoryAllocationFailed; + } + 
cudaMemcpyAsync(temp_input_bufs[i], src_ptr, nbytes, cudaMemcpyHostToDevice, stream); + trt_ptr = temp_input_bufs[i]; } - if (!ctx->setTensorAddress(name.c_str(), ptr)) { + if (!ctx->setTensorAddress(name.c_str(), trt_ptr)) { ET_LOG(Error, "TensorRTBackend::execute: setTensorAddress failed for input '%s'", name.c_str()); + free_temp(); return Error::InvalidState; } } @@ -241,26 +325,58 @@ Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* const int32_t n_unresolved = ctx->inferShapes(io_size, unresolved.data()); if (n_unresolved != 0) { ET_LOG(Error, "TensorRTBackend::execute: inferShapes could not resolve %d tensor(s)", n_unresolved); + free_temp(); return Error::InvalidState; } } // ------------------------------------------------------------------ - // 3. Bind output addresses (ExecuTorch pre-allocates the buffers) + // 3. Bind output addresses + // ExecuTorch pre-allocates output tensors at the maximum shape for + // dynamic models. After inferShapes() TRT knows the actual output + // dims, so update the ExecuTorch TensorImpl's sizes before computing + // nbytes() and before the Python binding reads back the shape. + // If the buffer is CPU, stage through a temporary CUDA allocation. // ------------------------------------------------------------------ for (size_t o = 0; o < num_outputs; ++o) { EValue* arg = args[num_inputs + o]; if (arg == nullptr || !arg->isTensor()) { ET_LOG(Error, "TensorRTBackend::execute: output %zu is not a tensor", o); + free_temp(); return Error::InvalidArgument; } exec_aten::Tensor et_out = arg->toTensor(); const std::string& name = engine->out_binding_names[o]; - void* ptr = et_out.mutable_data_ptr(); - if (!ctx->setTensorAddress(name.c_str(), ptr)) { + // Update the ExecuTorch tensor shape to the actual TRT output shape. + // getTensorShape() is valid after inferShapes() has been called. 
+ nvinfer1::Dims actual_dims = ctx->getTensorShape(name.c_str()); + if (actual_dims.nbDims > 0) { + exec_aten::SizesType new_sizes[nvinfer1::Dims::MAX_DIMS]; + for (int d = 0; d < actual_dims.nbDims; ++d) { + new_sizes[d] = static_cast(actual_dims.d[d]); + } + et_out.unsafeGetTensorImpl()->set_sizes_contiguous({new_sizes, static_cast(actual_dims.nbDims)}); + } + + void* dst_ptr = et_out.mutable_data_ptr(); + void* trt_ptr = dst_ptr; + + if (!is_cuda_ptr(dst_ptr)) { + // CPU output buffer: allocate temporary CUDA memory for TRT to write into + size_t nbytes = et_out.nbytes(); // uses updated shape + if (cudaMalloc(&temp_output_bufs[o], nbytes) != cudaSuccess) { + ET_LOG(Error, "TensorRTBackend::execute: cudaMalloc failed for output %zu", o); + free_temp(); + return Error::MemoryAllocationFailed; + } + trt_ptr = temp_output_bufs[o]; + } + + if (!ctx->setTensorAddress(name.c_str(), trt_ptr)) { ET_LOG(Error, "TensorRTBackend::execute: setTensorAddress failed for output '%s'", name.c_str()); + free_temp(); return Error::InvalidState; } } @@ -268,16 +384,26 @@ Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* // ------------------------------------------------------------------ // 4. Enqueue inference on the current CUDA stream // ------------------------------------------------------------------ - cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast(engine->device_info.id)); - if (!ctx->enqueueV3(stream)) { ET_LOG(Error, "TensorRTBackend::execute: enqueueV3 failed"); + free_temp(); return Error::InvalidState; } - // Synchronize so that outputs are visible to downstream ExecuTorch ops + // ------------------------------------------------------------------ + // 5. 
Copy temporary CUDA outputs back to the ExecuTorch CPU buffers + // ------------------------------------------------------------------ + for (size_t o = 0; o < num_outputs; ++o) { + if (temp_output_bufs[o] != nullptr) { + exec_aten::Tensor et_out = args[num_inputs + o]->toTensor(); + cudaMemcpyAsync(et_out.mutable_data_ptr(), temp_output_bufs[o], et_out.nbytes(), cudaMemcpyDeviceToHost, stream); + } + } + + // Synchronize so outputs are visible to downstream ExecuTorch ops cudaStreamSynchronize(stream); + free_temp(); return Error::Ok; } diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index e8f6217a21..bffb50f03e 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -4,67 +4,14 @@ #include "core/runtime/runtime.h" #include "core/util/macros.h" +// serialize_bindings / base64_encode / base64_decode are defined in +// runtime_utils.cpp so the ExecuTorch backend can link them without +// pulling in the torch::class_ registration below. 
+ namespace torch_tensorrt { namespace core { namespace runtime { -std::string serialize_bindings(const std::vector& bindings) { - std::stringstream ss; - for (size_t i = 0; i < bindings.size() - 1; i++) { - ss << bindings[i] << TRTEngine::BINDING_DELIM; - } - ss << bindings[bindings.size() - 1]; - - std::string serialized_binding_info = ss.str(); - - LOG_DEBUG("Serialized Binding Info: " << serialized_binding_info); - - return serialized_binding_info; -} - -static const std::string sym_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; //= -std::string base64_encode(const std::string& in) { - std::string out; - int64_t val = 0, valb = -6; - for (unsigned char c : in) { - val = (val << 8) + c; - valb += 8; - while (valb >= 0) { - out.push_back(sym_table[(val >> valb) & 0x3F]); - valb -= 6; - } - } - if (valb > -6) { - out.push_back(sym_table[((val << 8) >> (valb + 8)) & 0x3F]); - }; - while (out.size() % 4) { - out.push_back('='); - } - return out; -} - -std::string base64_decode(const std::string& in) { - std::string out; - std::vector T(256, -1); - for (int i = 0; i < 64; i++) { - T[sym_table[i]] = i; - } - - int64_t val = 0, valb = -8; - for (unsigned char c : in) { - if (T[c] == -1) { - break; - } - val = (val << 6) + T[c]; - valb += 6; - if (valb >= 0) { - out.push_back(char((val >> valb) & 0xFF)); - valb -= 8; - } - } - return out; -} - namespace { // TODO: Implement a call method // c10::List TRTEngine::Run(c10::List inputs) { diff --git a/core/runtime/runtime_utils.cpp b/core/runtime/runtime_utils.cpp new file mode 100644 index 0000000000..77b3df07ba --- /dev/null +++ b/core/runtime/runtime_utils.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + * + * Serialization utilities shared by TRTEngine and register_jit_hooks. 
+ * Kept in a separate translation unit so that the ExecuTorch backend + * (libtrt_executorch_backend.so) can link these without pulling in the + * TorchScript torch::class_ / TORCH_LIBRARY registrations in + * register_jit_hooks.cpp, which would cause a duplicate-registration + * crash when libtorchtrt.so is also loaded in the same process. + */ + +#include +#include +#include + +#include "core/runtime/runtime.h" +#include "core/util/macros.h" + +namespace torch_tensorrt { +namespace core { +namespace runtime { + +std::string serialize_bindings(const std::vector& bindings) { + std::stringstream ss; + for (size_t i = 0; i < bindings.size() - 1; i++) { + ss << bindings[i] << TRTEngine::BINDING_DELIM; + } + ss << bindings[bindings.size() - 1]; + + std::string serialized_binding_info = ss.str(); + + LOG_DEBUG("Serialized Binding Info: " << serialized_binding_info); + + return serialized_binding_info; +} + +// Base64 alphabet (RFC 4648 §4) +static const std::string sym_table = // NOLINT(cert-err58-cpp) + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; //= + +std::string base64_encode(const std::string& in) { + std::string out; + int64_t val = 0, valb = -6; + for (unsigned char c : in) { + val = (val << 8) + c; + valb += 8; + while (valb >= 0) { + out.push_back(sym_table[(val >> valb) & 0x3F]); + valb -= 6; + } + } + if (valb > -6) { + out.push_back(sym_table[((val << 8) >> (valb + 8)) & 0x3F]); + } + while (out.size() % 4) { + out.push_back('='); + } + return out; +} + +std::string base64_decode(const std::string& in) { + std::string out; + std::vector T(256, -1); + for (int i = 0; i < 64; i++) { + T[sym_table[i]] = i; + } + + int64_t val = 0, valb = -8; + for (unsigned char c : in) { + if (T[c] == -1) { + break; + } + val = (val << 6) + T[c]; + valb += 6; + if (valb >= 0) { + out.push_back(char((val >> valb) & 0xFF)); + valb -= 8; + } + } + return out; +} + +} // namespace runtime +} // namespace core +} // namespace torch_tensorrt diff --git 
a/cpp/lib/BUILD b/cpp/lib/BUILD index 9054cd93d0..cdbb40b825 100644 --- a/cpp/lib/BUILD +++ b/cpp/lib/BUILD @@ -1,5 +1,6 @@ load("@rules_cc//cc:defs.bzl", "cc_binary") load("@rules_pkg//pkg:mappings.bzl", "pkg_files") + package(default_visibility = ["//visibility:public"]) cc_binary( @@ -54,10 +55,25 @@ cc_binary( ], ) +cc_binary( + name = "libtrt_executorch_backend.so", + srcs = [], + # Allow undefined symbols: ExecuTorch runtime functions (register_backend, + # find_backend, FreeableBuffer, …) are intentionally left unresolved at + # link time. They are resolved at dlopen() time from + # libqnn_executorch_backend.so, which _portable_lib.so has already loaded + # into the process with RTLD_GLOBAL. + linkopts = ["-Wl,--allow-shlib-undefined"], + linkshared = True, + linkstatic = True, + deps = [ + "//core/runtime/executorch:tensorrt_executorch_backend", + ], +) pkg_files( name = "lib_pkg_files", srcs = ["torchtrt.dll"], - visibility = ["//visibility:public"], prefix = "lib/", -) \ No newline at end of file + visibility = ["//visibility:public"], +) diff --git a/examples/torchtrt_executorch_example/export_dynamic_shape.py b/examples/torchtrt_executorch_example/export_dynamic_shape.py new file mode 100644 index 0000000000..62b71cac9e --- /dev/null +++ b/examples/torchtrt_executorch_example/export_dynamic_shape.py @@ -0,0 +1,84 @@ +""" +.. _executorch_export_dynamic: + +Saving a Torch-TensorRT Model with Dynamic Shapes in ExecuTorch Format (.pte) +============================================================================== + +This example demonstrates how to compile a model with Torch-TensorRT using +dynamic (range-based) input shapes and save it as an ExecuTorch ``.pte`` file. + +The TRT engine is built with a shape profile: batch size can vary between 1 +and 8, and the spatial dimensions between 2 and 8, while the channel dimension +is fixed at 3. The ExecuTorch runtime will select the correct binding sizes +at execute() time based on the actual input shapes. 
+ +Prerequisites +------------- +Install ExecuTorch before running this example:: + + pip install executorch + +See https://pytorch.org/executorch/stable/getting-started-setup.html for details. +""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +import torch +import torch_tensorrt + + +class MyModel(torch.nn.Module): + def forward(self, x): + return x + 1 + + +# %% +# Compile with Torch-TensorRT using dynamic shapes +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# torch_tensorrt.Input with min/opt/max shapes builds a TRT optimization +# profile that covers the full range. The exported program is traced at the +# opt shape; min/max define the runtime range accepted by the engine. + +with torch.no_grad(): + model = MyModel().eval().cuda() + # Trace at the opt shape + opt_input = (torch.randn((4, 3, 4, 4)).cuda(),) + + # Mark variable dimensions so the ExecuTorch .pte allocates them as + # resizable tensors. Without dynamic_shapes the portable runtime + # pre-allocates fixed-size tensors at the export shape and rejects + # inputs of any other size at execute() time. 
+ batch = torch.export.Dim("batch", min=1, max=8) + spatial = torch.export.Dim("spatial", min=2, max=8) + dynamic_shapes = {"x": {0: batch, 2: spatial, 3: spatial}} + + exported_program = torch.export.export( + model, opt_input, dynamic_shapes=dynamic_shapes + ) + compile_settings = { + "arg_inputs": [ + torch_tensorrt.Input( + min_shape=(1, 3, 2, 2), + opt_shape=(4, 3, 4, 4), + max_shape=(8, 3, 8, 8), + dtype=torch.float32, + ), + ], + "min_block_size": 1, + } + trt_gm = torch_tensorrt.dynamo.compile(exported_program, **compile_settings) + + # %% + # Save as ExecuTorch .pte format + # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + torch_tensorrt.save( + trt_gm, + "model_dynamic.pte", + output_format="executorch", + arg_inputs=opt_input, + retrace=False, + ) + + print("Saved model_dynamic.pte successfully.") diff --git a/examples/torchtrt_executorch_example/load_dynamic_shape.py b/examples/torchtrt_executorch_example/load_dynamic_shape.py new file mode 100644 index 0000000000..a7fae2d9e6 --- /dev/null +++ b/examples/torchtrt_executorch_example/load_dynamic_shape.py @@ -0,0 +1,82 @@ +""" +.. _executorch_load_dynamic: + +Loading a Torch-TensorRT Dynamic-Shape Model from ExecuTorch Format (.pte) +=========================================================================== + +This example demonstrates how to load a ``.pte`` file produced by +``export_dynamic_shape.py`` and run inference at several different input +shapes within the compiled min/max range. + +Prerequisites +------------- +- ExecuTorch installed with a runtime that includes the TensorRT backend. +- Run ``export_dynamic_shape.py`` first to produce ``model_dynamic.pte``. +""" + +# %% +# Imports +# ^^^^^^^ + +import ctypes +import os + +import torch +import torch_tensorrt # noqa: F401 -- loads libtorchtrt.so / libtorchtrt_runtime.so + +# libqnn_executorch_backend.so carries the ExecuTorch runtime (including +# executorch::runtime::internal::vlogf and register_backend). 
It must be +# loaded with RTLD_GLOBAL so its symbols are visible to subsequently +# dlopen'd libraries (libtrt_executorch_backend.so and portable_lib). +_executorch_path = os.environ.get("EXECUTORCH_PATH", "/home/lanl/git/executorch") +ctypes.CDLL( + os.path.join(_executorch_path, "backends/qualcomm/libqnn_executorch_backend.so"), + mode=ctypes.RTLD_GLOBAL, +) + +# Load the TensorRT ExecuTorch backend shared library, which runs a static +# initializer that calls executorch::runtime::register_backend("TensorRTBackend"). +# Resolves runtime symbols from libqnn_executorch_backend.so loaded above. +_lib_dir = os.path.join(os.path.dirname(torch_tensorrt.__file__), "lib") +ctypes.CDLL(os.path.join(_lib_dir, "libtrt_executorch_backend.so")) + +from executorch.extension.pybindings import portable_lib as runtime + +# %% +# Load the .pte file +# ^^^^^^^^^^^^^^^^^^ + +executorch_module = runtime._load_for_executorch("model_dynamic.pte") + +# %% +# Run inference at multiple shapes within the compiled profile +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Inputs must be CPU tensors; execute() stages them to CUDA internally. 
+# All shapes must lie within the min/max range used at export time: +# batch: [1, 8] channels: 3 (fixed) spatial: [2, 8] + + +class MyModel(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + 1 + + +ref_model = MyModel().eval() + +test_shapes = [ + (1, 3, 2, 2), # minimum shape + (4, 3, 4, 4), # opt shape (used at export time) + (8, 3, 8, 8), # maximum shape + (2, 3, 6, 6), # arbitrary shape within range +] + +for shape in test_shapes: + example_input = torch.randn(shape) # CPU tensor + outputs = executorch_module.forward([example_input]) + with torch.no_grad(): + expected = ref_model(example_input) + + torch.testing.assert_close(outputs[0], expected, rtol=1e-3, atol=1e-3) + print(f"shape={shape} output={outputs[0].shape} dtype={outputs[0].dtype} OK") + +print("All dynamic-shape inference runs passed.") diff --git a/examples/torchtrt_executorch_example/load_static_shape.py b/examples/torchtrt_executorch_example/load_static_shape.py new file mode 100644 index 0000000000..ac27fcbbd9 --- /dev/null +++ b/examples/torchtrt_executorch_example/load_static_shape.py @@ -0,0 +1,80 @@ +""" +.. _executorch_load: + +Loading a Torch-TensorRT Model from ExecuTorch Format (.pte) +============================================================= + +This example demonstrates how to load a ``.pte`` file produced by +``export_static_shape.py`` and run inference with the ExecuTorch runtime. + +Prerequisites +------------- +- ExecuTorch installed with a runtime that includes the TensorRT backend. +- Run ``export_static_shape.py`` first to produce ``model.pte``. +""" + +# %% +# Imports +# ^^^^^^^ + +import ctypes +import os + +import torch +import torch_tensorrt # noqa: F401 -- loads libtorchtrt.so / libtorchtrt_runtime.so + +# libqnn_executorch_backend.so carries the ExecuTorch runtime (including +# executorch::runtime::internal::vlogf and register_backend). 
It must be +# loaded with RTLD_GLOBAL so its symbols are visible to subsequently +# dlopen'd libraries (libtrt_executorch_backend.so and portable_lib). +_executorch_path = os.environ.get("EXECUTORCH_PATH", "/home/lanl/git/executorch") +ctypes.CDLL( + os.path.join(_executorch_path, "backends/qualcomm/libqnn_executorch_backend.so"), + mode=ctypes.RTLD_GLOBAL, +) + +# Load the TensorRT ExecuTorch backend shared library, which runs a static +# initializer that calls executorch::runtime::register_backend("TensorRTBackend"). +# Resolves runtime symbols from libqnn_executorch_backend.so loaded above. +_lib_dir = os.path.join(os.path.dirname(torch_tensorrt.__file__), "lib") +ctypes.CDLL(os.path.join(_lib_dir, "libtrt_executorch_backend.so")) + +from executorch.extension.pybindings import portable_lib as runtime + +# %% +# Load the .pte file +# ^^^^^^^^^^^^^^^^^^ +# _load_for_executorch returns an ExecuTorchModule whose methods mirror the +# original model's exported methods (e.g. "forward"). + +executorch_module = runtime._load_for_executorch("model.pte") +# %% +# Run inference +# ^^^^^^^^^^^^^ +# Inputs must be passed as a list of tensors matching the static shapes used +# at export time: (2, 3, 4, 4) float32 on CUDA. 
+
+example_input = torch.randn(
+    (2, 3, 4, 4)
+)  # CPU tensor; execute() stages it to CUDA internally
+outputs = executorch_module.forward([example_input])
+
+print("Output shape:", outputs[0].shape)
+print("Output dtype:", outputs[0].dtype)
+
+# %%
+# Verify against eager mode
+# ^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+class MyModel(torch.nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x + 1
+
+
+model = MyModel().eval()
+with torch.no_grad():
+    expected = model(example_input)
+
+torch.testing.assert_close(outputs[0], expected, rtol=1e-3, atol=1e-3)
+print("Output matches eager mode.")
diff --git a/third_party/executorch/BUILD b/third_party/executorch/BUILD
new file mode 100644
index 0000000000..470e2088f2
--- /dev/null
+++ b/third_party/executorch/BUILD
@@ -0,0 +1,26 @@
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+package(default_visibility = ["//visibility:public"])
+
+# This BUILD file is used both by:
+#   - toolchains/local_executorch.bzl (symlinks runtime/ and extension/
+#     into the synthetic repo root)
+#   - new_local_repository(path = "${EXECUTORCH_PATH}") (repo root IS the
+#     executorch source tree)
+#
+# In both cases the repo root contains runtime/ and extension/.
+# include_prefix = "executorch" remaps runtime/... → executorch/runtime/...
+
+# Headers-only: provides include paths for compilation but does NOT link any
+# static ExecuTorch library. ExecuTorch runtime symbols (register_backend,
+# find_backend, FreeableBuffer, …) are left undefined and resolved at dlopen()
+# time from libqnn_executorch_backend.so, which _portable_lib.so loads with
+# RTLD_GLOBAL. This ensures all backends share the same registry instance.
+cc_library(
+    name = "executorch_headers",
+    hdrs = glob([
+        "runtime/**/*.h",
+        "extension/**/*.h",
+    ]),
+    include_prefix = "executorch",
+)
diff --git a/toolchains/ci_workspaces/MODULE.bazel.tmpl b/toolchains/ci_workspaces/MODULE.bazel.tmpl
index 7f386d0a4f..5201262196 100644
--- a/toolchains/ci_workspaces/MODULE.bazel.tmpl
+++ b/toolchains/ci_workspaces/MODULE.bazel.tmpl
@@ -150,6 +150,14 @@ new_local_repository(
     build_file = "third_party/libtorch/BUILD"
 )
 
+# ExecuTorch source tree. EXECUTORCH_PATH must point to the directory that
+# contains runtime/ and extension/ (headers only; no cmake build is needed).
+new_local_repository(
+    name = "executorch",
+    path = "${EXECUTORCH_PATH}",
+    build_file = "third_party/executorch/BUILD"
+)
+
 #new_local_repository(
 #    name = "tensorrt",
 #    path = "/usr/",
diff --git a/toolchains/local_executorch.bzl b/toolchains/local_executorch.bzl
new file mode 100644
index 0000000000..ce52602f99
--- /dev/null
+++ b/toolchains/local_executorch.bzl
@@ -0,0 +1,101 @@
+"""Repository rule that locates the locally installed ExecuTorch source tree.
+
+Discovery order:
+  1. EXECUTORCH_PATH env var — absolute path to the executorch source root
+     (the directory containing runtime/, extension/)
+  2. import executorch from the active Python interpreter — walks up from the
+     package directory to find the source root that contains runtime/backend/
+  3. VIRTUAL_ENV / CONDA_PREFIX / .venv / system python3
+
+Only the header files under runtime/ and extension/ are used; no cmake build
+of libexecutorch_core.a is required. ExecuTorch runtime symbols are resolved
+at dlopen() time from libqnn_executorch_backend.so (loaded by _portable_lib.so).
+""" + +def _find_python(ctx): + candidates = [] + + virtual_env = ctx.os.environ.get("VIRTUAL_ENV", "") + if virtual_env: + candidates.append(ctx.path(virtual_env + "/bin/python3")) + candidates.append(ctx.path(virtual_env + "/bin/python")) + + conda_prefix = ctx.os.environ.get("CONDA_PREFIX", "") + if conda_prefix: + candidates.append(ctx.path(conda_prefix + "/bin/python3")) + candidates.append(ctx.path(conda_prefix + "/bin/python")) + + ws = ctx.workspace_root + for rel in [".venv/bin/python3", ".venv/bin/python", "venv/bin/python3", "venv/bin/python"]: + candidates.append(ws.get_child(rel)) + + for name in ["python3", "python"]: + p = ctx.which(name) + if p: + candidates.append(p) + + for candidate in candidates: + if candidate.exists: + return candidate + return None + +def _find_executorch_source(ctx): + """Return the path to the executorch source root, or None.""" + + # 1. Env-var override + et_path = ctx.os.environ.get("EXECUTORCH_PATH", "").strip() + if et_path: + p = ctx.path(et_path) + if p.exists: + return p + fail("EXECUTORCH_PATH is set to '{}' but that directory does not exist.".format(et_path)) + + # 2. Python import — walk up from the package to find the source root + python = _find_python(ctx) + if python: + result = ctx.execute([ + python, + "-c", + "\n".join([ + "import executorch, os", + "pkg = os.path.dirname(executorch.__file__)", + "# Walk upward from the package looking for runtime/backend/", + "for d in [pkg, os.path.join(pkg, '..'), os.path.join(pkg, '..', '..')]:", + " d = os.path.realpath(d)", + " if os.path.isdir(os.path.join(d, 'runtime', 'backend')):", + " print(d)", + " break", + ]), + ]) + if result.return_code == 0 and result.stdout.strip(): + p = ctx.path(result.stdout.strip()) + if p.exists: + return p + + return None + +def _local_executorch_impl(ctx): + et_dir = _find_executorch_source(ctx) + if et_dir == None: + fail( + "Cannot locate the ExecuTorch source tree. 
" + + "Set EXECUTORCH_PATH to the directory that contains runtime/, " + + "extension/, and cmake-out/ (e.g. export EXECUTORCH_PATH=/path/to/executorch). " + + "Ensure that cmake-out/libexecutorch_core.a has been built.", + ) + + # Symlink the subdirectories referenced by the BUILD file into the synthetic + # repo root, mirroring the new_local_repository(path=EXECUTORCH_PATH) layout. + # include_prefix = "executorch" in the BUILD file handles the header remapping. + # Note: cmake-out is no longer needed (executorch_core static lib was removed). + for sub in ["runtime", "extension"]: + child = et_dir.get_child(sub) + if child.exists: + ctx.symlink(child, sub) + + ctx.file("BUILD", ctx.read(Label("@//third_party/executorch:BUILD"))) + +local_executorch = repository_rule( + implementation = _local_executorch_impl, + environ = ["EXECUTORCH_PATH", "VIRTUAL_ENV", "CONDA_PREFIX"], +)