diff --git a/benchmarks/gemm/benchmark_gemm.py b/benchmarks/gemm/benchmark_gemm.py new file mode 100644 index 0000000000..5b83b0b8c1 --- /dev/null +++ b/benchmarks/gemm/benchmark_gemm.py @@ -0,0 +1,1609 @@ +# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + + +"""Unified GEMM benchmark for BF16, FP8 Block, MXFP8, and NVFP4 precisions. + +Compares matrix-multiplication throughput across precisions using +Transformer Engine on NVIDIA GPUs. Supports two timing back-ends, +pre-quantized and autocast quantization modes, arbitrary MxKxN matrix +shapes, Nsight Systems profiling integration, and bar-chart output. + +Timing back-ends +---------------- +* **cuda-events** -- CUDA event pairs with a leading-kernel trick to + hide CPU dispatch latency. Measures the full GPU-side duration of + the timed loop (includes quantization when using autocast mode). +* **profiler** -- ``torch.profiler`` (CUPTI) kernel timestamps. + Only the matched GEMM compute kernels (names containing gemm, nvjet, xmma, or cutlass) + are summed, giving a kernel-only measurement. 
+ +Usage examples:: + + # Kernel-only timing via torch.profiler: + python benchmarks/gemm/benchmark_gemm.py --timing profiler --pre-quantize -o kernel.png + + # End-to-end timing via CUDA events: + python benchmarks/gemm/benchmark_gemm.py --timing cuda-events -o e2e.png + + # Custom non-square shapes: + python benchmarks/gemm/benchmark_gemm.py --shapes 88064x2560x10240,88064x10240x2560 + + # Nsight profiling of a single shape: + nsys profile --capture-range=cudaProfilerApi \\ + python benchmarks/gemm/benchmark_gemm.py --profile --profile-shape 4096 + + # Model config mode (derives all 12 GEMM shapes from hyperparameters): + python benchmarks/gemm/benchmark_gemm.py \\ + --hidden_size 4096 --intermediate_size 16384 \\ + --num_attention_heads 32 --num_hidden_layers 24 \\ + --micro_batch_size 31 --sequence_length 512 +""" + +import argparse +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +import matplotlib.pyplot as plt +import numpy as np +import torch +from torch.profiler import ProfilerActivity, profile + +try: + import transformer_engine.pytorch as te + import transformer_engine_torch as tex + from transformer_engine.common.recipe import ( + Float8BlockScaling, + Format, + MXFP8BlockScaling, + NVFP4BlockScaling, + ) + + TE_AVAILABLE = True +except ImportError: + TE_AVAILABLE = False + + +GEMM_KERNEL_PATTERNS = ("gemm", "nvjet", "xmma", "cutlass") + +PRECISION_COLORS = { + "BF16": "#808080", + "FP8Block": "#006400", + "MXFP8": "#4B0082", + "NVFP4": "#B22222", +} + + +# --------------------------------------------------------------------------- +# Data structures +# --------------------------------------------------------------------------- +@dataclass +class GEMMResult: + """Single GEMM benchmark measurement.""" + + tflops: float + avg_time_ms: float + shape: tuple[int, int, int] + precision: str + + +@dataclass +class ModelConfig: + """Transformer model hyperparameters for GEMM shape derivation.""" + + 
hidden_size: int + intermediate_size: int + num_attention_heads: int + num_hidden_layers: int + micro_batch_size: int + sequence_length: int + + +# --------------------------------------------------------------------------- +# Hardware helpers +# --------------------------------------------------------------------------- +def is_blackwell_available() -> bool: + """Return True when the current device is Blackwell (SM100+) for NVFP4 support.""" + if not torch.cuda.is_available(): + return False + major, _ = torch.cuda.get_device_capability() + return major >= 10 + + +def compute_gemm_flops(M: int, K: int, N: int) -> int: + """Theoretical FLOP count for C = A @ B: 2 * M * N * K.""" + return 2 * M * N * K + + +# --------------------------------------------------------------------------- +# torch.profiler helpers (kernel-only timing) +# --------------------------------------------------------------------------- +def _is_gemm_kernel(name: str) -> bool: + """Return True when *name* looks like a GEMM compute kernel.""" + low = name.lower() + return any(p in low for p in GEMM_KERNEL_PATTERNS) + + +def _extract_gemm_kernel_time_us( + prof_result: profile, + num_iters: int, + verbose: bool = False, +) -> float: + """Average GEMM-kernel time in microseconds from profiler events.""" + total_us = 0.0 + count = 0 + seen: dict[str, float] = {} + + for evt in prof_result.events(): + if evt.device_type == torch.autograd.DeviceType.CUDA and _is_gemm_kernel(evt.name): + total_us += evt.device_time + count += 1 + seen[evt.name] = seen.get(evt.name, 0.0) + evt.device_time + + if verbose and seen: + print(f" Matched GEMM kernels ({count} invocations):") + for kname, kus in seen.items(): + print(f" {kname}: {kus:.0f} us total") + + if count == 0: + if verbose: + print(" WARNING: No GEMM kernels found. 
All CUDA events:") + for evt in prof_result.events(): + if evt.device_type == torch.autograd.DeviceType.CUDA: + print(f" {evt.name}: {evt.device_time:.0f} us") + return 0.0 + + return total_us / num_iters + + +# --------------------------------------------------------------------------- +# Timing wrappers +# --------------------------------------------------------------------------- +def _time_with_profiler( + run_fn, + num_iters: int, + flops: int, + verbose: bool = False, +) -> tuple[float, float]: + """Return (tflops, avg_ms) using torch.profiler kernel extraction.""" + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + for _ in range(num_iters): + run_fn() + torch.cuda.synchronize() + + avg_us = _extract_gemm_kernel_time_us(prof, num_iters, verbose=verbose) + avg_s = avg_us / 1e6 + tflops = (flops / avg_s) / 1e12 if avg_s > 0 else 0.0 + return tflops, avg_us / 1000.0 + + +def _time_with_cuda_events( + run_fn, + num_iters: int, + flops: int, + leading_fn=None, +) -> tuple[float, float]: + """Return (tflops, avg_ms) using CUDA events with optional leading kernel.""" + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + if leading_fn is not None: + leading_fn() + + start.record() + for _ in range(num_iters): + run_fn() + end.record() + torch.cuda.synchronize() + + avg_ms = start.elapsed_time(end) / num_iters + avg_s = avg_ms / 1000.0 + tflops = (flops / avg_s) / 1e12 if avg_s > 0 else 0.0 + return tflops, avg_ms + + +# --------------------------------------------------------------------------- +# BF16 benchmark +# --------------------------------------------------------------------------- +def benchmark_bf16( + M: int, + K: int, + N: int, + num_warmup: int = 10, + num_iters: int = 100, + timing: str = "cuda-events", + verbose: bool = False, +) -> GEMMResult: + """Benchmark BF16 torch.matmul.""" + device = torch.device("cuda") + flops = compute_gemm_flops(M, K, N) + + A = torch.randn(M, K, 
dtype=torch.bfloat16, device=device) + B = torch.randn(K, N, dtype=torch.bfloat16, device=device) + + for _ in range(num_warmup): + torch.matmul(A, B) + torch.cuda.synchronize() + + def _run(): + torch.matmul(A, B) + + if timing == "profiler": + tflops, avg_ms = _time_with_profiler(_run, num_iters, flops, verbose=verbose) + else: + A_lg = torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + B_lg = torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + tflops, avg_ms = _time_with_cuda_events( + _run, num_iters, flops, leading_fn=lambda: torch.matmul(A_lg, B_lg) + ) + del A_lg, B_lg + + return GEMMResult(tflops=tflops, avg_time_ms=avg_ms, shape=(M, K, N), precision="BF16") + + +# --------------------------------------------------------------------------- +# MXFP8 benchmarks +# --------------------------------------------------------------------------- +def benchmark_fp8( + M: int, + K: int, + N: int, + num_warmup: int = 10, + num_iters: int = 100, + timing: str = "cuda-events", + verbose: bool = False, +) -> Optional[GEMMResult]: + """MXFP8 GEMM via te.Linear autocast.""" + if not TE_AVAILABLE: + return None + + device = torch.device("cuda") + flops = compute_gemm_flops(M, K, N) + + linear = te.Linear(K, N, bias=False, params_dtype=torch.bfloat16).to(device) + x = torch.randn(M, K, dtype=torch.bfloat16, device=device) + recipe = MXFP8BlockScaling(fp8_format=Format.E4M3) + + with te.autocast(enabled=True, recipe=recipe): + for _ in range(num_warmup): + linear(x) + torch.cuda.synchronize() + + def _run(): + linear(x) + + if timing == "profiler": + tflops, avg_ms = _time_with_profiler(_run, num_iters, flops, verbose=verbose) + else: + lin_lg = te.Linear(4096, 4096, bias=False, params_dtype=torch.bfloat16).to(device) + x_lg = torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + tflops, avg_ms = _time_with_cuda_events( + _run, num_iters, flops, leading_fn=lambda: lin_lg(x_lg) + ) + del lin_lg, x_lg + + return GEMMResult(tflops=tflops, 
avg_time_ms=avg_ms, shape=(M, K, N), precision="MXFP8") + + +def benchmark_fp8_prequantized( + M: int, + K: int, + N: int, + num_warmup: int = 10, + num_iters: int = 100, + timing: str = "cuda-events", + verbose: bool = False, +) -> Optional[GEMMResult]: + """Pre-quantized MXFP8 GEMM via tex.generic_gemm (raw kernel throughput).""" + if not TE_AVAILABLE: + return None + + device = torch.device("cuda") + flops = compute_gemm_flops(M, K, N) + + try: + quantizer = te.MXFP8Quantizer(tex.DType.kFloat8E4M3) + + # tex.generic_gemm uses column-major convention: A=(K,M), B=(K,N), + # D=(N,M) with transa=False, transb=True for a logical C(M,N) GEMM. + A_q = quantizer.quantize(torch.randn(K, M, dtype=torch.bfloat16, device=device)) + B_q = quantizer.quantize(torch.randn(K, N, dtype=torch.bfloat16, device=device)) + D = torch.empty(N, M, dtype=torch.bfloat16, device=device) + ws_size = 32 * 1024 * 1024 + ws = torch.empty(ws_size, dtype=torch.uint8, device=device) + + def _run(): + tex.generic_gemm( + A_q, + False, + B_q, + True, + D, + None, + tex.DType.kBFloat16, + None, + tex.DType.kBFloat16, + False, + None, + False, + ws, + ws_size, + False, + False, + ) + + for _ in range(num_warmup): + _run() + torch.cuda.synchronize() + + if timing == "profiler": + tflops, avg_ms = _time_with_profiler(_run, num_iters, flops, verbose=verbose) + else: + A_lg_q = quantizer.quantize( + torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + ) + B_lg_q = quantizer.quantize( + torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + ) + D_lg = torch.empty(4096, 4096, dtype=torch.bfloat16, device=device) + + def _lead(): + tex.generic_gemm( + A_lg_q, + False, + B_lg_q, + True, + D_lg, + None, + tex.DType.kBFloat16, + None, + tex.DType.kBFloat16, + False, + None, + False, + ws, + ws_size, + False, + False, + ) + + tflops, avg_ms = _time_with_cuda_events(_run, num_iters, flops, leading_fn=_lead) + del A_lg_q, B_lg_q, D_lg + + return GEMMResult(tflops=tflops, avg_time_ms=avg_ms, 
shape=(M, K, N), precision="MXFP8") + except Exception as e: + print(f"Warning: FP8 prequantized benchmark failed: {e}") + return None + + +# --------------------------------------------------------------------------- +# Float8 Block-Scaling benchmarks +# --------------------------------------------------------------------------- +def benchmark_fp8_block( + M: int, + K: int, + N: int, + num_warmup: int = 10, + num_iters: int = 100, + timing: str = "cuda-events", + verbose: bool = False, +) -> Optional[GEMMResult]: + """FP8 GEMM with Float8BlockScaling recipe via te.Linear autocast.""" + if not TE_AVAILABLE: + return None + + device = torch.device("cuda") + flops = compute_gemm_flops(M, K, N) + + linear = te.Linear(K, N, bias=False, params_dtype=torch.bfloat16).to(device) + x = torch.randn(M, K, dtype=torch.bfloat16, device=device) + recipe = Float8BlockScaling(fp8_format=Format.E4M3) + + with te.autocast(enabled=True, recipe=recipe): + for _ in range(num_warmup): + linear(x) + torch.cuda.synchronize() + + def _run(): + linear(x) + + if timing == "profiler": + tflops, avg_ms = _time_with_profiler(_run, num_iters, flops, verbose=verbose) + else: + lin_lg = te.Linear(4096, 4096, bias=False, params_dtype=torch.bfloat16).to(device) + x_lg = torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + tflops, avg_ms = _time_with_cuda_events( + _run, num_iters, flops, leading_fn=lambda: lin_lg(x_lg) + ) + del lin_lg, x_lg + + return GEMMResult(tflops=tflops, avg_time_ms=avg_ms, shape=(M, K, N), precision="FP8Block") + + +def benchmark_fp8_block_prequantized( + M: int, + K: int, + N: int, + num_warmup: int = 10, + num_iters: int = 100, + timing: str = "cuda-events", + verbose: bool = False, +) -> Optional[GEMMResult]: + """Pre-quantized FP8 GEMM with Float8BlockScaling via tex.generic_gemm.""" + if not TE_AVAILABLE: + return None + + device = torch.device("cuda") + flops = compute_gemm_flops(M, K, N) + + try: + quantizer = te.Float8BlockQuantizer( + 
tex.DType.kFloat8E4M3, + rowwise=True, + columnwise=True, + ) + + A_q = quantizer.quantize(torch.randn(K, M, dtype=torch.bfloat16, device=device)) + B_q = quantizer.quantize(torch.randn(K, N, dtype=torch.bfloat16, device=device)) + D = torch.empty(N, M, dtype=torch.bfloat16, device=device) + ws_size = 32 * 1024 * 1024 + ws = torch.empty(ws_size, dtype=torch.uint8, device=device) + + def _run(): + tex.generic_gemm( + A_q, + False, + B_q, + True, + D, + None, + tex.DType.kBFloat16, + None, + tex.DType.kBFloat16, + False, + None, + False, + ws, + ws_size, + False, + False, + ) + + for _ in range(num_warmup): + _run() + torch.cuda.synchronize() + + if timing == "profiler": + tflops, avg_ms = _time_with_profiler(_run, num_iters, flops, verbose=verbose) + else: + A_lg_q = quantizer.quantize( + torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + ) + B_lg_q = quantizer.quantize( + torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + ) + D_lg = torch.empty(4096, 4096, dtype=torch.bfloat16, device=device) + + def _lead(): + tex.generic_gemm( + A_lg_q, + False, + B_lg_q, + True, + D_lg, + None, + tex.DType.kBFloat16, + None, + tex.DType.kBFloat16, + False, + None, + False, + ws, + ws_size, + False, + False, + ) + + tflops, avg_ms = _time_with_cuda_events(_run, num_iters, flops, leading_fn=_lead) + del A_lg_q, B_lg_q, D_lg + + return GEMMResult(tflops=tflops, avg_time_ms=avg_ms, shape=(M, K, N), precision="FP8Block") + except Exception as e: + print(f"Warning: FP8 Block-Scaling prequantized benchmark failed: {e}") + return None + + +# --------------------------------------------------------------------------- +# NVFP4 benchmarks (Blackwell SM100+ only) +# --------------------------------------------------------------------------- +def benchmark_fp4( + M: int, + K: int, + N: int, + num_warmup: int = 10, + num_iters: int = 100, + timing: str = "cuda-events", + verbose: bool = False, +) -> Optional[GEMMResult]: + """NVFP4 GEMM via te.Linear autocast 
(Blackwell only).""" + if not TE_AVAILABLE or not is_blackwell_available(): + return None + + device = torch.device("cuda") + flops = compute_gemm_flops(M, K, N) + + linear = te.Linear(K, N, bias=False, params_dtype=torch.bfloat16).to(device) + x = torch.randn(M, K, dtype=torch.bfloat16, device=device) + recipe = NVFP4BlockScaling(fp4_format=Format.E2M1) + + with te.autocast(enabled=True, recipe=recipe): + for _ in range(num_warmup): + linear(x) + torch.cuda.synchronize() + + def _run(): + linear(x) + + if timing == "profiler": + tflops, avg_ms = _time_with_profiler(_run, num_iters, flops, verbose=verbose) + else: + lin_lg = te.Linear(4096, 4096, bias=False, params_dtype=torch.bfloat16).to(device) + x_lg = torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + tflops, avg_ms = _time_with_cuda_events( + _run, num_iters, flops, leading_fn=lambda: lin_lg(x_lg) + ) + del lin_lg, x_lg + + return GEMMResult(tflops=tflops, avg_time_ms=avg_ms, shape=(M, K, N), precision="NVFP4") + + +def benchmark_fp4_prequantized( + M: int, + K: int, + N: int, + num_warmup: int = 10, + num_iters: int = 100, + timing: str = "cuda-events", + verbose: bool = False, +) -> Optional[GEMMResult]: + """Pre-quantized NVFP4 GEMM via tex.generic_gemm (Blackwell only).""" + if not TE_AVAILABLE or not is_blackwell_available(): + return None + + device = torch.device("cuda") + flops = compute_gemm_flops(M, K, N) + + try: + quantizer = te.NVFP4Quantizer(tex.DType.kFloat4E2M1) + + # tex.generic_gemm uses column-major convention: A=(K,M), B=(K,N), + # D=(N,M) with transa=False, transb=True for a logical C(M,N) GEMM. 
+ A_q = quantizer.quantize(torch.randn(K, M, dtype=torch.bfloat16, device=device)) + B_q = quantizer.quantize(torch.randn(K, N, dtype=torch.bfloat16, device=device)) + D = torch.empty(N, M, dtype=torch.bfloat16, device=device) + ws_size = 32 * 1024 * 1024 + ws = torch.empty(ws_size, dtype=torch.uint8, device=device) + + def _run(): + tex.generic_gemm( + A_q, + False, + B_q, + True, + D, + None, + tex.DType.kBFloat16, + None, + tex.DType.kBFloat16, + False, + None, + False, + ws, + ws_size, + False, + False, + ) + + for _ in range(num_warmup): + _run() + torch.cuda.synchronize() + + if timing == "profiler": + tflops, avg_ms = _time_with_profiler(_run, num_iters, flops, verbose=verbose) + else: + A_lg_q = quantizer.quantize( + torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + ) + B_lg_q = quantizer.quantize( + torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + ) + D_lg = torch.empty(4096, 4096, dtype=torch.bfloat16, device=device) + + def _lead(): + tex.generic_gemm( + A_lg_q, + False, + B_lg_q, + True, + D_lg, + None, + tex.DType.kBFloat16, + None, + tex.DType.kBFloat16, + False, + None, + False, + ws, + ws_size, + False, + False, + ) + + tflops, avg_ms = _time_with_cuda_events(_run, num_iters, flops, leading_fn=_lead) + del A_lg_q, B_lg_q, D_lg + + return GEMMResult(tflops=tflops, avg_time_ms=avg_ms, shape=(M, K, N), precision="NVFP4") + except Exception as e: + print(f"Warning: FP4 prequantized benchmark failed: {e}") + return None + + +# --------------------------------------------------------------------------- +# Shape helpers +# --------------------------------------------------------------------------- +def get_default_shapes() -> list[tuple[int, int, int]]: + """Default set of square matrix shapes for benchmarking.""" + return [ + (256, 256, 256), + (512, 512, 512), + (768, 768, 768), + (1024, 1024, 1024), + (1536, 1536, 1536), + (2048, 2048, 2048), + (3072, 3072, 3072), + (4096, 4096, 4096), + (6144, 6144, 6144), + (8192, 8192, 
8192), + (16384, 16384, 16384), + ] + + +def parse_shapes_arg(shapes_arg: str) -> list[tuple[int, int, int]]: + """Parse ``--shapes`` into a list of (M, K, N) tuples. + + Accepts either square sizes (``1024,2048,4096``) or explicit + triplets (``8192x5120x10240,8192x10240x5120``), or a mix. + + Raises: + ValueError: On malformed input. + """ + items = [s.strip() for s in shapes_arg.split(",") if s.strip()] + if not items: + raise ValueError("Empty --shapes argument.") + + shapes: list[tuple[int, int, int]] = [] + for item in items: + if "x" in item: + parts = [p.strip() for p in item.lower().split("x")] + if len(parts) != 3: + raise ValueError(f"Invalid shape '{item}'. Expected 'MxKxN'.") + shapes.append((int(parts[0]), int(parts[1]), int(parts[2]))) + else: + size = int(item) + shapes.append((size, size, size)) + return shapes + + +def compute_gemm_shapes( + config: ModelConfig, +) -> tuple[ + list[tuple[str, int, int, int]], + list[tuple[str, int, int, int]], + list[tuple[str, int, int, int]], +]: + """Derive Fprop, Dgrad, and Wgrad GEMM shapes from a transformer model config. + + For forward Y = X @ W with shape (M, K, N): + - Dgrad: dX = dY @ Wᵀ → (M, N, K) (K and N swap) + - Wgrad: dW = Xᵀ @ dY → (K, M, N) (M moves to contraction axis) + + Returns: + (fprop_shapes, dgrad_shapes, wgrad_shapes) where each is a list of + (label, M, K, N) tuples. 
+ """ + H = config.hidden_size + I = config.intermediate_size + M = config.micro_batch_size * config.sequence_length + + if H % config.num_attention_heads != 0: + raise ValueError( + f"hidden_size ({H}) must be divisible by " + f"num_attention_heads ({config.num_attention_heads})" + ) + + N_qkv = 3 * H + + fprop_shapes = [ + ("QKV Proj", M, H, N_qkv), + ("Attn Out", M, H, H), + ("MLP Up", M, H, I), + ("MLP Down", M, I, H), + ] + + dgrad_shapes = [ + ("QKV Proj (Dgrad)", M, N_qkv, H), + ("Attn Out (Dgrad)", M, H, H), + ("MLP Up (Dgrad)", M, I, H), + ("MLP Down (Dgrad)", M, H, I), + ] + + wgrad_shapes = [ + ("QKV Proj (Wgrad)", H, M, N_qkv), + ("Attn Out (Wgrad)", H, M, H), + ("MLP Up (Wgrad)", H, M, I), + ("MLP Down (Wgrad)", I, M, H), + ] + + return fprop_shapes, dgrad_shapes, wgrad_shapes + + +# --------------------------------------------------------------------------- +# GPU warmup +# --------------------------------------------------------------------------- +def warmup_gpu(duration_seconds: float = 5.0) -> None: + """Run sustained matmuls to stabilize GPU clocks before benchmarking.""" + print(f"Warming up GPU for {duration_seconds:.1f} seconds...") + device = torch.device("cuda") + A = torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + B = torch.randn(4096, 4096, dtype=torch.bfloat16, device=device) + + torch.cuda.synchronize() + t0 = time.time() + while time.time() - t0 < duration_seconds: + for _ in range(10): + torch.matmul(A, B) + torch.cuda.synchronize() + + del A, B + torch.cuda.empty_cache() + print("GPU warmup complete.\n") + + +# --------------------------------------------------------------------------- +# Main orchestrator +# --------------------------------------------------------------------------- +def run_benchmarks( + shapes: list[tuple[int, int, int]], + num_warmup: int = 10, + num_iters: int = 100, + include_fp8: bool = True, + include_fp4: bool = True, + gpu_warmup_seconds: float = 5.0, + pre_quantize: bool = False, + timing: 
str = "cuda-events", + profile_shape: Optional[int] = None, +) -> dict[str, list[float]]: + """Run GEMM benchmarks for every shape and enabled precision. + + Returns: + Dict mapping precision name to a list of TFLOPS values, one per shape. + """ + results: dict[str, list[float]] = {"BF16": [], "MXFP8": [], "NVFP4": []} + time_results: dict[str, list[float]] = {"BF16": [], "MXFP8": [], "NVFP4": []} + + has_blackwell = is_blackwell_available() + run_fp8 = include_fp8 and TE_AVAILABLE + run_fp4 = include_fp4 and TE_AVAILABLE and has_blackwell + + gpu_name = torch.cuda.get_device_name(0) + timing_label = ( + "torch.profiler (CUPTI kernel timestamps)" if timing == "profiler" else "CUDA events" + ) + + print(f"\nGEMM Benchmark on {gpu_name}") + print(f"Timing method: {timing_label}") + print(f"Warmup iterations: {num_warmup}, Timed iterations: {num_iters}") + if pre_quantize: + print("Mode: Pre-quantized inputs (raw kernel throughput)") + else: + print("Mode: Autocast (includes quantization overhead)") + if not has_blackwell and include_fp4: + print("Note: NVFP4 requires Blackwell (SM100+), skipping FP4 benchmarks") + + if profile_shape is not None: + shapes = [(profile_shape, profile_shape, profile_shape)] + print(f"\n*** PROFILING MODE: shape {profile_shape}x{profile_shape}x{profile_shape} ***") + print( + "*** Run with: nsys profile --capture-range=cudaProfilerApi python