ROCm · mmakevic-amd · Mar 27, 2026 · Mar 27, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/.github/workflows/benchmarks/build_binaries.sh b/.github/workflows/benchmarks/build_binaries.sh
@@ -54,8 +54,8 @@ configure_backend() {
       ./configure.py --backend=CUDA --cuda_compiler=nvcc || echo "INFO: GPU Configure script failed or is not applicable."
       ;;
     GPU_MI250)
-      echo "Running: ./configure.py --backend=ROCM --rocm_compiler=hipcc"
-      ./configure.py --backend=ROCM --rocm_compiler=hipcc || echo "INFO: GPU Configure script failed or is not applicable."
+      echo "Running: ./configure.py --backend=ROCM --rocm_compiler=hipcc --clang_path=/lib/llvm-18/bin/clang-18"
+      ./configure.py --backend=ROCM --rocm_compiler=hipcc --clang_path=/lib/llvm-18/bin/clang-18 || echo "INFO: GPU Configure script failed or is not applicable."
       ;;
     *)
       echo "INFO: Unknown hardware category '$hw_category_upper_for_configure'"

diff --git a/.github/workflows/generate_benchmark_matrix.yml b/.github/workflows/generate_benchmark_matrix.yml
@@ -44,7 +44,7 @@ jobs:
   generate:
     name: Generate Matrix (${{ inputs.workflow_type }})
     runs-on: linux-mi250-4
-    container: us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/ml-build:latest
+    container: ${{ vars.DOCKER_IMAGE }}
     outputs:
       matrix_json_output: ${{ steps.run_generator.outputs.matrix_json }}
     defaults:
@@ -65,7 +65,7 @@ jobs:
         run: |
           echo "Configuring OpenXLA for CPU to build the generator tool..."
           if [ -f "./configure.py" ]; then
-            ./configure.py --backend=CPU 
+            ./configure.py --backend=CPU --clang_path=/lib/llvm-18/bin/clang-18
           else
             echo "::warning::configure.py not found. Assuming C++ tool build doesn't require it or is pre-configured."
           fi
@@ -78,16 +78,14 @@ jobs:
                 --test_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd \
                 --config=warnings \
                 --config=nonccl \
-                --config=rbe_linux_cpu \
                 --color=yes \
                 --test_output=errors \
                 --verbose_failures \
                 --keep_going \
                 --nobuild_tests_only \
                 --profile=profile.json.gz \
                 --flaky_test_attempts=3 \
-                --jobs=150 \
-                --bes_upload_mode=fully_async \
+                --bes_backend="" \
                 //xla/tools/benchmarks/utils:generate_benchmark_matrices_main
           if [ $? -ne 0 ]; then
              echo "::error::Failed to build generate_benchmark_matrices_main"

diff --git a/.github/workflows/postsubmit_benchmark.yml b/.github/workflows/postsubmit_benchmark.yml
@@ -66,7 +66,9 @@ jobs:
          benchmark_entry: ${{ fromJson(needs.generate_matrix.outputs.matrix_include_json || '[]') }}
 
     runs-on: ${{ matrix.benchmark_entry.runner_label }}
-    container: ${{ matrix.benchmark_entry.container_image }}
+    container:
+      image: ${{ matrix.benchmark_entry.container_image }}
+      options: --device=/dev/dri --device=/dev/kfd  # AMD GPU device nodes (DRI render + KFD compute) that ROCm needs inside the container
 
     defaults:
       run:
@@ -101,11 +103,6 @@ jobs:
       COMPARISON_SCRIPT_RELATIVE: .github/workflows/benchmarks/compare_with_baseline.py
 
     steps:
-      - name: "Wait For Connection"
-        uses: google-ml-infra/actions/ci_connection@7f5ca0c263a81ed09ea276524c1b9192f1304e3c
-        with:
-          halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
       - name: Print Job Info & Set Full Paths in ENV
         run: |
           # Resolve full paths based on GITHUB_WORKSPACE and relative paths defined in env
@@ -149,6 +146,15 @@ jobs:
         with:
           ref: ${{ env.CHECKOUT_REF }}
 
+      - name: Get RBE cluster keys
+        env:
+          RBE_CI_CERT: ${{ secrets.RBE_CI_CERT }}
+          RBE_CI_KEY: ${{ secrets.RBE_CI_KEY }}
+        run: |
+          umask 077 && mkdir -p /tf/certificates  # credentials: keep dir and files owner-only
+          printf '%s\n' "$RBE_CI_CERT" > /tf/certificates/ci-cert.crt
+          printf '%s\n' "$RBE_CI_KEY" > /tf/certificates/ci-cert.key
+
       - name: Build Binaries
         id: build_binaries
         run: |
@@ -202,26 +208,6 @@ jobs:
           echo "Baseline comparison finished."
           echo "---------------------------------------------"
 
-      - name: Upload results.json directly to GCS
-        run: |
-          GCS_BUCKET="gs://openxla-postsubmit-transient"
-          RESULTS_JSON_FILE_PATH="${{ env.RESOLVED_OUTPUT_DIR }}/results.json"
-
-          # Check if the results file exists
-          if [ ! -f "$RESULTS_JSON_FILE_PATH" ]; then
-            echo "::error::results.json not found at $RESULTS_JSON_FILE_PATH"
-            exit 1
-          fi
-
-          # Construct a GCS object name
-          TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-          DATE_FOLDER=$(date +%Y%m%d)
-          COMMIT_SHA_SHORT=$(echo "${{ github.sha }}" | cut -c1-8)
-          GCS_OBJECT_NAME="${BENCHMARK_NAME}/${DATE_FOLDER}/${TIMESTAMP}_run_${WORKFLOW_RUN_ID}_commit_${COMMIT_SHA_SHORT}.json"
-
-          echo "Uploading $RESULTS_JSON_FILE_PATH to $GCS_BUCKET/$GCS_OBJECT_NAME"
-          gsutil cp "$RESULTS_JSON_FILE_PATH" "$GCS_BUCKET/$GCS_OBJECT_NAME"
-
       - name: Upload Benchmark Artifacts
         if: always()
         uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0

diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py
@@ -674,18 +674,23 @@ def nvidia_gpu_build_with_compute_capability(
 Build(
     type_=BuildType.XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS,
     repo="openxla/xla",
-    configs=("rocm_ci"),
+    configs=("rocm_ci", "rocm_rbe"),
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
     test_tag_filters=rocm_tag_filters,
     build_tag_filters=rocm_tag_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         "//xla/tsl:ci_build": True,
+        "remote_download_toplevel": True,  # Override remote_download_minimal from rocm_rbe
         **_DEFAULT_BAZEL_OPTIONS,
     },
     repo_env={
         "TF_ROCM_AMDGPU_TARGETS": "gfx90a",
+        "TF_ROCM_RBE_DOCKER_IMAGE": "rocm/"
+            "tensorflow-build@sha256:"
+            "66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4",
     },
+    startup_options={"bazelrc": "build_tools/rocm/rocm_xla.bazelrc"},
     subcommand="build",
 )
 

diff --git a/xla/tools/benchmarks/baseline/postsubmit_baseline.yml b/xla/tools/benchmarks/baseline/postsubmit_baseline.yml
@@ -35,9 +35,9 @@
       "threshold": 0.30 # Allow 30% regression max
     }
   },
-  "gemma3_1b_flax_call_mi250_1h1d_postsubmit": { #TODO To be adapted once initial benchmarks are run
+  "gemma3_1b_flax_call_mi250_1h1d_postsubmit": {
     "GPU_DEVICE_TIME": {
-      "baseline_ms": 1000,
+      "baseline_ms": 5.6,
       "threshold": 0.30 # Allow 30% regression max
     },
     "GPU_DEVICE_MEMCPY_TIME": {
@@ -75,9 +75,9 @@
       "threshold": 0.30 # Allow 30% regression max
     }
   },
-  "gemma2_2b_keras_jax_mi250_1h1d_postsubmit": { #TODO To be adapted once initial benchmarks are run
+  "gemma2_2b_keras_jax_mi250_1h1d_postsubmit": {
     "GPU_DEVICE_TIME": {
-      "baseline_ms": 1000,
+      "baseline_ms": 205,
       "threshold": 0.30 # Allow 30% regression max
     },
     "GPU_DEVICE_MEMCPY_TIME": {

diff --git a/xla/tools/benchmarks/registries/default_registry.yml b/xla/tools/benchmarks/registries/default_registry.yml
@@ -32,8 +32,7 @@ benchmarks: [
       target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
       workflow_type: [POSTSUBMIT]
       runtime_flags: ["--num_repeats=5"]
-    }
-    ]
+    }]
     update_frequency_policy: QUARTERLY
   },
   {

diff --git a/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc b/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc
@@ -127,8 +127,10 @@ GetHardwareToContainerImage() {
           {"GPU_L4_1H_4D",
            "us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/"
            "ml-build-cuda12.8-cudnn9.8:latest"},
-           {"GPU_MI250", 
-            "rocm/tensorflow-build@sha256:7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa"},
+          {"GPU_MI250",
+           "rocm/"
+           "tensorflow-build@sha256:"
+           "66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4"},
       };
   return *kHardwareToContainerImage;
 }