ROCm · mmakevic-amd · Mar 27, 2026 · Mar 27, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/.github/workflows/benchmarks/build_binaries.sh b/.github/workflows/benchmarks/build_binaries.sh
@@ -54,8 +54,8 @@ configure_backend() {
       ./configure.py --backend=CUDA --cuda_compiler=nvcc || echo "INFO: GPU Configure script failed or is not applicable."
       ;;
     GPU_MI250)
-      echo "Running: ./configure.py --backend=ROCM --rocm_compiler=hipcc"
-      ./configure.py --backend=ROCM --rocm_compiler=hipcc || echo "INFO: GPU Configure script failed or is not applicable."
+      echo "Running: ./configure.py --backend=ROCM --rocm_compiler=hipcc --clang_path=/lib/llvm-18/bin/clang-18"
+      ./configure.py --backend=ROCM --rocm_compiler=hipcc --clang_path=/lib/llvm-18/bin/clang-18 || echo "INFO: GPU Configure script failed or is not applicable."
       ;;
     *)
       echo "INFO: Unknown hardware category '$hw_category_upper_for_configure'"

diff --git a/.github/workflows/generate_benchmark_matrix.yml b/.github/workflows/generate_benchmark_matrix.yml
@@ -44,7 +44,7 @@ jobs:
   generate:
     name: Generate Matrix (${{ inputs.workflow_type }})
     runs-on: linux-mi250-4
-    container: us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/ml-build:latest
+    container: rocm/tensorflow-build@sha256:7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa
     outputs:
       matrix_json_output: ${{ steps.run_generator.outputs.matrix_json }}
     defaults:
@@ -65,7 +65,7 @@ jobs:
         run: |
           echo "Configuring OpenXLA for CPU to build the generator tool..."
           if [ -f "./configure.py" ]; then
-            ./configure.py --backend=CPU 
+            ./configure.py --backend=CPU --clang_path=/lib/llvm-18/bin/clang-18
           else
             echo "::warning::configure.py not found. Assuming C++ tool build doesn't require it or is pre-configured."
           fi
@@ -78,16 +78,14 @@ jobs:
                 --test_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd \
                 --config=warnings \
                 --config=nonccl \
-                --config=rbe_linux_cpu \
                 --color=yes \
                 --test_output=errors \
                 --verbose_failures \
                 --keep_going \
                 --nobuild_tests_only \
                 --profile=profile.json.gz \
                 --flaky_test_attempts=3 \
-                --jobs=150 \
-                --bes_upload_mode=fully_async \
+                --bes_backend="" \
                 //xla/tools/benchmarks/utils:generate_benchmark_matrices_main
           if [ $? -ne 0 ]; then
              echo "::error::Failed to build generate_benchmark_matrices_main"

diff --git a/.github/workflows/postsubmit_benchmark.yml b/.github/workflows/postsubmit_benchmark.yml
@@ -66,7 +66,9 @@ jobs:
          benchmark_entry: ${{ fromJson(needs.generate_matrix.outputs.matrix_include_json || '[]') }}
 
     runs-on: ${{ matrix.benchmark_entry.runner_label }}
-    container: ${{ matrix.benchmark_entry.container_image }}
+    container:
+      image: ${{ matrix.benchmark_entry.container_image }}
+      options: --device=/dev/dri --device=/dev/kfd
 
     defaults:
       run:
@@ -101,11 +103,6 @@ jobs:
       COMPARISON_SCRIPT_RELATIVE: .github/workflows/benchmarks/compare_with_baseline.py
 
     steps:
-      - name: "Wait For Connection"
-        uses: google-ml-infra/actions/ci_connection@7f5ca0c263a81ed09ea276524c1b9192f1304e3c
-        with:
-          halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
       - name: Print Job Info & Set Full Paths in ENV
         run: |
           # Resolve full paths based on GITHUB_WORKSPACE and relative paths defined in env
@@ -202,26 +199,6 @@ jobs:
           echo "Baseline comparison finished."
           echo "---------------------------------------------"
 
-      - name: Upload results.json directly to GCS
-        run: |
-          GCS_BUCKET="gs://openxla-postsubmit-transient"
-          RESULTS_JSON_FILE_PATH="${{ env.RESOLVED_OUTPUT_DIR }}/results.json"
-
-          # Check if the results file exists
-          if [ ! -f "$RESULTS_JSON_FILE_PATH" ]; then
-            echo "::error::results.json not found at $RESULTS_JSON_FILE_PATH"
-            exit 1
-          fi
-
-          # Construct a GCS object name
-          TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-          DATE_FOLDER=$(date +%Y%m%d)
-          COMMIT_SHA_SHORT=$(echo "${{ github.sha }}" | cut -c1-8)
-          GCS_OBJECT_NAME="${BENCHMARK_NAME}/${DATE_FOLDER}/${TIMESTAMP}_run_${WORKFLOW_RUN_ID}_commit_${COMMIT_SHA_SHORT}.json"
-
-          echo "Uploading $RESULTS_JSON_FILE_PATH to $GCS_BUCKET/$GCS_OBJECT_NAME"
-          gsutil cp "$RESULTS_JSON_FILE_PATH" "$GCS_BUCKET/$GCS_OBJECT_NAME"
-
       - name: Upload Benchmark Artifacts
         if: always()
         uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0

diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py
@@ -674,7 +674,7 @@ def nvidia_gpu_build_with_compute_capability(
 Build(
     type_=BuildType.XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS,
     repo="openxla/xla",
-    configs=("rocm_ci"),
+    configs=("rocm_ci",),
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
     test_tag_filters=rocm_tag_filters,
     build_tag_filters=rocm_tag_filters,

diff --git a/xla/tools/benchmarks/baseline/postsubmit_baseline.yml b/xla/tools/benchmarks/baseline/postsubmit_baseline.yml
@@ -35,9 +35,9 @@
       "threshold": 0.30 # Allow 30% regression max
     }
   },
-  "gemma3_1b_flax_call_mi250_1h1d_postsubmit": { #TODO To be adapted once initial benchmarks are run
+  "gemma3_1b_flax_call_mi250_1h1d_postsubmit": {
     "GPU_DEVICE_TIME": {
-      "baseline_ms": 1000,
+      "baseline_ms": 5.6,
       "threshold": 0.30 # Allow 30% regression max
     },
     "GPU_DEVICE_MEMCPY_TIME": {
@@ -75,9 +75,9 @@
       "threshold": 0.30 # Allow 30% regression max
     }
   },
-  "gemma2_2b_keras_jax_mi250_1h1d_postsubmit": { #TODO To be adapted once initial benchmarks are run
+  "gemma2_2b_keras_jax_mi250_1h1d_postsubmit": {
     "GPU_DEVICE_TIME": {
-      "baseline_ms": 1000,
+      "baseline_ms": 205,
       "threshold": 0.30 # Allow 30% regression max
     },
     "GPU_DEVICE_MEMCPY_TIME": {

diff --git a/xla/tools/benchmarks/registries/default_registry.yml b/xla/tools/benchmarks/registries/default_registry.yml
@@ -31,7 +31,7 @@ benchmarks: [
       topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
       target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
       workflow_type: [POSTSUBMIT]
-      runtime_flags: ["--num_repeats=5"]
+      runtime_flags: ["--num_repeats=5", "--hlo_argument_mode=uninitialized"]
     }
     ]
     update_frequency_policy: QUARTERLY
@@ -51,7 +51,8 @@ benchmarks: [
       topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
       target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
       workflow_type: [POSTSUBMIT]
-      runtime_flags: ["--num_repeats=5"]
+      runtime_flags: ["--num_repeats=5", "--hlo_argument_mode=uninitialized"]
+      xla_compilation_flags: ["--xla_gpu_enable_command_buffer="]
     }]
     update_frequency_policy: QUARTERLY
     # TODO(juliagmt): remove this label once the benchmark is stable.