Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/benchmarks/build_binaries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ configure_backend() {
./configure.py --backend=CUDA --cuda_compiler=nvcc || echo "INFO: GPU Configure script failed or is not applicable."
;;
GPU_MI250)
echo "Running: ./configure.py --backend=ROCM --rocm_compiler=hipcc"
./configure.py --backend=ROCM --rocm_compiler=hipcc || echo "INFO: GPU Configure script failed or is not applicable."
echo "Running: ./configure.py --backend=ROCM --rocm_compiler=hipcc --clang_path=/lib/llvm-18/bin/clang-18"
./configure.py --backend=ROCM --rocm_compiler=hipcc --clang_path=/lib/llvm-18/bin/clang-18|| echo "INFO: GPU Configure script failed or is not applicable."
;;
*)
echo "INFO: Unknown hardware category '$hw_category_upper_for_configure'"
Expand Down
8 changes: 3 additions & 5 deletions .github/workflows/generate_benchmark_matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
generate:
name: Generate Matrix (${{ inputs.workflow_type }})
runs-on: linux-mi250-4
container: us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/ml-build:latest
container: rocm/tensorflow-build@sha256:7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa
outputs:
matrix_json_output: ${{ steps.run_generator.outputs.matrix_json }}
defaults:
Expand All @@ -65,7 +65,7 @@ jobs:
run: |
echo "Configuring OpenXLA for CPU to build the generator tool..."
if [ -f "./configure.py" ]; then
./configure.py --backend=CPU
./configure.py --backend=CPU --clang_path=/lib/llvm-18/bin/clang-18
else
echo "::warning::configure.py not found. Assuming C++ tool build doesn't require it or is pre-configured."
fi
Expand All @@ -78,16 +78,14 @@ jobs:
--test_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd \
--config=warnings \
--config=nonccl \
--config=rbe_linux_cpu \
--color=yes \
--test_output=errors \
--verbose_failures \
--keep_going \
--nobuild_tests_only \
--profile=profile.json.gz \
--flaky_test_attempts=3 \
--jobs=150 \
--bes_upload_mode=fully_async \
--bes_backend="" \
//xla/tools/benchmarks/utils:generate_benchmark_matrices_main
if [ $? -ne 0 ]; then
echo "::error::Failed to build generate_benchmark_matrices_main"
Expand Down
29 changes: 3 additions & 26 deletions .github/workflows/postsubmit_benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ jobs:
benchmark_entry: ${{ fromJson(needs.generate_matrix.outputs.matrix_include_json || '[]') }}

runs-on: ${{ matrix.benchmark_entry.runner_label }}
container: ${{ matrix.benchmark_entry.container_image }}
container:
image: ${{ matrix.benchmark_entry.container_image }}
options: --device=/dev/dri --device=/dev/kfd

defaults:
run:
Expand Down Expand Up @@ -101,11 +103,6 @@ jobs:
COMPARISON_SCRIPT_RELATIVE: .github/workflows/benchmarks/compare_with_baseline.py

steps:
- name: "Wait For Connection"
uses: google-ml-infra/actions/ci_connection@7f5ca0c263a81ed09ea276524c1b9192f1304e3c
with:
halt-dispatch-input: ${{ inputs.halt-for-connection }}

- name: Print Job Info & Set Full Paths in ENV
run: |
# Resolve full paths based on GITHUB_WORKSPACE and relative paths defined in env
Expand Down Expand Up @@ -202,26 +199,6 @@ jobs:
echo "Baseline comparison finished."
echo "---------------------------------------------"

- name: Upload results.json directly to GCS
run: |
GCS_BUCKET="gs://openxla-postsubmit-transient"
RESULTS_JSON_FILE_PATH="${{ env.RESOLVED_OUTPUT_DIR }}/results.json"

# Check if the results file exists
if [ ! -f "$RESULTS_JSON_FILE_PATH" ]; then
echo "::error::results.json not found at $RESULTS_JSON_FILE_PATH"
exit 1
fi

# Construct a GCS object name
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
DATE_FOLDER=$(date +%Y%m%d)
COMMIT_SHA_SHORT=$(echo "${{ github.sha }}" | cut -c1-8)
GCS_OBJECT_NAME="${BENCHMARK_NAME}/${DATE_FOLDER}/${TIMESTAMP}_run_${WORKFLOW_RUN_ID}_commit_${COMMIT_SHA_SHORT}.json"

echo "Uploading $RESULTS_JSON_FILE_PATH to $GCS_BUCKET/$GCS_OBJECT_NAME"
gsutil cp "$RESULTS_JSON_FILE_PATH" "$GCS_BUCKET/$GCS_OBJECT_NAME"

- name: Upload Benchmark Artifacts
if: always()
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
Expand Down
2 changes: 1 addition & 1 deletion build_tools/ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ def nvidia_gpu_build_with_compute_capability(
Build(
type_=BuildType.XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS,
repo="openxla/xla",
configs=("rocm_ci"),
configs=("rocm_ci",),
target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
test_tag_filters=rocm_tag_filters,
build_tag_filters=rocm_tag_filters,
Expand Down
8 changes: 4 additions & 4 deletions xla/tools/benchmarks/baseline/postsubmit_baseline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@
"threshold": 0.30 # Allow 30% regression max
}
},
"gemma3_1b_flax_call_mi250_1h1d_postsubmit": { #TODO To be adapted once initial benchmarks are run
"gemma3_1b_flax_call_mi250_1h1d_postsubmit": {
"GPU_DEVICE_TIME": {
"baseline_ms": 1000,
"baseline_ms": 5.6,
"threshold": 0.30 # Allow 30% regression max
},
"GPU_DEVICE_MEMCPY_TIME": {
Expand Down Expand Up @@ -75,9 +75,9 @@
"threshold": 0.30 # Allow 30% regression max
}
},
"gemma2_2b_keras_jax_mi250_1h1d_postsubmit": { #TODO To be adapted once initial benchmarks are run
"gemma2_2b_keras_jax_mi250_1h1d_postsubmit": {
"GPU_DEVICE_TIME": {
"baseline_ms": 1000,
"baseline_ms": 205,
"threshold": 0.30 # Allow 30% regression max
},
"GPU_DEVICE_MEMCPY_TIME": {
Expand Down
5 changes: 3 additions & 2 deletions xla/tools/benchmarks/registries/default_registry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ benchmarks: [
topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
workflow_type: [POSTSUBMIT]
runtime_flags: ["--num_repeats=5"]
runtime_flags: ["--num_repeats=5", "--hlo_argument_mode=uninitialized"]
}
]
update_frequency_policy: QUARTERLY
Expand All @@ -51,7 +51,8 @@ benchmarks: [
topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
workflow_type: [POSTSUBMIT]
runtime_flags: ["--num_repeats=5"]
runtime_flags: ["--num_repeats=5", "--hlo_argument_mode=uninitialized"]
xla_compilation_flags: ["--xla_gpu_enable_command_buffer="]
}]
update_frequency_policy: QUARTERLY
# TODO(juliagmt): remove this label once the benchmark is stable.
Expand Down
Loading