diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml new file mode 100644 index 0000000000..6a72f06b45 --- /dev/null +++ b/.github/workflows/benchmark_tests.yml @@ -0,0 +1,310 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Integration tests: build an isolated Docker image per benchmark and run a +# 1-episode smoke eval. Each benchmark gets its own image so incompatible +# dependency trees (e.g. hf-libero vs metaworld==3.0.0) can never collide. +# +# To add a new benchmark: +# 1. Add docker/Dockerfile.benchmark.<name> (install only lerobot[<name>]) +# 2. Copy one of the jobs below and adjust the image name and eval command. +name: Benchmark Integration Tests + +on: + # Run manually from the Actions tab + workflow_dispatch: + + # Run every Monday at 02:00 UTC. + schedule: + - cron: "0 2 * * 1" + + push: + branches: + - main + paths: + - "src/lerobot/envs/**" + - "src/lerobot/scripts/lerobot_eval.py" + - "docker/Dockerfile.benchmark.*" + - ".github/workflows/benchmark_tests.yml" + - "pyproject.toml" + + pull_request: + branches: + - main + - feat/benchmark-ci + paths: + - "src/lerobot/envs/**" + - "src/lerobot/scripts/lerobot_eval.py" + - "docker/Dockerfile.benchmark.*" + - ".github/workflows/benchmark_tests.yml" + - "pyproject.toml" + +permissions: + contents: read + +env: + UV_VERSION: "0.8.0" + PYTHON_VERSION: "3.12" + +# Cancel in-flight runs for the same branch/PR.
+concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + # ── LIBERO ──────────────────────────────────────────────────────────────── + # Isolated image: lerobot[libero] only (hf-libero, dm-control, mujoco chain) + libero-integration-test: + name: Libero — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + + # Build the benchmark-specific image. The Dockerfile separates dep-install + # from source-copy, so code-only changes skip the slow uv-sync layer + # when the runner has a warm Docker daemon cache. + - name: Build Libero benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . + file: docker/Dockerfile.benchmark.libero + push: false + load: true + tags: lerobot-benchmark-libero:ci + + - name: Run Libero smoke eval (1 episode) + run: | + # Named container (no --rm) so we can docker cp artifacts out. + # Output to /tmp inside the container — /artifacts doesn't exist + # and user_lerobot cannot create root-level dirs. 
+          docker run --name libero-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            lerobot-benchmark-libero:ci \
+            bash -ec "  # -e: stop at the first failing command so a lerobot-eval failure fails this step
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=pepijn223/smolvla_libero \
+                --env.type=libero \
+                --env.task=libero_spatial \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
+                --policy.empty_cameras=1 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env libero --task libero_spatial \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy Libero artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/libero-artifacts
+          docker cp libero-eval:/tmp/eval-artifacts/. /tmp/libero-artifacts/ 2>/dev/null || true
+          docker rm -f libero-eval || true
+
+      - name: Parse Libero eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/libero-artifacts \
+            --env libero \
+            --task libero_spatial \
+            --policy pepijn223/smolvla_libero
+
+      - name: Upload Libero rollout video
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: libero-rollout-video
+          path: /tmp/libero-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload Libero eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: libero-metrics
+          path: /tmp/libero-artifacts/metrics.json
+          if-no-files-found: warn
+
+      # ── LIBERO TRAIN+EVAL SMOKE ──────────────────────────────────────────────
+      # Train SmolVLA for 1 step (batch_size=1, dataset episode 0 only) then
+      # immediately runs eval inside the training loop (eval_freq=1, 1 episode).
+      # Tests the full train→eval-within-training pipeline end-to-end.
+ - name: Run Libero train+eval smoke (1 step, eval_freq=1) + run: | + docker run --name libero-train-smoke --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + lerobot-benchmark-libero:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + accelerate launch --num_processes=1 \$(which lerobot-train) \ + --policy.path=lerobot/smolvla_base \ + --policy.load_vlm_weights=true \ + --policy.scheduler_decay_steps=25000 \ + --policy.freeze_vision_encoder=false \ + --policy.train_expert_only=false \ + --dataset.repo_id=lerobot/libero \ + --dataset.episodes=[0] \ + --dataset.use_imagenet_stats=false \ + --env.type=libero \ + --env.task=libero_spatial \ + '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \ + --policy.empty_cameras=1 \ + --output_dir=/tmp/train-smoke \ + --steps=1 \ + --batch_size=1 \ + --eval_freq=1 \ + --eval.n_episodes=1 \ + --eval.batch_size=1 \ + --eval.use_async_envs=false \ + --save_freq=1 \ + --policy.push_to_hub=false \ + '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.image2\": \"observation.images.camera2\"}' + " + + - name: Copy Libero train-smoke artifacts from container + if: always() + run: | + mkdir -p /tmp/libero-train-smoke-artifacts + docker cp libero-train-smoke:/tmp/train-smoke/. 
/tmp/libero-train-smoke-artifacts/ 2>/dev/null || true + docker rm -f libero-train-smoke || true + + - name: Upload Libero train-smoke eval video + if: always() + uses: actions/upload-artifact@v4 + with: + name: libero-train-smoke-video + path: /tmp/libero-train-smoke-artifacts/eval/ + if-no-files-found: warn + + # ── METAWORLD ───────────────────────────────────────────────────────────── + # Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain) + metaworld-integration-test: + name: MetaWorld — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + + - name: Build MetaWorld benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+          file: docker/Dockerfile.benchmark.metaworld
+          push: false
+          load: true
+          tags: lerobot-benchmark-metaworld:ci
+
+      - name: Run MetaWorld smoke eval (1 episode)
+        run: |
+          docker run --name metaworld-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            lerobot-benchmark-metaworld:ci \
+            bash -ec "  # -e: stop at the first failing command so a lerobot-eval failure fails this step
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=pepijn223/smolvla_metaworld \
+                --env.type=metaworld \
+                --env.task=metaworld-push-v3 \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.image\": \"observation.images.camera1\"}' \
+                --policy.empty_cameras=2 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env metaworld --task metaworld-push-v3 \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy MetaWorld artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/metaworld-artifacts
+          docker cp metaworld-eval:/tmp/eval-artifacts/.
/tmp/metaworld-artifacts/ 2>/dev/null || true + docker rm -f metaworld-eval || true + + - name: Parse MetaWorld eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/metaworld-artifacts \ + --env metaworld \ + --task metaworld-push-v3 \ + --policy pepijn223/smolvla_metaworld + + - name: Upload MetaWorld rollout video + if: always() + uses: actions/upload-artifact@v4 + with: + name: metaworld-rollout-video + path: /tmp/metaworld-artifacts/videos/ + if-no-files-found: warn + + - name: Upload MetaWorld eval metrics + if: always() + uses: actions/upload-artifact@v4 + with: + name: metaworld-metrics + path: /tmp/metaworld-artifacts/metrics.json + if-no-files-found: warn diff --git a/docker/Dockerfile.benchmark.libero b/docker/Dockerfile.benchmark.libero new file mode 100644 index 0000000000..a16179c7d7 --- /dev/null +++ b/docker/Dockerfile.benchmark.libero @@ -0,0 +1,99 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Isolated benchmark image for LIBERO integration tests. +# Installs only lerobot[libero] so its dep tree (hf-libero, dm-control, mujoco) +# cannot conflict with other benchmarks. +# +# Build: docker build -f docker/Dockerfile.benchmark.libero -t lerobot-benchmark-libero . +# Run: docker run --gpus all --rm lerobot-benchmark-libero lerobot-eval ... 
+ +ARG CUDA_VERSION=12.4.1 +ARG OS_VERSION=22.04 +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION} + +ARG PYTHON_VERSION=3.12 + +ENV DEBIAN_FRONTEND=noninteractive \ + MUJOCO_GL=egl \ + PATH=/lerobot/.venv/bin:$PATH \ + CUDA_VISIBLE_DEVICES=0 \ + DEVICE=cuda + +# System deps — same set as Dockerfile.internal +RUN apt-get update && apt-get install -y --no-install-recommends \ + software-properties-common build-essential git curl \ + libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \ + libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \ + cmake pkg-config ninja-build \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-dev \ + && curl -LsSf https://astral.sh/uv/0.8.0/install.sh | sh \ + && mv /root/.local/bin/uv /usr/local/bin/uv \ + && useradd --create-home --shell /bin/bash user_lerobot \ + && usermod -aG sudo user_lerobot \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /lerobot +RUN chown -R user_lerobot:user_lerobot /lerobot +USER user_lerobot + +ENV HOME=/home/user_lerobot \ + HF_HOME=/home/user_lerobot/.cache/huggingface \ + HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \ + TORCH_HOME=/home/user_lerobot/.cache/torch \ + TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton + +RUN uv venv --python python${PYTHON_VERSION} + +# ── Dependency layer (cached unless pyproject.toml / uv.lock change) ──────── +# Copy only the files uv needs to resolve deps, plus a minimal package stub +# so the editable install can succeed without the full source tree. +# Uses `uv pip install` instead of `uv sync` because uv sync validates the +# entire lockfile across all extras — robomme's numpy<2.0 conflicts with the +# base numpy>=2.0, making the full lockfile unsatisfiable. pip-style install +# only resolves the requested extras for the current platform. 
+COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./ +RUN mkdir -p src/lerobot && touch src/lerobot/__init__.py src/lerobot/py.typed + +RUN uv pip install --no-cache -e ".[libero,smolvla]" + +# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at +# runtime (which times out on CI). Point the libero config at the cached path. +# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing, +# so we write the config before any libero import can happen. +RUN LIBERO_DIR=$(python${PYTHON_VERSION} -c \ + "import importlib.util, os; s=importlib.util.find_spec('libero'); \ + print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \ + mkdir -p /home/user_lerobot/.libero && \ + python${PYTHON_VERSION} -c "\ +from huggingface_hub import snapshot_download; \ +snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \ + local_dir='/home/user_lerobot/.libero/assets')" && \ + printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \ + > /home/user_lerobot/.libero/config.yaml + +# Workaround: Triton ships ptxas without the execute bit set. +# Without this chmod, any JIT compilation (e.g. torch.compile) fails +# with "Permission denied". See: https://github.com/triton-lang/triton/issues/2due +RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas + +# ── Source layer (rebuilds in seconds on code-only changes) ───────────────── +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.metaworld b/docker/Dockerfile.benchmark.metaworld new file mode 100644 index 0000000000..f6a61e1b43 --- /dev/null +++ b/docker/Dockerfile.benchmark.metaworld @@ -0,0 +1,82 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Isolated benchmark image for MetaWorld integration tests. +# Installs only lerobot[metaworld] so its dep tree (metaworld==3.0.0, mujoco>=3) +# cannot conflict with other benchmarks. +# +# Build: docker build -f docker/Dockerfile.benchmark.metaworld -t lerobot-benchmark-metaworld . +# Run: docker run --gpus all --rm lerobot-benchmark-metaworld lerobot-eval ... + +ARG CUDA_VERSION=12.4.1 +ARG OS_VERSION=22.04 +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION} + +ARG PYTHON_VERSION=3.12 + +ENV DEBIAN_FRONTEND=noninteractive \ + MUJOCO_GL=egl \ + PATH=/lerobot/.venv/bin:$PATH \ + CUDA_VISIBLE_DEVICES=0 \ + DEVICE=cuda + +# System deps — same set as Dockerfile.internal +RUN apt-get update && apt-get install -y --no-install-recommends \ + software-properties-common build-essential git curl \ + libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \ + libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \ + cmake pkg-config ninja-build \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-dev \ + && curl -LsSf https://astral.sh/uv/0.8.0/install.sh | sh \ + && mv /root/.local/bin/uv /usr/local/bin/uv \ + && useradd --create-home --shell /bin/bash user_lerobot \ + && usermod -aG sudo user_lerobot \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR 
/lerobot +RUN chown -R user_lerobot:user_lerobot /lerobot +USER user_lerobot + +ENV HOME=/home/user_lerobot \ + HF_HOME=/home/user_lerobot/.cache/huggingface \ + HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \ + TORCH_HOME=/home/user_lerobot/.cache/torch \ + TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton + +RUN uv venv --python python${PYTHON_VERSION} + +# ── Dependency layer (cached unless pyproject.toml / uv.lock change) ──────── +# Copy only the files uv needs to resolve deps, plus a minimal package stub +# so the editable install can succeed without the full source tree. +# Uses `uv pip install` instead of `uv sync` — see Dockerfile.benchmark.libero +# for rationale (cross-extra numpy conflict with robomme). +COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./ +RUN mkdir -p src/lerobot && touch src/lerobot/__init__.py src/lerobot/py.typed + +RUN uv pip install --no-cache -e ".[metaworld,smolvla]" + +# Workaround: Triton ships ptxas without the execute bit set. +# Without this chmod, any JIT compilation (e.g. torch.compile) fails +# with "Permission denied". See: https://github.com/triton-lang/triton/issues/2due +RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas + +# ── Source layer (rebuilds in seconds on code-only changes) ───────────────── +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/scripts/ci/extract_task_descriptions.py b/scripts/ci/extract_task_descriptions.py new file mode 100644 index 0000000000..5fbc1c35aa --- /dev/null +++ b/scripts/ci/extract_task_descriptions.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Extract natural-language task descriptions for a benchmark suite.
+
+Runs inside the benchmark Docker container (where the env library is installed)
+immediately after lerobot-eval, writing a JSON file that parse_eval_metrics.py
+picks up and embeds in metrics.json.
+
+Output format: {"<task>_<index>": "<description>", ...}
+
+Usage:
+    python scripts/ci/extract_task_descriptions.py \\
+        --env libero --task libero_spatial \\
+        --output /tmp/eval-artifacts/task_descriptions.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def _libero_descriptions(task_suite: str) -> dict[str, str]:
+    """Map "<suite>_<i>" to task i's ``.language`` string for a LIBERO suite; {} if the suite is unknown."""
+    from libero.libero import benchmark  # type: ignore[import-untyped]
+
+    suite_dict = benchmark.get_benchmark_dict()
+    if task_suite not in suite_dict:
+        print(
+            f"[extract_task_descriptions] Unknown LIBERO suite '{task_suite}'. "
+            f"Available: {list(suite_dict.keys())}",
+            file=sys.stderr,
+        )
+        return {}
+    suite = suite_dict[task_suite]()
+    return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)}
+
+
+def _metaworld_descriptions(task_name: str) -> dict[str, str]:
+    """Return {"<task_name>_0": label}: the task name without "metaworld-" and with dashes as spaces."""
+    # MetaWorld tasks don't expose a separate NL description attribute, so the name doubles as one.
+    label = task_name.removeprefix("metaworld-").replace("-", " ").strip()
+    return {f"{task_name}_0": label}
+
+
+def main() -> int:
+    """Write the description map for --env/--task to --output as JSON; extraction errors only warn."""
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
+    parser.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)")
+    parser.add_argument("--output", required=True, help="Path to write task_descriptions.json")
+    args = parser.parse_args()
+
+    descriptions: dict[str, str] = {}
+    extractor = {"libero": _libero_descriptions, "metaworld": _metaworld_descriptions}.get(args.env)
+    if extractor is None:
+        print(f"[extract_task_descriptions] No description extractor for env '{args.env}'.", file=sys.stderr)
+    else:
+        try:
+            descriptions = extractor(args.task)
+        except Exception as exc:  # deliberate catch-all: extraction problems must not fail the CI step
+            print(f"[extract_task_descriptions] Warning: {exc}", file=sys.stderr)
+
+    out_path = Path(args.output)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps(descriptions, indent=2))
+    print(f"[extract_task_descriptions] {len(descriptions)} descriptions → {out_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/ci/parse_eval_metrics.py b/scripts/ci/parse_eval_metrics.py
new file mode 100644
index 0000000000..7666a7a5a6
--- /dev/null
+++ b/scripts/ci/parse_eval_metrics.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Parse lerobot-eval output into a small metrics.json artifact.
+
+Reads eval_info.json written by lerobot-eval --output_dir and extracts the
+key metrics needed by the health dashboard. Handles both single-task and
+multi-task eval output formats.
+
+NOTE: This script runs on the bare CI runner (not inside Docker), so it
+must use only Python stdlib modules. Do not add third-party imports.
+
+Usage:
+    python scripts/ci/parse_eval_metrics.py \\
+        --artifacts-dir /tmp/libero-artifacts \\
+        --env libero \\
+        --task libero_spatial \\
+        --policy pepijn223/smolvla_libero
+
+Writes <artifacts-dir>/metrics.json. The CI workflow then uploads this file
+as a GitHub Actions artifact named "<env>-metrics".
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+
+
+def _safe_float(v: float | int | None) -> float | None:
+    """Return *v* as a float, mapping None and NaN to None (emitted as JSON null)."""
+    if v is None:
+        return None
+    f = float(v)
+    return None if math.isnan(f) else f
+
+
+def _extract_metrics(info: dict) -> tuple[float | None, int | None, float | None, float | None]:
+    """Extract (pc_success, n_episodes, avg_sum_reward, eval_s) from eval_info.json.
+
+    Handles two output shapes:
+    - Single-task: {"aggregated": {"pc_success": 80.0, ...}}
+    - Multi-task: {"overall": {"pc_success": 80.0, "n_episodes": 5, ...}}
+
+    The sections are tried in that order; the first one that is a JSON object holding a
+    non-null, non-NaN pc_success wins. Returns four Nones when neither section qualifies.
+    """
+    for key in ("aggregated", "overall"):
+        agg = info.get(key)
+        pc = _safe_float(agg.get("pc_success")) if isinstance(agg, dict) else None
+        if pc is None:
+            continue
+        n = agg.get("n_episodes")
+        return (
+            pc,
+            int(n) if n is not None else None,
+            _safe_float(agg.get("avg_sum_reward")),
+            _safe_float(agg.get("eval_s")),
+        )
+
+    return None, None, None, None
+
+
+def main() -> int:
+    """Write <artifacts-dir>/metrics.json; missing or malformed inputs only warn, and 0 is returned."""
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("--artifacts-dir", required=True, help="Path to the mounted artifacts volume")
+    parser.add_argument("--env", required=True, help="Environment name (e.g. libero)")
+    parser.add_argument("--task", required=True, help="Task name (e.g. libero_spatial)")
+    parser.add_argument("--policy", required=True, help="Policy hub path (e.g. pepijn223/smolvla_libero)")
+    args = parser.parse_args()
+
+    artifacts_dir = Path(args.artifacts_dir)
+    eval_info_path = artifacts_dir / "eval_info.json"
+
+    # Best-effort: any read/parse/shape problem leaves these None (JSON null) and only warns.
+    pc_success = n_episodes = avg_sum_reward = eval_s = None
+    if eval_info_path.exists():
+        try:
+            info = json.loads(eval_info_path.read_text())
+            pc_success, n_episodes, avg_sum_reward, eval_s = _extract_metrics(info)
+        except (OSError, AttributeError, KeyError, TypeError, ValueError) as exc:  # incl. JSONDecodeError
+            print(f"[parse_eval_metrics] Warning: could not parse eval_info.json: {exc}", file=sys.stderr)
+    else:
+        print(
+            f"[parse_eval_metrics] Warning: {eval_info_path} not found — eval may have failed.",
+            file=sys.stderr,
+        )
+
+    task_descriptions: dict[str, str] = {}
+    task_desc_path = artifacts_dir / "task_descriptions.json"
+    if task_desc_path.exists():
+        try:
+            task_descriptions = json.loads(task_desc_path.read_text())
+        except (OSError, ValueError) as exc:  # ValueError includes json.JSONDecodeError
+            print(
+                f"[parse_eval_metrics] Warning: could not parse task_descriptions.json: {exc}",
+                file=sys.stderr,
+            )
+
+    metrics = {
+        "env": args.env,
+        "task": args.task,
+        "policy": args.policy,
+        "pc_success": pc_success,
+        "n_episodes": n_episodes,
+        "avg_sum_reward": avg_sum_reward,
+        "eval_s": eval_s,
+        "task_descriptions": task_descriptions,
+    }
+
+    artifacts_dir.mkdir(parents=True, exist_ok=True)  # dir may be absent if nothing was copied out
+    out_path = artifacts_dir / "metrics.json"
+    out_path.write_text(json.dumps(metrics, indent=2))
+    print(f"[parse_eval_metrics] Written: {out_path}")
+    print(json.dumps(metrics, indent=2))
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())