huggingface · pkooij · Apr 8, 2026 · Apr 8, 2026
diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml
@@ -247,3 +247,99 @@ jobs:
           name: metaworld-metrics
           path: /tmp/metaworld-artifacts/metrics.json
           if-no-files-found: warn
+
+  # ── ROBOCEREBRA ───────────────────────────────────────────────────────────
+  # Isolated image: lerobot[robocerebra] only (= lerobot[libero] alias).
+  # Runs a 1-episode smoke eval on the libero_10 suite. The eval step overrides
+  # the default camera mapping (image/wrist_image) with camera1/camera2,
+  # presumably the key names the pepijn223/smolvla_libero policy expects.
+  robocerebra-integration-test:
+    name: RoboCerebra — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      # Empty when the secret is unavailable: the login step below is then
+      # skipped and the in-container login of the eval step is best-effort.
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      # The image is built but not pushed; `load: true` exports it to the local
+      # Docker daemon so the `docker run` steps below can use the :ci tag.
+      - name: Build RoboCerebra benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robocerebra
+          push: false
+          load: true
+          tags: lerobot-benchmark-robocerebra:ci
+          cache-from: type=local,src=/tmp/.buildx-cache-robocerebra
+          cache-to: type=local,dest=/tmp/.buildx-cache-robocerebra,mode=max
+
+      # Token sanity check only: the container is removed (--rm), so nothing it
+      # stores persists; the eval step logs in again inside its own container.
+      - name: Login to Hugging Face
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          # Forward the token through the environment (`-e VAR` copies the
+          # runner's value) and expand it inside the container, so the secret is
+          # not interpolated into the host-side command string and cannot break
+          # its quoting.
+          docker run --rm \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN \
+            lerobot-benchmark-robocerebra:ci \
+            bash -c 'hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential && hf auth whoami'
+
+      # The container is named (and not --rm) so the artifact-copy step can
+      # `docker cp` the results out of it afterwards.
+      - name: Run RoboCerebra smoke eval (1 episode)
+        run: |
+          # `docker run --name` fails if the name is already taken, e.g. by a
+          # container left behind by an earlier run on a reused runner.
+          docker rm -f robocerebra-eval 2>/dev/null || true
+          docker run --name robocerebra-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e LIBERO_DATA_FOLDER=/tmp/libero_data \
+            lerobot-benchmark-robocerebra:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=pepijn223/smolvla_libero \
+                --env.type=robocerebra \
+                --env.task=libero_10 \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
+                --policy.empty_cameras=1 \
+                --output_dir=/tmp/eval-artifacts
+            "
+
+      # Runs even when the eval failed; copying is best-effort and the named
+      # container is always removed.
+      - name: Copy RoboCerebra artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robocerebra-artifacts
+          docker cp robocerebra-eval:/tmp/eval-artifacts/. /tmp/robocerebra-artifacts/ 2>/dev/null || true
+          docker rm -f robocerebra-eval || true
+
+      # Executed on the runner (not in the container) against the copied artifacts.
+      - name: Parse RoboCerebra eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robocerebra-artifacts \
+            --env robocerebra \
+            --task libero_10 \
+            --policy pepijn223/smolvla_libero
+
+      # Missing files only produce a warning, so a failed eval does not also
+      # fail the upload steps.
+      - name: Upload RoboCerebra rollout video
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: robocerebra-rollout-video
+          path: /tmp/robocerebra-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboCerebra eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: robocerebra-metrics
+          path: /tmp/robocerebra-artifacts/metrics.json
+          if-no-files-found: warn
diff --git a/docker/Dockerfile.benchmark.robocerebra b/docker/Dockerfile.benchmark.robocerebra
@@ -0,0 +1,75 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Isolated benchmark image for RoboCerebra integration tests.
+# Installs only lerobot[robocerebra] (= lerobot[libero]: hf-libero, dm-control, mujoco)
+# so its dependency tree cannot conflict with other benchmarks.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.robocerebra -t lerobot-benchmark-robocerebra .
+# Run:    docker run --gpus all --rm lerobot-benchmark-robocerebra lerobot-eval ...
+
+# Base image tag is assembled from these two build args, so the CUDA and Ubuntu
+# versions can be overridden at build time with --build-arg.
+ARG CUDA_VERSION=12.4.1
+ARG OS_VERSION=22.04
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
+
+# Declared after FROM so it is in scope for the RUN instructions of this stage.
+ARG PYTHON_VERSION=3.12
+
+# DEBIAN_FRONTEND=noninteractive keeps apt from prompting during the build.
+# PATH puts /lerobot/.venv/bin (the virtualenv created further down) first.
+# NOTE(review): MUJOCO_GL=egl presumably selects MuJoCo's headless EGL renderer,
+# and CUDA_VISIBLE_DEVICES=0 pins the container to the first GPU — confirm both.
+ENV DEBIAN_FRONTEND=noninteractive \
+    MUJOCO_GL=egl \
+    PATH=/lerobot/.venv/bin:$PATH \
+    CUDA_VISIBLE_DEVICES=0 \
+    DEVICE=cuda
+
+# One root-owned layer: system packages (same set as Dockerfile.internal, listed
+# alphabetically), Python from the deadsnakes PPA, the uv installer, and the
+# unprivileged user_lerobot account. The apt lists are purged at the end.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       build-essential \
+       cmake \
+       curl \
+       ffmpeg \
+       git \
+       libegl1-mesa \
+       libgeos-dev \
+       libgl1-mesa-glx \
+       libglib2.0-0 \
+       libusb-1.0-0-dev \
+       ninja-build \
+       pkg-config \
+       portaudio19-dev \
+       software-properties-common \
+       speech-dispatcher \
+    && add-apt-repository -y ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends \
+       python${PYTHON_VERSION} \
+       python${PYTHON_VERSION}-venv \
+       python${PYTHON_VERSION}-dev \
+    && curl -LsSf https://astral.sh/uv/install.sh | sh \
+    && mv /root/.local/bin/uv /usr/local/bin/uv \
+    && useradd --create-home --shell /bin/bash user_lerobot \
+    && usermod -aG sudo user_lerobot \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create /lerobot as the working directory and hand it to the unprivileged user
+# before dropping root; every later instruction runs as user_lerobot.
+WORKDIR /lerobot
+RUN chown -R user_lerobot:user_lerobot /lerobot
+USER user_lerobot
+
+# Point HOME and the Hugging Face, LeRobot, Torch and Triton cache directories
+# at locations under the user's home directory.
+ENV HOME=/home/user_lerobot \
+    HF_HOME=/home/user_lerobot/.cache/huggingface \
+    HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \
+    TORCH_HOME=/home/user_lerobot/.cache/torch \
+    TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton
+
+# Create the virtualenv in the working directory (/lerobot/.venv, whose bin/ is
+# already on PATH) with the interpreter installed in the system-deps layer.
+RUN uv venv --python python${PYTHON_VERSION}
+
+# Install only lerobot[robocerebra] — completely isolated from other benchmarks' dep trees.
+# robocerebra = lerobot[libero] (hf-libero + dm-control + mujoco chain)
+# Only the packaging metadata and src/ are copied before the install.
+# NOTE(review): presumably so edits to other files do not invalidate the
+# dependency layer — confirm.
+COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
+COPY --chown=user_lerobot:user_lerobot src/ src/
+
+# --locked makes the sync fail instead of updating uv.lock; --no-cache keeps
+# uv's download cache out of the image layer.
+RUN uv sync --locked --extra robocerebra --no-cache
+
+# Make the ptxas binary bundled with Triton executable.
+RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas
+
+# Copy the rest of the repository last, on top of the installed environment.
+COPY --chown=user_lerobot:user_lerobot . .
+
+# Default to a shell; callers supply their own command to `docker run`.
+CMD ["/bin/bash"]
diff --git a/docs/source/robocerebra.md b/docs/source/robocerebra.md
@@ -0,0 +1,153 @@
+# RoboCerebra Benchmark
+
+RoboCerebra is a benchmark for evaluating long-horizon robotic manipulation with vision-language models. It contains **10 tasks** spanning kitchen, living-room, and study environments, designed to require deliberative, multi-step reasoning rather than reactive policies.
+
+- **Project page**: [RoboCerebra](https://robocerebra.github.io)
+- **GitHub**: [qiuboxiang/RoboCerebra](https://github.com/qiuboxiang/RoboCerebra)
+- **Dataset (LeRobot v3.0)**: [`CollisionCode/RoboCerebra_lerobot_v3.0`](https://huggingface.co/datasets/CollisionCode/RoboCerebra_lerobot_v3.0)
+
+## Installation
+
+RoboCerebra runs on top of the [LIBERO](https://libero-project.github.io) environment, which requires Linux.
+
+```bash
+pip install "lerobot[robocerebra]"
+```
+
+> **macOS users**: LIBERO (and therefore RoboCerebra) requires Linux. Use Docker or a remote Linux machine for evaluation.
+
+## Dataset
+
+The dataset is already in **LeRobot v3.0 format** — no conversion is needed.
+
+| Property         | Value                                                                                         |
+| ---------------- | --------------------------------------------------------------------------------------------- |
+| HuggingFace repo | `CollisionCode/RoboCerebra_lerobot_v3.0`                                                      |
+| Format           | LeRobot v3.0                                                                                  |
+| Robot            | Franka Panda (7-DOF arm + gripper)                                                            |
+| Action space     | 7-DOF delta end-effector pose + gripper                                                       |
+| Cameras          | `observation.images.image` (agent-view), `observation.images.wrist_image` (wrist)             |
+| Resolution       | 256 × 256                                                                                     |
+| FPS              | 20                                                                                            |
+| Task types       | Ideal, Memory_Execution, Memory_Exploration, Mix, Observation_Mismatching, Random_Disturbance |
+
+### Loading the dataset
+
+```python
+from lerobot.datasets import LeRobotDataset
+
+# Load the "Ideal" task type
+dataset = LeRobotDataset("CollisionCode/RoboCerebra_lerobot_v3.0", root="Ideal")
+```
+
+## Available Tasks
+
+RoboCerebra evaluates on the **libero_10** suite (10 long-horizon tasks):
+
+| Task ID | Name                                                                                                         |
+| ------- | ------------------------------------------------------------------------------------------------------------ |
+| 0       | KITCHEN_SCENE3 — turn on the stove and put the moka pot on it                                                |
+| 1       | KITCHEN_SCENE4 — put the black bowl in the bottom drawer of the cabinet and close it                         |
+| 2       | KITCHEN_SCENE6 — put the yellow and white mug in the microwave and close it                                  |
+| 3       | KITCHEN_SCENE8 — put both moka pots on the stove                                                             |
+| 4       | LIVING_ROOM_SCENE1 — put both the alphabet soup and the cream cheese box in the basket                       |
+| 5       | LIVING_ROOM_SCENE2 — put both the alphabet soup and the tomato sauce in the basket                           |
+| 6       | LIVING_ROOM_SCENE2 — put both the cream cheese box and the butter in the basket                              |
+| 7       | LIVING_ROOM_SCENE5 — put the white mug on the left plate and put the yellow and white mug on the right plate |
+| 8       | LIVING_ROOM_SCENE6 — put the white mug on the plate and put the chocolate pudding to the right of the plate  |
+| 9       | STUDY_SCENE1 — pick up the book and place it in the back compartment of the caddy                            |
+
+## Running Evaluation
+
+### Quick smoke test (1 episode)
+
+```bash
+lerobot-eval \
+    --policy.path=<your_policy_on_hub> \
+    --env.type=robocerebra \
+    --env.task=libero_10 \
+    --eval.batch_size=1 \
+    --eval.n_episodes=1 \
+    --eval.use_async_envs=false \
+    --policy.device=cuda
+```
+
+### Full benchmark (all 10 tasks, parallel envs)
+
+```bash
+lerobot-eval \
+    --policy.path=<your_policy_on_hub> \
+    --env.type=robocerebra \
+    --env.task=libero_10 \
+    --eval.batch_size=10 \
+    --eval.n_episodes=50 \
+    --eval.use_async_envs=true \
+    --policy.device=cuda
+```
+
+### Evaluating a specific subset of tasks
+
+```bash
+lerobot-eval \
+    --policy.path=<your_policy_on_hub> \
+    --env.type=robocerebra \
+    --env.task=libero_10 \
+    --env.task_ids="[0,1,2]" \
+    --eval.batch_size=1 \
+    --eval.n_episodes=10 \
+    --policy.device=cuda
+```
+
+### Custom camera name mapping
+
+By default, `RoboCerebraEnv` maps LIBERO camera names to match the dataset:
+
+| LIBERO camera              | Policy key                       |
+| -------------------------- | -------------------------------- |
+| `agentview_image`          | `observation.images.image`       |
+| `robot0_eye_in_hand_image` | `observation.images.wrist_image` |
+
+If your policy was trained with different camera names (e.g., `camera1`/`camera2`), override with:
+
+```bash
+lerobot-eval \
+    --policy.path=<your_policy_on_hub> \
+    --env.type=robocerebra \
+    --env.task=libero_10 \
+    --eval.batch_size=1 \
+    --eval.n_episodes=1 \
+    --policy.device=cuda \
+    '--env.camera_name_mapping={"agentview_image": "camera1", "robot0_eye_in_hand_image": "camera2"}'
+```
+
+> **Shell quoting note**: wrap the JSON mapping in single quotes on Linux/macOS, or use `^"..."^` on Windows cmd.
+
+## Configuration reference
+
+All fields of `RoboCerebraEnv` can be overridden on the command line as `--env.<field>=<value>` (for example, `--env.task_ids="[0,1,2]"`):
+
+| Field                 | Default              | Description                                  |
+| --------------------- | -------------------- | -------------------------------------------- |
+| `task`                | `"libero_10"`        | LIBERO suite name                            |
+| `task_ids`            | `null` (all 10)      | List of task IDs to evaluate                 |
+| `fps`                 | `20`                 | Environment FPS (matches dataset)            |
+| `episode_length`      | `null` (520)         | Max steps per episode                        |
+| `obs_type`            | `"pixels_agent_pos"` | `"pixels"` or `"pixels_agent_pos"`           |
+| `observation_height`  | `256`                | Camera height in pixels                      |
+| `observation_width`   | `256`                | Camera width in pixels                       |
+| `camera_name_mapping` | see above            | LIBERO cam → policy key mapping              |
+| `control_mode`        | `"relative"`         | `"relative"` or `"absolute"` EEF control     |
+| `init_states`         | `true`               | Use fixed initial states for reproducibility |
+
+## Citation
+
+If you use RoboCerebra in your work, please cite:
+
+```bibtex
+@article{robocerebra2024,
+  title={RoboCerebra: A Long-Horizon Manipulation Benchmark for Evaluating Robotic Reasoning},
+  author={Qiu, Boxiang and others},
+  year={2024},
+  url={https://robocerebra.github.io}
+}
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -175,6 +175,7 @@ video_benchmark = ["scikit-image>=0.23.2,<0.26.0", "pandas>=2.2.2,<2.4.0"]
 aloha = ["gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
 pusht = ["gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
 libero = ["lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
+robocerebra = ["lerobot[libero]"]  # Alias of the libero extra: RoboCerebra runs on LIBERO, whose hf-libero dependency is installed on Linux only
 metaworld = ["metaworld==3.0.0", "lerobot[scipy-dep]"]
 
 # All
@@ -205,6 +206,7 @@ all = [
     "lerobot[pusht]",
     "lerobot[phone]",
     "lerobot[libero]; sys_platform == 'linux'",
+    "lerobot[robocerebra]; sys_platform == 'linux'",
     "lerobot[metaworld]",
     "lerobot[sarm]",
     "lerobot[peft]",