From 9a38d7de8fca065bffb0c1c17d68541f93498573 Mon Sep 17 00:00:00 2001 From: Ethan Look-Potts Date: Tue, 7 Apr 2026 17:55:51 -0400 Subject: [PATCH 1/5] ci: add brev launchable shellcheck and weekly e2e workflow --- .github/workflows/brev.yml | 236 +++++++++++++++++++++++++++ deployments/brev/disk-fill-test.yaml | 48 ++++++ 2 files changed, 284 insertions(+) create mode 100644 .github/workflows/brev.yml create mode 100644 deployments/brev/disk-fill-test.yaml diff --git a/.github/workflows/brev.yml b/.github/workflows/brev.yml new file mode 100644 index 000000000..a87d125fc --- /dev/null +++ b/.github/workflows/brev.yml @@ -0,0 +1,236 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +name: Brev Launchable + +on: + pull_request: + types: [opened, synchronize, reopened] + branches: [main, 'feature/**', 'release/**'] + paths: + - 'deployments/brev/**' + schedule: + - cron: '0 9 * * 1' # Every Monday at 9:00 AM UTC + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +############################ +# Shellcheck (on PR only) # +############################ +jobs: + shellcheck: + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + - name: Run shellcheck on setup.sh + uses: ludeeus/action-shellcheck@00cae500b08a931fb5698e11e79bfbd38e612a38 # 2.0.0 + with: + scandir: './deployments/brev' + severity: warning + additional_files: 'setup.sh' + +######################################### +# End-to-End (scheduled + manual only) # +######################################### + e2e: + if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + timeout-minutes: 300 + environment: brev-e2e + env: + INSTANCE_NAME: osmo-ci-${{ github.run_id }} + LAUNCHABLE_ID: env-36a6a7qnkOMOP2vgiBRaw2e3jpW + SSH_KEY: ${{ github.workspace }}/.brev/brev.pem + + steps: + - name: Checkout + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + # ── Brev setup ────────────────────────────────────────────────────────── + - name: Install Brev CLI + run: | + curl -sfL https://raw.githubusercontent.com/brevdev/brev-cli/main/bin/install-brev.sh | bash + echo "$HOME/.brev/bin" >> "$GITHUB_PATH" + + - name: Login to Brev + env: + BREV_API_TOKEN: ${{ secrets.BREV_API_TOKEN }} + run: brev login --token "$BREV_API_TOKEN" + + - name: Create Brev instance + run: | + brev create "$INSTANCE_NAME" --launchable "$LAUNCHABLE_ID" + echo "Instance $INSTANCE_NAME created" + + # ── Wait for OSMO to be ready 
──────────────────────────────────────────── + - name: Wait for SSH availability + run: | + echo "Waiting for SSH on $INSTANCE_NAME..." + for i in $(seq 1 30); do + INSTANCE_IP=$(brev ls --output json \ + | jq -r --arg n "$INSTANCE_NAME" '.[] | select(.name==$n) | .dns // .ip // empty') + SSH_USER=$(brev ls --output json \ + | jq -r --arg n "$INSTANCE_NAME" '.[] | select(.name==$n) | .username // "ubuntu"') + if [ -n "$INSTANCE_IP" ] && \ + ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o ConnectTimeout=5 \ + "$SSH_USER@$INSTANCE_IP" "echo ok" 2>/dev/null; then + echo "INSTANCE_IP=$INSTANCE_IP" >> "$GITHUB_ENV" + echo "SSH_USER=$SSH_USER" >> "$GITHUB_ENV" + echo "SSH available at $SSH_USER@$INSTANCE_IP" + exit 0 + fi + echo "Attempt $i/30 — retrying in 30s..." + sleep 30 + done + echo "::error::Timed out waiting for SSH" + exit 1 + + - name: Wait for OSMO setup to complete + run: | + echo "Polling until all OSMO pods are Running..." + for i in $(seq 1 60); do + NOT_READY=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no \ + "$SSH_USER@$INSTANCE_IP" \ + "kubectl get pods -n osmo --no-headers 2>/dev/null \ + | awk '{print \$3}' | grep -vcE '^(Running|Completed)$'" 2>/dev/null || echo "error") + if [ "$NOT_READY" = "0" ]; then + echo "OSMO is ready" + exit 0 + fi + echo "Attempt $i/60 — $NOT_READY pods not ready, retrying in 30s..." 
+ sleep 30 + done + echo "::error::OSMO setup did not complete within 30 minutes" + ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \ + "kubectl get pods -n osmo" || true + exit 1 + + # ── Version check ──────────────────────────────────────────────────────── + - name: Check OSMO version + id: version-check + continue-on-error: true + run: | + INSTALLED=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no \ + "$SSH_USER@$INSTANCE_IP" \ + "helm list -n osmo -o json \ + | jq -r '.[] | select(.name==\"osmo\") | .chart' \ + | sed 's/quick-start-//'") + LATEST=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no \ + "$SSH_USER@$INSTANCE_IP" \ + "helm repo update osmo 2>/dev/null; \ + helm search repo osmo/quick-start -o json | jq -r '.[0].version'") + echo "Installed OSMO chart version : $INSTALLED" + echo "Latest OSMO chart version : $LATEST" + echo "osmo_installed=$INSTALLED" >> "$GITHUB_STEP_SUMMARY" + echo "osmo_latest=$LATEST" >> "$GITHUB_STEP_SUMMARY" + if [ "$INSTALLED" != "$LATEST" ]; then + echo "::error::OSMO version mismatch — installed: $INSTALLED, latest: $LATEST" + exit 1 + fi + + # ── NGC credential ─────────────────────────────────────────────────────── + - name: Configure NGC registry credential + env: + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + run: | + ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \ + "osmo credential set ci-ngc-cred \ + --type REGISTRY \ + --payload registry=nvcr.io \ + username='\$oauthtoken' \ + auth='$NGC_API_KEY'" + + # ── Test helper ───────────────────────────────────────────────────────── + # Polls osmo workflow query until terminal state or timeout, then dumps logs. 
+ # Usage: poll_workflow + - name: Write workflow poll helper + run: | + cat << 'EOF' > /tmp/poll_workflow.sh + #!/bin/bash + set -euo pipefail + WORKFLOW=$1 + TIMEOUT_MIN=${2:-30} + SSH_OPTS="-i $SSH_KEY -o StrictHostKeyChecking=no" + DEADLINE=$(( $(date +%s) + TIMEOUT_MIN * 60 )) + echo "Polling $WORKFLOW (timeout: ${TIMEOUT_MIN}m)..." + while true; do + STATUS=$(ssh $SSH_OPTS "$SSH_USER@$INSTANCE_IP" \ + "osmo workflow query $WORKFLOW --format-type json 2>/dev/null \ + | jq -r '.status // \"UNKNOWN\"'") + echo " Status: $STATUS" + case "$STATUS" in + COMPLETED) + echo "::notice::$WORKFLOW completed successfully" + ssh $SSH_OPTS "$SSH_USER@$INSTANCE_IP" "osmo workflow logs $WORKFLOW -n 50" || true + exit 0 ;; + FAILED*|CANCELED*) + echo "::error::$WORKFLOW failed with status $STATUS" + ssh $SSH_OPTS "$SSH_USER@$INSTANCE_IP" "osmo workflow logs $WORKFLOW -n 200" || true + exit 1 ;; + esac + if [ "$(date +%s)" -ge "$DEADLINE" ]; then + echo "::error::$WORKFLOW timed out after ${TIMEOUT_MIN}m (status: $STATUS)" + exit 1 + fi + sleep 30 + done + EOF + chmod +x /tmp/poll_workflow.sh + + # ── Test 1: Hello World (CPU, basic smoke test) ────────────────────────── + - name: 'Test: hello_world' + run: | + ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \ + "curl -sfL https://raw.githubusercontent.com/NVIDIA/OSMO/${{ github.sha }}/cookbook/tutorials/hello_world.yaml \ + | sed 's/name: hello-osmo/name: ci-hello-${{ github.run_id }}/' \ + | osmo workflow submit -" + /tmp/poll_workflow.sh "ci-hello-${{ github.run_id }}" 10 + + # ── Test 2: Isaac Sim SDG (GPU smoke test) ─────────────────────────────── + - name: 'Test: isaac_sim_sdg' + run: | + ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \ + "curl -sfL https://raw.githubusercontent.com/NVIDIA/OSMO/${{ github.sha }}/cookbook/synthetic_data_generation/isaac_sim/isaac_sim_sdg.yaml \ + | sed 's/name: isaac-sim-sdg/name: ci-isaac-${{ github.run_id }}/' \ + | osmo workflow submit -" 
+ /tmp/poll_workflow.sh "ci-isaac-${{ github.run_id }}" 60 + + # ── Test 3: Large image (disk-fill validation, not in cookbook) ────────── + - name: 'Test: large image (NeMo ~40GB)' + run: | + ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \ + "curl -sfL https://raw.githubusercontent.com/NVIDIA/OSMO/${{ github.sha }}/deployments/brev/disk-fill-test.yaml \ + | sed 's/name: disk-fill-test/name: ci-disk-${{ github.run_id }}/' \ + | osmo workflow submit -" + /tmp/poll_workflow.sh "ci-disk-${{ github.run_id }}" 90 + + # ── Assert version match (deferred so tests always run) ───────────────── + - name: Assert OSMO version is up to date + if: steps.version-check.outcome == 'failure' + run: | + echo "::error::OSMO version check failed — see 'Check OSMO version' step for details" + exit 1 + + # ── Cleanup (always runs) ──────────────────────────────────────────────── + - name: Delete Brev instance + if: always() + run: brev delete "$INSTANCE_NAME" --force || true diff --git a/deployments/brev/disk-fill-test.yaml b/deployments/brev/disk-fill-test.yaml new file mode 100644 index 000000000..1e5c33fc9 --- /dev/null +++ b/deployments/brev/disk-fill-test.yaml @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# Reproduces the /var/lib/docker disk-fill issue on Brev instances with small root filesystems. +# nvcr.io/nvidia/pytorch:24.12-py3 is ~25 GB and requires an NGC registry credential. +# +# Prerequisites: +# Register your NGC API key once after OSMO setup: +# osmo credential set my-ngc-cred \ +# --type REGISTRY \ +# --payload registry=nvcr.io \ +# username='$oauthtoken' \ +# auth= +# OSMO will automatically use this credential when pulling from nvcr.io. +# +# To reproduce the bug: deploy brev launchable WITHOUT the data-root fix, then submit this workflow. +# Expected: workflow stuck in Pending/Failed; KIND cluster may become unresponsive. +# To verify the fix: deploy brev launchable WITH the data-root fix, then submit this workflow. +# Expected: workflow completes and prints PyTorch version. +# +# Submit with: +# osmo workflow submit disk-fill-test.yaml + +workflow: + name: disk-fill-test + resources: + default: + cpu: 1 + memory: 2Gi + storage: 1Gi + tasks: + - name: large-image + image: nvcr.io/nvidia/nemo:24.12 + command: ["python3"] + args: ["-c", "import nemo; print(f'NeMo {nemo.__version__} running on OSMO — disk fix verified')"] From ddd3178ef252f3a10f6de1981eb742fd6376e5bf Mon Sep 17 00:00:00 2001 From: Ethan Look-Potts Date: Wed, 8 Apr 2026 10:51:43 -0400 Subject: [PATCH 2/5] docs: add CI purpose docstring to disk-fill-test.yaml --- deployments/brev/disk-fill-test.yaml | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/deployments/brev/disk-fill-test.yaml b/deployments/brev/disk-fill-test.yaml index 1e5c33fc9..b72dd8b3b 100644 --- a/deployments/brev/disk-fill-test.yaml +++ b/deployments/brev/disk-fill-test.yaml @@ -14,25 +14,24 @@ # # SPDX-License-Identifier: Apache-2.0 -# Reproduces the /var/lib/docker disk-fill issue on Brev instances with small root filesystems. -# nvcr.io/nvidia/pytorch:24.12-py3 is ~25 GB and requires an NGC registry credential. 
+# CI validation workflow for the Brev launchable (deployments/brev/). # -# Prerequisites: -# Register your NGC API key once after OSMO setup: +# Purpose: +# Validates that the Docker data-root relocation in setup.sh correctly moves image +# storage off the root partition. Pulls nvcr.io/nvidia/nemo:24.12 (~40 GB), which is +# large enough to exhaust the root filesystem on a Brev instance if the fix is absent. +# A successful run confirms that image layers are written to the larger mounted disk. +# +# Used by: .github/workflows/brev.yml (weekly E2E job, "Test: large image" step) +# +# Manual use: +# Prerequisites — register your NGC API key once after OSMO setup: # osmo credential set my-ngc-cred \ # --type REGISTRY \ # --payload registry=nvcr.io \ # username='$oauthtoken' \ # auth= -# OSMO will automatically use this credential when pulling from nvcr.io. -# -# To reproduce the bug: deploy brev launchable WITHOUT the data-root fix, then submit this workflow. -# Expected: workflow stuck in Pending/Failed; KIND cluster may become unresponsive. -# To verify the fix: deploy brev launchable WITH the data-root fix, then submit this workflow. -# Expected: workflow completes and prints PyTorch version. 
-# -# Submit with: -# osmo workflow submit disk-fill-test.yaml +# Then: osmo workflow submit disk-fill-test.yaml workflow: name: disk-fill-test From 6ac125133a442b7ea295046c3d266ef47f26bb65 Mon Sep 17 00:00:00 2001 From: Ethan Look-Potts Date: Wed, 8 Apr 2026 12:16:37 -0400 Subject: [PATCH 3/5] ci: add bazel shellcheck target and brev job to pr-checks --- .github/workflows/brev.yml | 25 ------------- .github/workflows/pr-checks.yaml | 25 +++++++++++++ MODULE.bazel | 22 ++++++++++++ deployments/brev/BUILD | 54 +++++++++++++++++++++++++++++ deployments/brev/setup.sh | 13 ++++--- deployments/brev/shellcheck_test.sh | 20 +++++++++++ 6 files changed, 130 insertions(+), 29 deletions(-) create mode 100644 deployments/brev/BUILD create mode 100755 deployments/brev/shellcheck_test.sh diff --git a/.github/workflows/brev.yml b/.github/workflows/brev.yml index a87d125fc..66daaf489 100644 --- a/.github/workflows/brev.yml +++ b/.github/workflows/brev.yml @@ -17,11 +17,6 @@ name: Brev Launchable on: - pull_request: - types: [opened, synchronize, reopened] - branches: [main, 'feature/**', 'release/**'] - paths: - - 'deployments/brev/**' schedule: - cron: '0 9 * * 1' # Every Monday at 9:00 AM UTC workflow_dispatch: @@ -30,27 +25,7 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -############################ -# Shellcheck (on PR only) # -############################ jobs: - shellcheck: - if: github.event_name == 'pull_request' - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - - - name: Run shellcheck on setup.sh - uses: ludeeus/action-shellcheck@00cae500b08a931fb5698e11e79bfbd38e612a38 # 2.0.0 - with: - scandir: './deployments/brev' - severity: warning - additional_files: 'setup.sh' - -######################################### -# End-to-End (scheduled + manual only) # -######################################### e2e: if: github.event_name == 'schedule' || 
github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest diff --git a/.github/workflows/pr-checks.yaml b/.github/workflows/pr-checks.yaml index 8c03f5572..ff7ecd38b 100644 --- a/.github/workflows/pr-checks.yaml +++ b/.github/workflows/pr-checks.yaml @@ -36,6 +36,7 @@ jobs: ci: ${{ steps.filter.outputs.ci }} docs: ${{ steps.filter.outputs.docs }} ui: ${{ steps.filter.outputs.ui }} + brev: ${{ steps.filter.outputs.brev }} steps: - name: Checkout uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 @@ -63,6 +64,10 @@ jobs: ui: - '.github/workflows/pr-checks.yaml' - 'src/ui/**' + brev: + - '.github/workflows/pr-checks.yaml' + - 'deployments/brev/**' + - 'MODULE.bazel' ####################### # CI Tests # @@ -255,6 +260,26 @@ jobs: echo "Host Docker disk:" docker system df 2>/dev/null || true + ####################### + # Brev Checks # + ####################### + brev: + needs: [check-paths] + if: needs.check-paths.outputs.brev == 'true' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + - name: Setup Bazel + uses: bazel-contrib/setup-bazel@4fd964a13a440a8aeb0be47350db2fc640f19ca8 + with: + bazelisk-cache: true + bazelisk-version: 1.27.0 + + - name: Run brev tests + run: bazel test --test_output=errors //deployments/brev/... + ####################### # Docs Build # ####################### diff --git a/MODULE.bazel b/MODULE.bazel index 9e1fe2b53..e6d988f11 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -64,6 +64,28 @@ osmo_constants( image_tag = IMAGE_TAG, ) +################ +# Shellcheck # +################ + +# Hermetic shellcheck binaries for sh_test rules in //deployments/brev/... 
+http_archive( + name = "shellcheck_linux_x86_64", + build_file_content = 'exports_files(["shellcheck"])', + sha256 = "6c881ab0698e4e6ea235245f22832860544f17ba386442fe7e9d629f8cbedf87", + strip_prefix = "shellcheck-v0.10.0", + url = "https://github.com/koalaman/shellcheck/releases/download/v0.10.0/shellcheck-v0.10.0.linux.x86_64.tar.xz", +) + +http_archive( + name = "shellcheck_darwin_arm64", + build_file_content = 'exports_files(["shellcheck"])', + sha256 = "bbd2f14826328eee7679da7221f2bc3afb011f6a928b848c80c321f6046ddf81", + strip_prefix = "shellcheck-v0.10.0", + url = "https://github.com/koalaman/shellcheck/releases/download/v0.10.0/shellcheck-v0.10.0.darwin.aarch64.tar.xz", +) + + ################ # Common # ################ diff --git a/deployments/brev/BUILD b/deployments/brev/BUILD new file mode 100644 index 000000000..37eb52b3c --- /dev/null +++ b/deployments/brev/BUILD @@ -0,0 +1,54 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +SPDX-License-Identifier: Apache-2.0 +""" + +load("@rules_shell//shell:sh_test.bzl", "sh_test") + +config_setting( + name = "linux_x86_64", + constraint_values = [ + "@platforms//os:linux", + "@platforms//cpu:x86_64", + ], +) + +config_setting( + name = "macos_arm64", + constraint_values = [ + "@platforms//os:macos", + "@platforms//cpu:arm64", + ], +) + +sh_test( + name = "shellcheck", + srcs = ["shellcheck_test.sh"], + data = ["setup.sh"] + select({ + ":linux_x86_64": ["@shellcheck_linux_x86_64//:shellcheck"], + ":macos_arm64": ["@shellcheck_darwin_arm64//:shellcheck"], + }), + env = select({ + ":linux_x86_64": { + "SHELLCHECK": "$(location @shellcheck_linux_x86_64//:shellcheck)", + "SETUP_SH": "$(location setup.sh)", + }, + ":macos_arm64": { + "SHELLCHECK": "$(location @shellcheck_darwin_arm64//:shellcheck)", + "SETUP_SH": "$(location setup.sh)", + }, + }), +) diff --git a/deployments/brev/setup.sh b/deployments/brev/setup.sh index 53da0d5e1..81858751c 100644 --- a/deployments/brev/setup.sh +++ b/deployments/brev/setup.sh @@ -72,7 +72,7 @@ sudo sysctl -p print_status "Checking Docker permissions..." if ! docker ps >/dev/null 2>&1; then print_warning "Docker permission denied. Adding user to docker group..." - sudo usermod -aG docker $USER + sudo usermod -aG docker "$USER" print_warning "Please log out and log back in, then run this script again." exit 1 fi @@ -159,9 +159,10 @@ if [ "$current_version" = "0.0.0" ] || [ "$(printf '%s\n' "$NVIDIA_CTK_MIN_VERSI print_warning "nvidia-ctk version ${current_version} is below minimum ${NVIDIA_CTK_MIN_VERSION}, upgrading..." fi - distribution=$(. /etc/os-release;echo $ID$VERSION_ID) + # shellcheck source=/dev/null + distribution=$(. 
/etc/os-release;echo "$ID$VERSION_ID") curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add - - curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ + curl -s -L "https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list" | \ sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list sudo apt-get update @@ -254,12 +255,15 @@ if ! command_exists nvkind; then sudo rm -rf /usr/local/go sudo tar -C /usr/local -xzf go${GO_VERSION}.linux-amd64.tar.gz export PATH=$PATH:/usr/local/go/bin + # shellcheck disable=SC2016 echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc fi print_status "Installing nvkind via go install..." go install github.com/NVIDIA/nvkind/cmd/nvkind@latest - export PATH=$PATH:$(go env GOPATH)/bin + GOPATH_BIN=$(go env GOPATH)/bin + export PATH="$PATH:$GOPATH_BIN" + # shellcheck disable=SC2016 echo 'export PATH=$PATH:$(go env GOPATH)/bin' >> ~/.bashrc cd .. else @@ -429,6 +433,7 @@ sudo bash install.sh # Add OSMO to PATH if not already there if [[ ":$PATH:" != *":$HOME/.osmo/bin:"* ]]; then export PATH="$HOME/.osmo/bin:$PATH" + # shellcheck disable=SC2016 echo 'export PATH="$HOME/.osmo/bin:$PATH"' >> ~/.bashrc fi diff --git a/deployments/brev/shellcheck_test.sh b/deployments/brev/shellcheck_test.sh new file mode 100755 index 000000000..66ce4107d --- /dev/null +++ b/deployments/brev/shellcheck_test.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +"$SHELLCHECK" --severity=warning "$SETUP_SH" From 38426c00062253db56ca9a5fcc5a3a31b05a215f Mon Sep 17 00:00:00 2001 From: Ethan Look-Potts Date: Thu, 9 Apr 2026 17:26:43 -0400 Subject: [PATCH 4/5] ci: add provider-compat job and brev launchable prompt --- .github/workflows/brev.yml | 254 ++++++++++++++----------------------- deployments/brev/README.md | 32 +++++ deployments/brev/prompt.md | 139 ++++++++++++++++++++ deployments/brev/setup.sh | 5 + 4 files changed, 273 insertions(+), 157 deletions(-) create mode 100644 deployments/brev/prompt.md diff --git a/.github/workflows/brev.yml b/.github/workflows/brev.yml index 66daaf489..98df60ee5 100644 --- a/.github/workflows/brev.yml +++ b/.github/workflows/brev.yml @@ -17,30 +17,45 @@ name: Brev Launchable on: - schedule: - - cron: '0 9 * * 1' # Every Monday at 9:00 AM UTC + # schedule: + # - cron: '0 9 * * 1' # Every Monday at 9:00 AM UTC — disabled until Brev supports API tokens workflow_dispatch: + inputs: + model: + description: 'Claude model for compatibility test' + default: 'aws/anthropic/claude-opus-4-5' concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: - e2e: - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + provider-compat: runs-on: ubuntu-latest - timeout-minutes: 300 - environment: brev-e2e - env: - INSTANCE_NAME: osmo-ci-${{ github.run_id }} - LAUNCHABLE_ID: env-36a6a7qnkOMOP2vgiBRaw2e3jpW - SSH_KEY: ${{ github.workspace }}/.brev/brev.pem + timeout-minutes: 360 
+ environment: brev-compat + permissions: + contents: write steps: - name: Checkout uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + with: + token: ${{ secrets.SVC_OSMO_CI_TOKEN }} + fetch-depth: 0 + + # ── Secret masking ────────────────────────────────────────────────────── + - name: Mask secrets + run: | + echo "::add-mask::${{ secrets.NGC_SERVICE_KEY }}" + echo "::add-mask::${{ secrets.BREV_API_TOKEN }}" + + # ── Tool setup ────────────────────────────────────────────────────────── + - name: Setup Node.js + uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # v6.2.0 + with: + node-version: 20 - # ── Brev setup ────────────────────────────────────────────────────────── - name: Install Brev CLI run: | curl -sfL https://raw.githubusercontent.com/brevdev/brev-cli/main/bin/install-brev.sh | bash @@ -51,161 +66,86 @@ jobs: BREV_API_TOKEN: ${{ secrets.BREV_API_TOKEN }} run: brev login --token "$BREV_API_TOKEN" - - name: Create Brev instance + - name: Install Claude Code skills run: | - brev create "$INSTANCE_NAME" --launchable "$LAUNCHABLE_ID" - echo "Instance $INSTANCE_NAME created" + brev agent-skill install # installs /brev-cli — always latest + mkdir -p ~/.claude/skills + cp -r skills/osmo-agent ~/.claude/skills/ # /osmo-agent — from repo - # ── Wait for OSMO to be ready ──────────────────────────────────────────── - - name: Wait for SSH availability + - name: Configure git run: | - echo "Waiting for SSH on $INSTANCE_NAME..." 
- for i in $(seq 1 30); do - INSTANCE_IP=$(brev ls --output json \ - | jq -r --arg n "$INSTANCE_NAME" '.[] | select(.name==$n) | .dns // .ip // empty') - SSH_USER=$(brev ls --output json \ - | jq -r --arg n "$INSTANCE_NAME" '.[] | select(.name==$n) | .username // "ubuntu"') - if [ -n "$INSTANCE_IP" ] && \ - ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o ConnectTimeout=5 \ - "$SSH_USER@$INSTANCE_IP" "echo ok" 2>/dev/null; then - echo "INSTANCE_IP=$INSTANCE_IP" >> "$GITHUB_ENV" - echo "SSH_USER=$SSH_USER" >> "$GITHUB_ENV" - echo "SSH available at $SSH_USER@$INSTANCE_IP" - exit 0 - fi - echo "Attempt $i/30 — retrying in 30s..." - sleep 30 - done - echo "::error::Timed out waiting for SSH" - exit 1 - - - name: Wait for OSMO setup to complete - run: | - echo "Polling until all OSMO pods are Running..." - for i in $(seq 1 60); do - NOT_READY=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no \ - "$SSH_USER@$INSTANCE_IP" \ - "kubectl get pods -n osmo --no-headers 2>/dev/null \ - | awk '{print \$3}' | grep -vcE '^(Running|Completed)$'" 2>/dev/null || echo "error") - if [ "$NOT_READY" = "0" ]; then - echo "OSMO is ready" - exit 0 - fi - echo "Attempt $i/60 — $NOT_READY pods not ready, retrying in 30s..." 
- sleep 30 - done - echo "::error::OSMO setup did not complete within 30 minutes" - ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \ - "kubectl get pods -n osmo" || true - exit 1 - - # ── Version check ──────────────────────────────────────────────────────── - - name: Check OSMO version - id: version-check - continue-on-error: true + git config user.name "brev-compat[bot]" + git config user.email "brev-compat[bot]@users.noreply.github.com" + + # ── Build and run prompt ───────────────────────────────────────────────── + - name: Render prompt run: | - INSTALLED=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no \ - "$SSH_USER@$INSTANCE_IP" \ - "helm list -n osmo -o json \ - | jq -r '.[] | select(.name==\"osmo\") | .chart' \ - | sed 's/quick-start-//'") - LATEST=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no \ - "$SSH_USER@$INSTANCE_IP" \ - "helm repo update osmo 2>/dev/null; \ - helm search repo osmo/quick-start -o json | jq -r '.[0].version'") - echo "Installed OSMO chart version : $INSTALLED" - echo "Latest OSMO chart version : $LATEST" - echo "osmo_installed=$INSTALLED" >> "$GITHUB_STEP_SUMMARY" - echo "osmo_latest=$LATEST" >> "$GITHUB_STEP_SUMMARY" - if [ "$INSTALLED" != "$LATEST" ]; then - echo "::error::OSMO version mismatch — installed: $INSTALLED, latest: $LATEST" - exit 1 - fi + sed \ + -e "s/{{GITHUB_RUN_ID}}/${{ github.run_id }}/g" \ + -e "s/{{GITHUB_SHA}}/${{ github.sha }}/g" \ + deployments/brev/prompt.md > "$RUNNER_TEMP/prompt.md" - # ── NGC credential ─────────────────────────────────────────────────────── - - name: Configure NGC registry credential + - name: Run compatibility matrix env: - NGC_API_KEY: ${{ secrets.NGC_API_KEY }} - run: | - ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \ - "osmo credential set ci-ngc-cred \ - --type REGISTRY \ - --payload registry=nvcr.io \ - username='\$oauthtoken' \ - auth='$NGC_API_KEY'" - - # ── Test helper ───────────────────────────────────────────────────────── - # 
Polls osmo workflow query until terminal state or timeout, then dumps logs. - # Usage: poll_workflow - - name: Write workflow poll helper - run: | - cat << 'EOF' > /tmp/poll_workflow.sh - #!/bin/bash - set -euo pipefail - WORKFLOW=$1 - TIMEOUT_MIN=${2:-30} - SSH_OPTS="-i $SSH_KEY -o StrictHostKeyChecking=no" - DEADLINE=$(( $(date +%s) + TIMEOUT_MIN * 60 )) - echo "Polling $WORKFLOW (timeout: ${TIMEOUT_MIN}m)..." - while true; do - STATUS=$(ssh $SSH_OPTS "$SSH_USER@$INSTANCE_IP" \ - "osmo workflow query $WORKFLOW --format-type json 2>/dev/null \ - | jq -r '.status // \"UNKNOWN\"'") - echo " Status: $STATUS" - case "$STATUS" in - COMPLETED) - echo "::notice::$WORKFLOW completed successfully" - ssh $SSH_OPTS "$SSH_USER@$INSTANCE_IP" "osmo workflow logs $WORKFLOW -n 50" || true - exit 0 ;; - FAILED*|CANCELED*) - echo "::error::$WORKFLOW failed with status $STATUS" - ssh $SSH_OPTS "$SSH_USER@$INSTANCE_IP" "osmo workflow logs $WORKFLOW -n 200" || true - exit 1 ;; - esac - if [ "$(date +%s)" -ge "$DEADLINE" ]; then - echo "::error::$WORKFLOW timed out after ${TIMEOUT_MIN}m (status: $STATUS)" - exit 1 - fi - sleep 30 - done - EOF - chmod +x /tmp/poll_workflow.sh - - # ── Test 1: Hello World (CPU, basic smoke test) ────────────────────────── - - name: 'Test: hello_world' + ANTHROPIC_API_KEY: ${{ secrets.NVIDIA_NIM_KEY }} + ANTHROPIC_BASE_URL: https://inference-api.nvidia.com + ANTHROPIC_MODEL: ${{ inputs.model || 'aws/anthropic/claude-opus-4-5' }} + DISABLE_PROMPT_CACHING: "1" + BREV_API_TOKEN: ${{ secrets.BREV_API_TOKEN }} + NGC_SERVICE_KEY: ${{ secrets.NGC_SERVICE_KEY }} run: | - ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \ - "curl -sfL https://raw.githubusercontent.com/NVIDIA/OSMO/${{ github.sha }}/cookbook/tutorials/hello_world.yaml \ - | sed 's/name: hello-osmo/name: ci-hello-${{ github.run_id }}/' \ - | osmo workflow submit -" - /tmp/poll_workflow.sh "ci-hello-${{ github.run_id }}" 10 - - # ── Test 2: Isaac Sim SDG (GPU smoke test) 
─────────────────────────────── - - name: 'Test: isaac_sim_sdg' + npx @anthropic-ai/claude-code@2.1.91 --print \ + --model "$ANTHROPIC_MODEL" \ + --allowedTools "Skill,Bash(brev *),Read,Write,Edit,Glob,Grep" \ + --max-turns 200 \ + "$(cat "$RUNNER_TEMP/prompt.md")" + + # ── Guardrail ──────────────────────────────────────────────────────────── + - name: Guardrail — README only run: | - ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \ - "curl -sfL https://raw.githubusercontent.com/NVIDIA/OSMO/${{ github.sha }}/cookbook/synthetic_data_generation/isaac_sim/isaac_sim_sdg.yaml \ - | sed 's/name: isaac-sim-sdg/name: ci-isaac-${{ github.run_id }}/' \ - | osmo workflow submit -" - /tmp/poll_workflow.sh "ci-isaac-${{ github.run_id }}" 60 - - # ── Test 3: Large image (disk-fill validation, not in cookbook) ────────── - - name: 'Test: large image (NeMo ~40GB)' + CHANGED=$(git diff --name-only HEAD) + ALLOWED="deployments/brev/README.md" + UNEXPECTED=$(echo "$CHANGED" | grep -v "^$ALLOWED$" || true) + if [ -n "$UNEXPECTED" ]; then + echo "::error::Claude modified files outside README.md:" + echo "$UNEXPECTED" + git checkout -- . 
+ exit 1 + fi + + # ── Commit results ─────────────────────────────────────────────────────── + - name: Commit README run: | - ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \ - "curl -sfL https://raw.githubusercontent.com/NVIDIA/OSMO/${{ github.sha }}/deployments/brev/disk-fill-test.yaml \ - | sed 's/name: disk-fill-test/name: ci-disk-${{ github.run_id }}/' \ - | osmo workflow submit -" - /tmp/poll_workflow.sh "ci-disk-${{ github.run_id }}" 90 - - # ── Assert version match (deferred so tests always run) ───────────────── - - name: Assert OSMO version is up to date - if: steps.version-check.outcome == 'failure' + if git diff --quiet HEAD -- deployments/brev/README.md; then + echo "No README changes to commit" + else + git add deployments/brev/README.md + git commit -m "chore: update brev compatibility matrix [skip ci]" + git push origin HEAD:main + fi + + # ── Fail if any instance regressed ────────────────────────────────────── + - name: Check compatibility result run: | - echo "::error::OSMO version check failed — see 'Check OSMO version' step for details" - exit 1 + if [ ! 
-f compat-result.txt ]; then + echo "::error::compat-result.txt not written — Claude may have failed" + exit 1 + fi + RESULT=$(cat compat-result.txt) + echo "Compatibility result: $RESULT" + if [ "$RESULT" = "FAIL" ]; then + echo "::error::One or more providers failed OSMO compatibility — see README matrix" + exit 1 + fi - # ── Cleanup (always runs) ──────────────────────────────────────────────── - - name: Delete Brev instance + # ── Cleanup (always) ───────────────────────────────────────────────────── + - name: Delete brev instances if: always() - run: brev delete "$INSTANCE_NAME" --force || true + env: + BREV_API_TOKEN: ${{ secrets.BREV_API_TOKEN }} + run: | + brev login --token "$BREV_API_TOKEN" 2>/dev/null || true + brev ls 2>/dev/null \ + | grep "osmo-compat-${{ github.run_id }}" \ + | awk '{print $1}' \ + | xargs -r -I{} sh -c 'brev delete {} || true' diff --git a/deployments/brev/README.md b/deployments/brev/README.md index 500c48a5f..0a49f7a03 100644 --- a/deployments/brev/README.md +++ b/deployments/brev/README.md @@ -30,6 +30,38 @@ The OSMO Brev deployment provides a pre-configured OSMO instance running in the - NVIDIA Container Toolkit (>=1.18.1) - NVIDIA Driver Version (>=575) +### Compatibility Matrix + + + +Last updated: 2026-04-09 + +| Provider | Instance Type | GPU | Hello World | Disk Fill | GPU Workload | Notes | +|----------|---------------|-----|-------------|-----------|--------------|-------| +| massedcompute | massedcompute_L40S | L40S 1× | ✅ | ✅ | ✅ | | +| massedcompute | massedcompute_L40 | L40 1× | ✅ | ✅ | ✅ | | +| hyperstack | hyperstack_L40 | L40 1× | ✅ | ✅ | ✅ | Driver <575 min | +| verda | verda_L40S | L40S 1× | ✅ | ✅ | ✅ | | +| scaleway | scaleway_L40S | L40S 1× | ✅ | ✅ | ✅ | Driver <575 min | +| crusoe | l40s-48gb.1x | L40S 1× | ✅ | ✅ | ❌ | nvidia-cdi-refresh failed; GPU not exposed | +| nebius | gpu-l40s-a.1gpu-8vcpu-32gb | L40S 1× | ❌ | ❌ | ❌ | Docker not pre-installed | +| aws | g6e.xlarge | L40S 1× | ❌ | ❌ | ❌ | brev SSH 
failure | + +**Test definitions:** +- **Hello World** — `ubuntu:22.04`, 1 CPU / 1Gi memory / 0 GPU +- **Disk Fill** — `nvcr.io/nvidia/nemo:24.12` (~40 GB); validates Docker data-root relocation +- **GPU Workload** — verifies GPU is exposed in the default pool, then runs MNIST CNN on `nvcr.io/nvidia/pytorch:24.03-py3` + +**Status codes:** ✅ · ❌ · `—` (not applicable) + + + + + ## Accessing the Brev Deployment ### Web UI Access diff --git a/deployments/brev/prompt.md b/deployments/brev/prompt.md new file mode 100644 index 000000000..f1e6eccdb --- /dev/null +++ b/deployments/brev/prompt.md @@ -0,0 +1,139 @@ +# OSMO Brev Provider Compatibility Test + +Run a compatibility test of the OSMO Brev launchable across GPU providers. No user interaction — proceed through all phases without interruption. + +## Environment + +| Variable | Value | +|----------|-------| +| Instance prefix | `osmo-compat-{{GITHUB_RUN_ID}}` | +| OSMO binary (on instances) | `/usr/local/bin/osmo` | +| NGC credential name | `ci-ngc-cred` | +| NGC key env var (CI runner) | `NGC_SERVICE_KEY` | +| setup.sh (local path) | `deployments/brev/setup.sh` | +| disk-fill workflow (local path) | `deployments/brev/disk-fill-test.yaml` | +| Hello world workflow URL | `https://raw.githubusercontent.com/NVIDIA/OSMO/{{GITHUB_SHA}}/cookbook/tutorials/hello_world.yaml` | +| GPU workflow URL | `https://raw.githubusercontent.com/NVIDIA/OSMO/{{GITHUB_SHA}}/cookbook/dnn_training/single_node/train.yaml` | +| GPU training script URL | `https://raw.githubusercontent.com/NVIDIA/OSMO/{{GITHUB_SHA}}/cookbook/dnn_training/single_node/train.py` | + +## Skill usage + +Use `/brev-cli` for **all** brev operations (search, create, exec, delete, status). + +OSMO runs on remote instances — never locally. Consult `/osmo-agent` for OSMO CLI +syntax only, then pass those commands through `brev exec`. Do not hardcode brev or +OSMO CLI flags; delegate through these skills so this prompt stays stable as CLIs evolve. 
+ +## Phase 1 — Discover instance types + +Use `/brev-cli` to search for available instances. Target: +- All available **L40** instance types across all providers (1 GPU each) +- All available **L40S** instance types across all providers (1 GPU each) + +For each unique provider+GPU combination select the cheapest available type. +Present a candidate table (provider, type, GPU, disk, $/hr) and proceed immediately +without waiting for confirmation. + +## Phase 2 — Create instances (parallel) + +Create all instances in parallel. Name each: +`osmo-compat-{{GITHUB_RUN_ID}}--` +(e.g. `osmo-compat-{{GITHUB_RUN_ID}}-hyperstack-a100-2g`) + +Do **not** use `--startup-script` — setup.sh exceeds the 16 KB limit. +Create bare instances, then run setup.sh via `brev exec @file` once READY. + +If creation fails for a specific type, retry up to 3 times before giving up. +After 3 failed attempts, record all tests as `❌` with the error note. + +## Phase 3 — Setup each instance (parallel across instances) + +For each successfully created instance: + +### 3a. Run setup script +``` +brev exec @deployments/brev/setup.sh +``` +This installs Docker, KIND, GPU operator, KAI scheduler, and OSMO (~15 min). + +### 3b. Wait for OSMO pods +Poll until all pods in namespace `osmo` are Running/Completed (max 30 min). + +### 3c. Configure NGC credential +Pass the key via the runner's environment variable — never print it: +```bash +brev exec -- bash -c \ + "osmo credential set ci-ngc-cred --type REGISTRY \ + --payload registry=nvcr.io username='\$oauthtoken' auth='$NGC_SERVICE_KEY'" +``` + +## Phase 4 — Test suite (parallel across instances, sequential per instance) + +If any step fails for an instance, record `❌` and skip all remaining steps +for that instance. Run instances in parallel. + +### Test A: Hello World (CPU) + +Fetch the hello world workflow from the cookbook URL above, copy to the instance, +and submit it. 
Parse the workflow ID from `osmo workflow submit` stdout: +``` +Workflow ID - +``` +Poll by that specific ID (not by name) every 30 s. Timeout: 15 min. + +Record: ✅ / ❌ + +### Test B: Disk Fill (~40 GB) + +Copy `deployments/brev/disk-fill-test.yaml` to the instance, submit it. +This pulls `nvcr.io/nvidia/nemo:24.12` using `ci-ngc-cred`, validating that the +Docker data-root relocation in setup.sh prevents root-partition exhaustion. +Poll by workflow ID. Timeout: 90 min. + +Record: ✅ / ❌ + +### Test C: GPU Workload + +First run `osmo pool list` and verify at least one GPU is available in the default +pool. If not, record `❌` with note "no GPUs in default pool" and stop — do +not submit the workflow. + +Otherwise, fetch the GPU training workflow and script from the cookbook URLs above. +Copy both to the instance. Submit the workflow. +Poll by workflow ID. Timeout: 30 min. + +If `brev exec` fails with a ControlPath/socket-path error for a specific instance, +record `❌` with note "brev SSH socket-path error". + +Record: ✅ / ❌ / `—` (CPU-only instance, GPU test not applicable) + +## Phase 5 — Teardown + +Delete all instances created in Phase 2 after tests complete. Always delete, even +if earlier phases failed. Do not ask for confirmation. + +## Phase 6 — Write results + +Update `deployments/brev/README.md`. Find the `### Compatibility Matrix` section and +replace the table with current results. Preserve all text outside that section. + +Table columns: +``` +| Provider | Instance Type | GPU | Hello World | Disk Fill | GPU Workload | Notes | +``` + +Set "Last updated" to today's date. Sort rows so fully-passing instances appear +first, partial failures next, and fully-failed instances last. Keep Notes brief +(≤8 words); omit notes for fully-passing instances. + +Status codes: +- ✅ — test passed +- ❌ — failure; include a note explaining the cause (OSMO bug, brev SSH error, + instance creation failure, etc.) +- `—` — not applicable (e.g.
GPU test on a CPU-only instance) + +After writing README.md, compare results against the previous matrix that was in +`deployments/brev/README.md` before this run (read and save that matrix before you overwrite the file). Write a single line to +`compat-result.txt` in the current working directory: +- `FAIL` if any instance that previously had ✅ now has ❌ +- `PASS` otherwise (new instances and previously-failing instances do not affect the result) diff --git a/deployments/brev/setup.sh b/deployments/brev/setup.sh index 81858751c..ddbf41653 100644 --- a/deployments/brev/setup.sh +++ b/deployments/brev/setup.sh @@ -199,6 +199,11 @@ while IFS= read -r line; do case "$MNT" in /dev|/dev/*|/proc|/sys|/sys/*|/run|/run/*|/boot|/boot/*|/snap/*) continue ;; esac + # Skip read-only filesystems (e.g. /mnt/cloud-metadata on Nebius) + if ! sudo mkdir -p "$MNT/.docker_write_test" 2>/dev/null; then + continue + fi + sudo rmdir "$MNT/.docker_write_test" 2>/dev/null || true if [ "$AVAIL" -gt "$DOCKER_DATA_ROOT_AVAIL" ] 2>/dev/null; then DOCKER_DATA_ROOT_AVAIL=$AVAIL DOCKER_DATA_ROOT_MOUNT=$MNT From 0eeee45e9103c5618de2ea23cb981fb35ad7578a Mon Sep 17 00:00:00 2001 From: Ethan Look-Potts Date: Thu, 9 Apr 2026 17:58:15 -0400 Subject: [PATCH 5/5] fix: update instance name example to use L40 slug --- deployments/brev/prompt.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/brev/prompt.md b/deployments/brev/prompt.md index f1e6eccdb..fddf1fd55 100644 --- a/deployments/brev/prompt.md +++ b/deployments/brev/prompt.md @@ -38,7 +38,7 @@ without waiting for confirmation. Create all instances in parallel. Name each: `osmo-compat-{{GITHUB_RUN_ID}}--` -(e.g. `osmo-compat-{{GITHUB_RUN_ID}}-hyperstack-a100-2g`) +(e.g. `osmo-compat-{{GITHUB_RUN_ID}}-hyperstack-l40-1g`) Do **not** use `--startup-script` — setup.sh exceeds the 16 KB limit. Create bare instances, then run setup.sh via `brev exec @file` once READY.