NVIDIA · elookpotts-nvidia · Apr 7, 2026 · Apr 8, 2026 · coderabbitai · Apr 8, 2026
@@ -0,0 +1,236 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+name: Brev Launchable
+
+# Triggers:
+#   - pull_request: only for changes under deployments/brev/ (runs the shellcheck job)
+#   - schedule / workflow_dispatch: runs the end-to-end job
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    branches: [main, 'feature/**', 'release/**']
+    paths:
+      - 'deployments/brev/**'
+  schedule:
+    - cron: '0 9 * * 1'  # Every Monday at 9:00 AM UTC
+  workflow_dispatch:
+
+# Least privilege: the jobs in this workflow only need to read the repository.
+permissions:
+  contents: read
+
+# One active run per workflow and ref. Only superseded pull-request runs are
+# cancelled; a scheduled or manual e2e run is left to finish so that it is not
+# interrupted while it still holds a provisioned Brev instance.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+############################
+#  Shellcheck (on PR only) #
+############################
+jobs:
+  shellcheck:
+    # Lint job for the shell scripts under deployments/brev; pull requests only.
+    runs-on: ubuntu-latest
+    if: ${{ github.event_name == 'pull_request' }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+      - name: Run shellcheck on setup.sh
+        uses: ludeeus/action-shellcheck@00cae500b08a931fb5698e11e79bfbd38e612a38 # 2.0.0
+        with:
+          # Findings at "warning" severity and above are reported.
+          severity: warning
+          scandir: ./deployments/brev
+          additional_files: setup.sh
+
+#########################################
+#  End-to-End (scheduled + manual only) #
+#########################################
+  e2e:
+    # End-to-end test: provisions a Brev instance from the launchable, waits for
+    # OSMO to come up, runs smoke workflows, and deletes the instance afterwards.
+    # Runs for scheduled and manually dispatched events only.
+    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    timeout-minutes: 300
+    environment: brev-e2e
+    env:
+      # Unique per run so concurrent or leftover instances never collide.
+      INSTANCE_NAME: osmo-ci-${{ github.run_id }}
+      LAUNCHABLE_ID: env-36a6a7qnkOMOP2vgiBRaw2e3jpW
+      # The Brev CLI keeps its files under $HOME/.brev (see the install step, which
+      # adds $HOME/.brev/bin to PATH), not under the repository checkout. $HOME is
+      # not available in job-level `env`, so the hosted-runner home is spelled out.
+      # NOTE(review): assumes a GitHub-hosted Ubuntu runner (HOME=/home/runner) and
+      # that the CLI stores the key as brev.pem — confirm against the Brev CLI docs.
+      SSH_KEY: /home/runner/.brev/brev.pem
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+      # ── Brev setup ──────────────────────────────────────────────────────────
+      - name: Install Brev CLI
+        run: |
+          # The default `run` shell is `bash -e` without pipefail, so a failed
+          # download would hand bash an empty script and the step would still pass.
+          # Enable pipefail so a curl error fails the step right here.
+          set -o pipefail
+          curl -sfL https://raw.githubusercontent.com/brevdev/brev-cli/main/bin/install-brev.sh | bash
+          # Make the CLI available on PATH for all subsequent steps.
+          echo "$HOME/.brev/bin" >> "$GITHUB_PATH"
+
+      - name: Login to Brev
+        # The token reaches the script through the environment instead of being
+        # interpolated into the command text.
+        run: |
+          brev login --token "$BREV_API_TOKEN"
+        env:
+          BREV_API_TOKEN: ${{ secrets.BREV_API_TOKEN }}
+
+      - name: Create Brev instance
+        run: |
+          # Provision the instance for this run from the configured launchable.
+          brev create "$INSTANCE_NAME" --launchable "$LAUNCHABLE_ID"
+          printf 'Instance %s created\n' "$INSTANCE_NAME"
+
+      # ── Wait for OSMO to be ready ────────────────────────────────────────────
+      - name: Wait for SSH availability
+        run: |
+          # Polls up to 30 times, 30s apart, until the instance shows up in `brev ls`
+          # and accepts an SSH login. On success, INSTANCE_IP and SSH_USER are
+          # exported to later steps through $GITHUB_ENV.
+          echo "Waiting for SSH on $INSTANCE_NAME..."
+          for i in $(seq 1 30); do
+            # Query Brev once per attempt and reuse the listing for both fields.
+            # The `|| true` guards turn a transient CLI or jq failure into a failed
+            # attempt instead of aborting the whole step (the shell runs with -e).
+            INSTANCES=$(brev ls --output json || true)
+            INSTANCE_IP=$(jq -r --arg n "$INSTANCE_NAME" \
+              '.[] | select(.name==$n) | .dns // .ip // empty' <<< "$INSTANCES" || true)
+            SSH_USER=$(jq -r --arg n "$INSTANCE_NAME" \
+              '.[] | select(.name==$n) | .username // "ubuntu"' <<< "$INSTANCES" || true)
+            if [ -n "$INSTANCE_IP" ] && [ -n "$SSH_USER" ] && \
+               ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
+                   "$SSH_USER@$INSTANCE_IP" "echo ok" 2>/dev/null; then
+              echo "INSTANCE_IP=$INSTANCE_IP" >> "$GITHUB_ENV"
+              echo "SSH_USER=$SSH_USER" >> "$GITHUB_ENV"
+              echo "SSH available at $SSH_USER@$INSTANCE_IP"
+              exit 0
+            fi
+            echo "Attempt $i/30 — retrying in 30s..."
+            sleep 30
+          done
+          echo "::error::Timed out waiting for SSH"
+          exit 1
+
+      - name: Wait for OSMO setup to complete
+        run: |
+          # Polls up to 60 times, 30s apart, until every pod in the osmo namespace is
+          # Running or Completed. On timeout, dumps the pod list and fails the step.
+          echo "Polling until all OSMO pods are Running..."
+          for i in $(seq 1 60); do
+            # Count not-ready pods with awk rather than `grep -c`: grep exits 1 when
+            # the count is 0, which would trigger the `|| echo "error"` fallback and
+            # yield "0<newline>error", so the ready check could never succeed.
+            # awk always exits 0 here, and it prints "none" when no pods are listed
+            # at all so that an empty namespace is not mistaken for "ready".
+            NOT_READY=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no \
+              "$SSH_USER@$INSTANCE_IP" \
+              "kubectl get pods -n osmo --no-headers 2>/dev/null \
+               | awk '\$3 != \"Running\" && \$3 != \"Completed\" { n++ } END { if (NR == 0) print \"none\"; else print n + 0 }'" \
+              2>/dev/null || echo "error")
+            if [ "$NOT_READY" = "0" ]; then
+              echo "OSMO is ready"
+              exit 0
+            fi
+            echo "Attempt $i/60 — $NOT_READY pods not ready, retrying in 30s..."
+            sleep 30
+          done
+          echo "::error::OSMO setup did not complete within 30 minutes"
+          # Best-effort diagnostics; must not mask the timeout failure below.
+          ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \
+            "kubectl get pods -n osmo" || true
+          exit 1
+
+      # ── Version check ────────────────────────────────────────────────────────
+      - name: Check OSMO version
+        id: version-check
+        # A mismatch must not stop the test steps; the failure is re-raised by the
+        # "Assert OSMO version is up to date" step at the end of the job.
+        continue-on-error: true
+        run: |
+          # Chart version of the installed "osmo" release, with the chart-name prefix removed.
+          INSTALLED=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no \
+            "$SSH_USER@$INSTANCE_IP" \
+            "helm list -n osmo -o json \
+             | jq -r '.[] | select(.name==\"osmo\") | .chart' \
+             | sed 's/quick-start-//'")
+          # First osmo/quick-start entry returned by the repo search after a refresh.
+          LATEST=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no \
+            "$SSH_USER@$INSTANCE_IP" \
+            "helm repo update osmo 2>/dev/null; \
+             helm search repo osmo/quick-start -o json | jq -r '.[0].version'")
+          echo "Installed OSMO chart version : $INSTALLED"
+          echo "Latest OSMO chart version    : $LATEST"
+          # key=value pairs belong in step outputs; the summary gets readable Markdown.
+          {
+            echo "osmo_installed=$INSTALLED"
+            echo "osmo_latest=$LATEST"
+          } >> "$GITHUB_OUTPUT"
+          {
+            echo "- Installed OSMO chart version: \`${INSTALLED:-unknown}\`"
+            echo "- Latest OSMO chart version: \`${LATEST:-unknown}\`"
+          } >> "$GITHUB_STEP_SUMMARY"
+          # Two empty results would compare equal and pass silently, so a lookup
+          # that produced nothing (or jq's "null") is treated as a failure.
+          if [ -z "$INSTALLED" ] || [ -z "$LATEST" ] || [ "$LATEST" = "null" ]; then
+            echo "::error::Could not determine OSMO chart versions — installed: '$INSTALLED', latest: '$LATEST'"
+            exit 1
+          fi
+          if [ "$INSTALLED" != "$LATEST" ]; then
+            echo "::error::OSMO version mismatch — installed: $INSTALLED, latest: $LATEST"
+            exit 1
+          fi
+
+      # ── NGC credential ───────────────────────────────────────────────────────
+      - name: Configure NGC registry credential
+        env:
+          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
+        run: |
+          # The API key is sent over SSH stdin and read into a variable on the remote
+          # side, instead of being spliced into the command string. The key is then
+          # never re-parsed by the remote shell (a quote or other metacharacter in it
+          # cannot break or alter the command) and does not appear in the local ssh
+          # argument list. \$oauthtoken is escaped so the remote side receives the
+          # literal user name '$oauthtoken'.
+          ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \
+            "IFS= read -r NGC_API_KEY && osmo credential set ci-ngc-cred \
+               --type REGISTRY \
+               --payload registry=nvcr.io \
+               username='\$oauthtoken' \
+               auth=\"\$NGC_API_KEY\"" <<< "$NGC_API_KEY"
+
+      # ── Test helper ─────────────────────────────────────────────────────────
+      # Polls osmo workflow query until terminal state or timeout, then dumps logs.
+      # Usage: /tmp/poll_workflow.sh <workflow-name> [timeout-minutes]   (timeout defaults to 30)
+      - name: Write workflow poll helper
+        run: |
+          # The heredoc delimiter is quoted, so nothing is expanded while writing the
+          # file; SSH_KEY, SSH_USER and INSTANCE_IP are read from the environment when
+          # the helper runs in later steps.
+          cat << 'EOF' > /tmp/poll_workflow.sh
+          #!/bin/bash
+          # Polls `osmo workflow query` over SSH every 30s until the workflow reaches
+          # a terminal state or the timeout expires, then prints recent logs.
+          #   $1  workflow name (required)
+          #   $2  timeout in minutes (optional, default 30)
+          # Exit status: 0 on COMPLETED, 1 on FAILED*/CANCELED* or timeout.
+          set -euo pipefail
+          WORKFLOW=${1:?usage: poll_workflow.sh WORKFLOW_NAME [TIMEOUT_MINUTES]}
+          TIMEOUT_MIN=${2:-30}
+          # An array keeps each option a separate word even if the key path has spaces.
+          SSH_OPTS=(-i "$SSH_KEY" -o StrictHostKeyChecking=no)
+          DEADLINE=$(( $(date +%s) + TIMEOUT_MIN * 60 ))
+          echo "Polling $WORKFLOW (timeout: ${TIMEOUT_MIN}m)..."
+          while true; do
+            # A dropped SSH connection or failed query must not abort the poll under
+            # `set -e`; treat it as UNKNOWN and retry until the deadline.
+            STATUS=$(ssh "${SSH_OPTS[@]}" "$SSH_USER@$INSTANCE_IP" \
+              "osmo workflow query $WORKFLOW --format-type json 2>/dev/null \
+               | jq -r '.status // \"UNKNOWN\"'" || true)
+            STATUS=${STATUS:-UNKNOWN}
+            echo "  Status: $STATUS"
+            case "$STATUS" in
+              COMPLETED)
+                echo "::notice::$WORKFLOW completed successfully"
+                ssh "${SSH_OPTS[@]}" "$SSH_USER@$INSTANCE_IP" "osmo workflow logs $WORKFLOW -n 50" || true
+                exit 0 ;;
+              FAILED*|CANCELED*)
+                echo "::error::$WORKFLOW failed with status $STATUS"
+                ssh "${SSH_OPTS[@]}" "$SSH_USER@$INSTANCE_IP" "osmo workflow logs $WORKFLOW -n 200" || true
+                exit 1 ;;
+            esac
+            if [ "$(date +%s)" -ge "$DEADLINE" ]; then
+              echo "::error::$WORKFLOW timed out after ${TIMEOUT_MIN}m (status: $STATUS)"
+              exit 1
+            fi
+            sleep 30
+          done
+          EOF
+          chmod +x /tmp/poll_workflow.sh
+
+      # ── Test 1: Hello World (CPU, basic smoke test) ──────────────────────────
+      - name: 'Test: hello_world'
+        env:
+          # Run-unique workflow name, and the spec as of the commit under test.
+          WORKFLOW_NAME: ci-hello-${{ github.run_id }}
+          SPEC_URL: https://raw.githubusercontent.com/NVIDIA/OSMO/${{ github.sha }}/cookbook/tutorials/hello_world.yaml
+        run: |
+          # Fetch the spec on the instance, rename the workflow, submit it from stdin,
+          # then wait up to 10 minutes for a terminal state.
+          ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \
+            "curl -sfL $SPEC_URL \
+               | sed 's/name: hello-osmo/name: $WORKFLOW_NAME/' \
+               | osmo workflow submit -"
+          /tmp/poll_workflow.sh "$WORKFLOW_NAME" 10
+
+      # ── Test 2: Isaac Sim SDG (GPU smoke test) ───────────────────────────────
+      - name: 'Test: isaac_sim_sdg'
+        env:
+          # Run-unique workflow name, and the spec as of the commit under test.
+          WORKFLOW_NAME: ci-isaac-${{ github.run_id }}
+          SPEC_URL: https://raw.githubusercontent.com/NVIDIA/OSMO/${{ github.sha }}/cookbook/synthetic_data_generation/isaac_sim/isaac_sim_sdg.yaml
+        run: |
+          # Fetch the spec on the instance, rename the workflow, submit it from stdin,
+          # then wait up to 60 minutes for a terminal state.
+          ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \
+            "curl -sfL $SPEC_URL \
+               | sed 's/name: isaac-sim-sdg/name: $WORKFLOW_NAME/' \
+               | osmo workflow submit -"
+          /tmp/poll_workflow.sh "$WORKFLOW_NAME" 60
+
+      # ── Test 3: Large image (disk-fill validation, not in cookbook) ──────────
+      - name: 'Test: large image (NeMo ~40GB)'
+        env:
+          # Run-unique workflow name, and the spec as of the commit under test.
+          WORKFLOW_NAME: ci-disk-${{ github.run_id }}
+          SPEC_URL: https://raw.githubusercontent.com/NVIDIA/OSMO/${{ github.sha }}/deployments/brev/disk-fill-test.yaml
+        run: |
+          # Fetch the spec on the instance, rename the workflow, submit it from stdin,
+          # then wait up to 90 minutes for a terminal state.
+          ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "$SSH_USER@$INSTANCE_IP" \
+            "curl -sfL $SPEC_URL \
+               | sed 's/name: disk-fill-test/name: $WORKFLOW_NAME/' \
+               | osmo workflow submit -"
+          /tmp/poll_workflow.sh "$WORKFLOW_NAME" 90
+
+      # ── Assert version match (deferred so tests always run) ─────────────────
+      - name: Assert OSMO version is up to date
+        # Re-raises the result of the earlier version check, which is allowed to
+        # fail without stopping the job (continue-on-error).
+        if: ${{ steps.version-check.outcome == 'failure' }}
+        run: |
+          echo "::error::OSMO version check failed — see 'Check OSMO version' step for details"
+          exit 1
+
+      # ── Cleanup (always runs) ────────────────────────────────────────────────
+      - name: Delete Brev instance
+        if: always()
+        run: |
+          # Best-effort teardown: never fail the job here, but surface a warning so
+          # that an instance left running is noticed instead of silently ignored.
+          brev delete "$INSTANCE_NAME" --force \
+            || echo "::warning::Failed to delete Brev instance $INSTANCE_NAME — remove it manually"
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# CI validation workflow for the Brev launchable (deployments/brev/).
+#
+# Purpose:
+#   Validates that the Docker data-root relocation in setup.sh correctly moves image
+#   storage off the root partition. Pulls nvcr.io/nvidia/nemo:24.12 (~40 GB), which is
+#   large enough to exhaust the root filesystem on a Brev instance if the fix is absent.
+#   A successful run confirms that image layers are written to the larger mounted disk.
+#
+# Used by: .github/workflows/brev.yml (weekly E2E job, "Test: large image" step)
+#
+# Manual use:
+#   Prerequisites — register your NGC API key once after OSMO setup:
+#     osmo credential set my-ngc-cred \
+#         --type REGISTRY \
+#         --payload registry=nvcr.io \
+#         username='$oauthtoken' \
+#         auth=<your_ngc_api_key>
+#   Then: osmo workflow submit disk-fill-test.yaml
+
+workflow:
+  # The CI job rewrites this name with sed, matching the literal text
+  # "name: disk-fill-test" — keep this line unquoted and unchanged.
+  name: disk-fill-test
+  resources:
+    default:
+      cpu: 1
+      memory: 2Gi
+      storage: 1Gi
+  tasks:
+  - name: large-image
+    # Pulling this image is the actual test; the command just imports NeMo and
+    # prints its version to show that the container started.
+    image: nvcr.io/nvidia/nemo:24.12
+    command:
+    - "python3"
+    args:
+    - "-c"
+    - "import nemo; print(f'NeMo {nemo.__version__} running on OSMO — disk fix verified')"