Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
82d0ff9
Refactor and fix data-table variable height rows (#667)
fernandol-nvidia Mar 10, 2026
88fd200
Ethany/support large workflows (#655)
ethany-nv Mar 10, 2026
211404d
Add default filter (Status) to occupancy page and compact bytes unit …
fernandol-nvidia Mar 10, 2026
00b527c
Misc UI improvements and polish (#669)
fernandol-nvidia Mar 10, 2026
122af26
new fix for prometheus ports (#670)
cypres Mar 10, 2026
5501daa
Catch error when logs time out (#662)
RyaliNvidia Mar 10, 2026
9cc68cd
prometheus: redo fix by using dev mode (#673)
cypres Mar 10, 2026
2fff7c4
Fix auto-refresh on workflow details page (#671)
fernandol-nvidia Mar 11, 2026
d09e694
Fix bad cli call in the docs (#679)
RyaliNvidia Mar 11, 2026
cf902f3
Fix PYTHONPATH to include osmo_workspace+ from runfiles (#671) (#672)
tdewanNvidia Mar 11, 2026
b53f053
Ecolter/fix deploy scripts aws (#571)
ecolternv Mar 11, 2026
e58bd4f
Speedup UpdateGroup Jobs (#676)
ethany-nv Mar 11, 2026
5c7deab
readme: add pointer to azure reference arch (#685)
adelbertc Mar 11, 2026
a882e83
Rudimentary secret redaction (#678)
cypres Mar 11, 2026
4cc76c2
Occupancy Page Cross Linking and UI Polish (#677)
fernandol-nvidia Mar 11, 2026
71969cc
Fix: Connection test to not just check 200 (#688)
tdewanNvidia Mar 11, 2026
bcb07fa
Fix auxilary pod cleanup (#687)
tdewanNvidia Mar 11, 2026
ba252dc
Fix authz sidecar evaluating tokens (#683)
RyaliNvidia Mar 11, 2026
a484225
fix backend listener (#681)
xutongNV Mar 11, 2026
a684427
Add some color to the filters and update occupancy links (#689)
RyaliNvidia Mar 11, 2026
19c3e76
Pool Quota: Address Case when Pools have No Nodes (#691)
ethany-nv Mar 11, 2026
5edeab4
#686 - Add verbose option for config show POOL (#690)
ecolternv Mar 11, 2026
fdb0014
Add Workflow Requirements and detailed codebase index to AGENTS.md (#…
jiaenren Mar 12, 2026
97c7db5
Add datetime filter for Workflows Page (#693)
fernandol-nvidia Mar 12, 2026
d1982a9
Add --offset option to workflow list (#696)
ecolternv Mar 12, 2026
7bd98c8
#357 - Group template unit tests (#697)
ecolternv Mar 12, 2026
2f8b0ca
Merge branch 'main' into xutongr/sync
xutongNV Mar 12, 2026
c999894
check out main src/service/core/tests/
xutongNV Mar 12, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .coderabbit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ reviews:
profile: chill # assertive / chill profile
auto_review:
enabled: true # Enable auto-review for this repository
path_instructions:
- path: "src/**"
instructions: "If this PR adds, removes, or renames a service, module, or major component, check that AGENTS.md is updated accordingly."
197 changes: 190 additions & 7 deletions AGENTS.md

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@AGENTS.md

## Tool Usage Preferences

- Use specialized tools (Read, Edit, Write, Grep, Glob) instead of Bash commands whenever possible
- Bash tools require user intervention to allow and should only be used as a last resort
- Prefer Read over cat, Edit over sed, Write over echo/heredoc, Grep over grep, and Glob over find
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ Select one of the deployment options below depending on your needs and environme
</a>
</div>

**Deploying on Microsoft Azure?** Get started with the [Azure NVIDIA Reference Architecture](https://github.com/microsoft/physical-ai-toolchain). This jointly published architecture delivers a production-ready Physical AI pipeline on Microsoft Azure, integrating OSMO, Isaac Lab, and Isaac Sim with GPU-accelerated RL training, auto-scaling compute, and enterprise-grade Kubernetes security.


## Documentation
Expand Down
6 changes: 5 additions & 1 deletion bzl/py.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,14 @@ import glob
pythonpath = ["{runfiles_dir}/_main"]
local_runfiles_dir = "{runfiles_dir}"
local_main_dir = "{runfiles_dir}/_main"

if os.path.isdir("/osmo_workspace+"):
local_runfiles_dir = "/osmo_workspace+" + local_runfiles_dir
local_main_dir = local_runfiles_dir + "/osmo_workspace+"
pythonpath.append(local_runfiles_dir + "/osmo_workspace+")

osmo_ws_path = local_runfiles_dir + "/osmo_workspace+"
if os.path.isdir(osmo_ws_path):
pythonpath.append(osmo_ws_path)

# Add all site-packages directories
site_packages = glob.glob(local_runfiles_dir + "/rules_python++pip+*/site-packages")
Expand Down
2 changes: 1 addition & 1 deletion deployments/charts/router/templates/_sidecar-helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ OAuth2 Proxy sidecar container
ports:
- name: http
containerPort: {{ .Values.sidecars.oauth2Proxy.httpPort }}
- name: metrics
- name: oauth2-metrics
containerPort: {{ .Values.sidecars.oauth2Proxy.metricsPort }}
livenessProbe:
httpGet:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ OAuth2 Proxy sidecar container
ports:
- name: http
containerPort: {{ .Values.sidecars.oauth2Proxy.httpPort }}
- name: metrics
- name: oauth2-metrics
containerPort: {{ .Values.sidecars.oauth2Proxy.metricsPort }}
livenessProbe:
httpGet:
Expand Down
5 changes: 5 additions & 0 deletions deployments/charts/service/templates/api-monitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ spec:
port: envoy-admin
path: /stats/prometheus
{{- end }}
{{- if .Values.sidecars.oauth2Proxy.enabled }}
- interval: 15s
port: oauth2-metrics
path: /metrics
{{- end }}
selector:
matchExpressions:
- { key: app,
Expand Down
2 changes: 1 addition & 1 deletion deployments/charts/web-ui/templates/_sidecar-helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ OAuth2 Proxy sidecar container
ports:
- name: http
containerPort: {{ .Values.sidecars.oauth2Proxy.httpPort }}
- name: metrics
- name: oauth2-metrics
containerPort: {{ .Values.sidecars.oauth2Proxy.metricsPort }}
livenessProbe:
httpGet:
Expand Down
30 changes: 30 additions & 0 deletions deployments/scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ The main entry point for deploying OSMO. This script orchestrates:
| `--destroy` | Destroy all resources |
| `--dry-run` | Show what would be done without making changes |
| `--non-interactive` | Fail if required parameters are missing (for CI/CD) |
| `--ngc-api-key` | NGC API key for pulling images and Helm charts from `nvcr.io` (optional) |
| `-h, --help` | Show help message |

#### Azure-specific Options
Expand Down Expand Up @@ -221,18 +222,47 @@ The script will prompt for:
--aws-region "us-west-2" \
--cluster-name "osmo-aws" \
--postgres-password "SecurePass123!" \
--redis-password "SecureRedisToken123!" \
--non-interactive
```

> **Note:** Keep cluster names short (≤12 characters) to avoid AWS IAM role name length limits.

### Deployment with NGC Registry Credentials

Required when pulling OSMO images and Helm charts from a private registry in `nvcr.io`.

```bash
# Via flag
./deploy-osmo-minimal.sh --provider aws \
--aws-region "us-west-2" \
--cluster-name "osmo-aws" \
--postgres-password "SecurePass123!" \
--redis-password "SecureRedisToken123!" \
--ngc-api-key "$NGC_API_KEY"

# Via environment variable
export NGC_API_KEY="your-ngc-api-key"
./deploy-osmo-minimal.sh --provider aws \
--aws-region "us-west-2" \
--cluster-name "osmo-aws" \
--postgres-password "SecurePass123!" \
--redis-password "SecureRedisToken123!"
```

When an NGC API key is provided, the script:
1. Authenticates `helm repo add` with `--username='$oauthtoken' --password=<NGC_API_KEY>`
2. Creates an `nvcr-secret` docker-registry secret in each of the three OSMO namespaces (`OSMO_NAMESPACE`, `OSMO_OPERATOR_NAMESPACE`, and `OSMO_WORKFLOWS_NAMESPACE`)
3. Configures all Helm charts to use `nvcr-secret` as the image pull secret

## Environment Variables

| Variable | Description | Default |
|----------|-------------|---------|
| `OSMO_IMAGE_REGISTRY` | OSMO Docker image registry | `nvcr.io/nvidia/osmo` |
| `OSMO_IMAGE_TAG` | OSMO Docker image tag | `latest` |
| `BACKEND_TOKEN_EXPIRY` | Backend operator token expiry | `2027-01-01` |
| `NGC_API_KEY` | NGC API key for `nvcr.io` image and Helm chart pulls | - |
| `TF_SUBSCRIPTION_ID` | Azure subscription ID | - |
| `TF_RESOURCE_GROUP` | Azure resource group | - |
| `TF_POSTGRES_PASSWORD` | PostgreSQL password | - |
Expand Down
2 changes: 1 addition & 1 deletion deployments/scripts/aws/terraform.sh
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ node_group_max_size = 5
node_group_desired_size = 3

# RDS Configuration
rds_engine_version = "15.4"
rds_engine_version = "15.12"
rds_instance_class = "db.t3.medium"
rds_db_name = "osmo"
rds_username = "postgres"
Expand Down
67 changes: 61 additions & 6 deletions deployments/scripts/deploy-k8s.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ OSMO_WORKFLOWS_NAMESPACE="${OSMO_WORKFLOWS_NAMESPACE:-osmo-workflows}"
OSMO_IMAGE_REGISTRY="${OSMO_IMAGE_REGISTRY:-nvcr.io/nvidia/osmo}"
OSMO_IMAGE_TAG="${OSMO_IMAGE_TAG:-latest}"
BACKEND_TOKEN_EXPIRY="${BACKEND_TOKEN_EXPIRY:-2027-01-01}"
NGC_API_KEY="${NGC_API_KEY:-}"
NGC_SECRET_NAME="nvcr-secret"

# Provider-specific settings (set by loading provider script)
PROVIDER=""
Expand Down Expand Up @@ -94,6 +96,10 @@ parse_k8s_args() {
DRY_RUN=true
shift
;;
--ngc-api-key)
NGC_API_KEY="$2"
shift 2
;;
*)
shift
;;
Expand Down Expand Up @@ -286,6 +292,34 @@ data:
log_success "Secrets created"
}

#######################################
# Create the NGC (nvcr.io) image pull secret in each OSMO namespace.
#
# Skips all work when NGC_API_KEY is empty. In dry-run mode it only logs the
# namespaces that would receive the secret.
#
# Globals (all read-only here):
#   NGC_API_KEY, NGC_SECRET_NAME, DRY_RUN,
#   OSMO_NAMESPACE, OSMO_OPERATOR_NAMESPACE, OSMO_WORKFLOWS_NAMESPACE,
#   RUN_KUBECTL_APPLY_STDIN - command string invoked with the rendered
#     manifest as its single argument
# Arguments: none
# Returns:
#   0 when the secrets were applied or the step was skipped,
#   1 when a manifest could not be rendered or applied
#######################################
create_image_pull_secrets() {
  if [[ -z "$NGC_API_KEY" ]]; then
    log_info "NGC_API_KEY not set, skipping image pull secret creation"
    return
  fi

  log_info "Creating NGC image pull secrets..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY-RUN] Would create $NGC_SECRET_NAME in namespaces: $OSMO_NAMESPACE, $OSMO_OPERATOR_NAMESPACE, $OSMO_WORKFLOWS_NAMESPACE"
    return
  fi

  # Keep the loop variable local so it cannot clobber a caller's "namespace".
  local namespace
  local secret_yaml
  for namespace in "$OSMO_NAMESPACE" "$OSMO_OPERATOR_NAMESPACE" "$OSMO_WORKFLOWS_NAMESPACE"; do
    # Render the manifest client-side, then hand it to the apply wrapper.
    # NOTE(review): the API key is passed on kubectl's argv and may be visible
    # in process listings while the command runs.
    if ! secret_yaml=$(kubectl create secret docker-registry "$NGC_SECRET_NAME" \
      --docker-server=nvcr.io \
      --docker-username='$oauthtoken' \
      --docker-password="$NGC_API_KEY" \
      --namespace "$namespace" \
      --dry-run=client -o yaml); then
      # Never apply an empty/partial manifest and then report success.
      printf 'Failed to render %s for namespace %s\n' "$NGC_SECRET_NAME" "$namespace" >&2
      return 1
    fi

    # Intentionally unquoted: RUN_KUBECTL_APPLY_STDIN holds a command string.
    if ! $RUN_KUBECTL_APPLY_STDIN "$secret_yaml"; then
      printf 'Failed to apply %s in namespace %s\n' "$NGC_SECRET_NAME" "$namespace" >&2
      return 1
    fi
    log_info " Applied $NGC_SECRET_NAME in namespace $namespace"
  done

  log_success "NGC image pull secrets created"
}

###############################################################################
# Helm Functions
###############################################################################
Expand All @@ -301,7 +335,12 @@ add_helm_repos() {
if [[ "$IS_PRIVATE_CLUSTER" == "true" ]]; then
$RUN_HELM "repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo && helm repo update"
else
helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo || true
if [[ -n "$NGC_API_KEY" ]]; then
helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo \
--username='$oauthtoken' --password="$NGC_API_KEY" || true
else
helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo || true
fi
helm repo update
fi

Expand All @@ -311,12 +350,18 @@ add_helm_repos() {
create_helm_values() {
log_info "Creating OSMO Helm values files..."

local ngc_pull_secret_yaml=""
if [[ -n "$NGC_API_KEY" ]]; then
ngc_pull_secret_yaml=" imagePullSecret: ${NGC_SECRET_NAME}"
fi

# Service values
cat > "$VALUES_DIR/service_values.yaml" <<EOF
# OSMO Service Values - Auto-generated
global:
osmoImageLocation: ${OSMO_IMAGE_REGISTRY}
osmoImageTag: ${OSMO_IMAGE_TAG}
${ngc_pull_secret_yaml}

services:
configFile:
Expand Down Expand Up @@ -376,6 +421,7 @@ EOF
global:
osmoImageLocation: ${OSMO_IMAGE_REGISTRY}
osmoImageTag: ${OSMO_IMAGE_TAG}
${ngc_pull_secret_yaml}

services:
ui:
Expand All @@ -398,6 +444,7 @@ EOF
global:
osmoImageLocation: ${OSMO_IMAGE_REGISTRY}
osmoImageTag: ${OSMO_IMAGE_TAG}
${ngc_pull_secret_yaml}

services:
configFile:
Expand Down Expand Up @@ -433,6 +480,7 @@ EOF
global:
osmoImageLocation: ${OSMO_IMAGE_REGISTRY}
osmoImageTag: ${OSMO_IMAGE_TAG}
${ngc_pull_secret_yaml}
serviceUrl: http://osmo-agent.${OSMO_NAMESPACE}.svc.cluster.local
agentNamespace: ${OSMO_OPERATOR_NAMESPACE}
backendNamespace: ${OSMO_WORKFLOWS_NAMESPACE}
Expand Down Expand Up @@ -523,7 +571,7 @@ setup_backend_operator() {
else
# Port forward to OSMO service
log_info "Starting port-forward to OSMO service..."
kubectl port-forward service/osmo-service 9000:80 -n "$OSMO_NAMESPACE" &
$RUN_KUBECTL "port-forward service/osmo-service 9000:80 -n $OSMO_NAMESPACE" &
local port_forward_pid=$!
sleep 5

Expand All @@ -540,10 +588,12 @@ setup_backend_operator() {
-t json 2>/dev/null | jq -r '.token' || echo "")

if [[ -n "$backend_token" && "$backend_token" != "null" ]]; then
kubectl create secret generic osmo-operator-token \
local token_secret_yaml
token_secret_yaml=$(kubectl create secret generic osmo-operator-token \
--from-literal=token="$backend_token" \
--namespace "$OSMO_OPERATOR_NAMESPACE" \
--dry-run=client -o yaml | kubectl apply -f -
--dry-run=client -o yaml)
$RUN_KUBECTL_APPLY_STDIN "$token_secret_yaml"

log_success "Backend token created"
token_created=true
Expand Down Expand Up @@ -585,6 +635,10 @@ cleanup_osmo() {
$RUN_HELM "uninstall router-minimal --namespace $OSMO_NAMESPACE" 2>/dev/null || true
$RUN_HELM "uninstall osmo-operator --namespace $OSMO_OPERATOR_NAMESPACE" 2>/dev/null || true

for namespace in "$OSMO_NAMESPACE" "$OSMO_OPERATOR_NAMESPACE" "$OSMO_WORKFLOWS_NAMESPACE"; do
$RUN_KUBECTL "delete secret $NGC_SECRET_NAME --namespace $namespace --ignore-not-found=true" 2>/dev/null || true
done

$RUN_KUBECTL "delete namespace $OSMO_NAMESPACE" 2>/dev/null || true
$RUN_KUBECTL "delete namespace $OSMO_OPERATOR_NAMESPACE" 2>/dev/null || true
$RUN_KUBECTL "delete namespace $OSMO_WORKFLOWS_NAMESPACE" 2>/dev/null || true
Expand Down Expand Up @@ -667,16 +721,17 @@ deploy_k8s_main() {
add_helm_repos
create_database
create_secrets
create_image_pull_secrets
create_helm_values

deploy_osmo_service
deploy_osmo_ui
deploy_osmo_router

wait_for_pods "$OSMO_NAMESPACE" 300 "" "kubectl"
wait_for_pods "$OSMO_NAMESPACE" 300 "" "$RUN_KUBECTL"

setup_backend_operator
wait_for_pods "$OSMO_OPERATOR_NAMESPACE" 180 "" "kubectl"
wait_for_pods "$OSMO_OPERATOR_NAMESPACE" 180 "" "$RUN_KUBECTL"

verify_deployment
print_access_instructions
Expand Down
Loading
Loading