Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,14 @@ docs_tools:
# Testing Configuration
testing:
kind_node_image: 'kindest/node:v1.32.0'

# Component test harness configuration
# Used by tools/component-test/ scripts to validate individual components
component_test:
# NOTE(review): presumably nvml_mock_version is the tag used with nvml_mock_image — confirm in the harness scripts.
nvml_mock_version: '0.1.0'
nvml_mock_image: 'ghcr.io/nvidia/nvml-mock'
# NOTE(review): presumably the fallbacks when GPU_PROFILE / GPU_COUNT are not supplied — confirm.
default_gpu_profile: 'a100'
default_gpu_count: 8
cluster_name: 'aicr-component-test'
# Both timeouts below are five minutes, written with different unit spellings ('300s' vs '5m').
# NOTE(review): confirm each consumer accepts the spelling it is given.
helm_timeout: '300s'
health_check_timeout: '5m'
16 changes: 16 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,22 @@ This creates three files with TODOs guiding implementation:

**See [docs/contributor/validator.md](docs/contributor/validator.md) for complete guide with examples, architecture overview, and troubleshooting.**

#### Adding a Component

AICR components are declarative — add an entry to `recipes/registry.yaml` with
Helm or Kustomize settings, create a `values.yaml`, and optionally add a health
check. No Go code needed.

**Validate your component:**
```bash
make build
make component-test COMPONENT=my-component
```

This auto-detects the right test tier, creates a Kind cluster, deploys the
component, and runs its health check. See
[tools/component-test/README.md](tools/component-test/README.md) for details.

## Design Principles

These principles guide all design decisions in AICR. When faced with trade-offs, these principles take precedence.
Expand Down
42 changes: 42 additions & 0 deletions DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -802,6 +802,48 @@ make validate-local RECIPE=recipe.yaml IMAGE_TAG=dev
make qualify
```

## Testing a New Component

The component test harness validates that a component deploys and passes its
health check in an isolated Kind cluster. No GPU hardware required for most
components.

### Quick Start

```bash
# Build aicr, then test your component
make build
make component-test COMPONENT=cert-manager
```

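The harness auto-detects the test tier (`scheduling`, `deploy`, or `gpu-aware`).
For the `deploy` and `gpu-aware` tiers it creates a Kind cluster, deploys the
component, and runs its health check. Components in the `scheduling` tier are
not exercised by this harness: `make component-test` exits with code 2 and
points you to `make kwok-e2e RECIPE=<recipe-name>` instead.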

### Available Targets

```bash
make component-test COMPONENT=cert-manager # Full end-to-end test
make component-detect COMPONENT=cert-manager # Show detected tier
make component-cluster # Create/reuse cluster
make component-deploy COMPONENT=cert-manager # Deploy only
make component-health COMPONENT=cert-manager # Health check only
make component-cleanup COMPONENT=cert-manager # Uninstall component
```

### Debugging

```bash
# Keep cluster for inspection
KEEP_CLUSTER=true make component-test COMPONENT=cert-manager

# Inspect and re-run
kubectl -n cert-manager get pods
make component-health COMPONENT=cert-manager
```

See [tools/component-test/README.md](tools/component-test/README.md) for full
environment variable reference and troubleshooting.

## Validator Development

For detailed information on adding validation checks and constraint validators, see:
Expand Down
67 changes: 67 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,73 @@ endif
kwok-test-all: build ## Run all KWOK recipe tests in a shared cluster
@bash kwok/scripts/run-all-recipes.sh

# =============================================================================
# Component Testing
# =============================================================================

.PHONY: component-test
# Run the component test harness end-to-end for a single component.
#
# Inputs (make command line or environment):
#   COMPONENT       (required) component name handed to the harness scripts.
#   TIER            (optional) test tier; when unset or empty it is taken from
#                   the output of tools/component-test/detect-tier.sh.
#   KEEP_CLUSTER    (optional) when "true", cleanup.sh is not run on exit.
#   GPU_PROFILE, GPU_COUNT, HELM_NAMESPACE
#                   (optional) forwarded to the harness scripts; empty if unset.
#
# Flow: resolve the tier, bail out early for the "scheduling" tier (handled by
# KWOK, exit code 2), then ensure the cluster, set up the GPU mock for the
# "gpu-aware" tier, deploy the component and run its health check.
#
# The scheduling check runs before the cleanup trap and before
# ensure-cluster.sh so that no cluster work is done for a run that is known
# not to execute any test.
component-test: build ## Test a single component end-to-end (COMPONENT=cert-manager [TIER=deploy])
ifndef COMPONENT
	@echo "Error: COMPONENT is required"
	@echo "Usage: make component-test COMPONENT=cert-manager"
	@echo " make component-test COMPONENT=gpu-operator TIER=gpu-aware"
	@exit 1
endif
	@set -e; \
	TIER=$${TIER:-$$(bash tools/component-test/detect-tier.sh "$(COMPONENT)")}; \
	echo "[INFO] Detected tier: $$TIER"; \
	if [ "$$TIER" = "scheduling" ]; then \
		echo "[INFO] Scheduling tier uses KWOK, not this harness."; \
		echo "[INFO] Run: make kwok-e2e RECIPE=<recipe-name>"; \
		echo "[INFO] No test was executed. Exiting with code 2."; \
		exit 2; \
	fi; \
	do_cleanup() { \
		if [ "$${KEEP_CLUSTER:-false}" != "true" ]; then \
			COMPONENT="$(COMPONENT)" bash tools/component-test/cleanup.sh || true; \
		fi; \
	}; \
	trap do_cleanup EXIT; \
	TIER="$$TIER" bash tools/component-test/ensure-cluster.sh; \
	if [ "$$TIER" = "gpu-aware" ]; then \
		GPU_PROFILE=$${GPU_PROFILE:-} GPU_COUNT=$${GPU_COUNT:-} bash tools/component-test/setup-gpu-mock.sh; \
	fi; \
	COMPONENT="$(COMPONENT)" HELM_NAMESPACE=$${HELM_NAMESPACE:-} bash tools/component-test/deploy-component.sh; \
	COMPONENT="$(COMPONENT)" bash tools/component-test/run-health-check.sh

.PHONY: component-detect
# Print whatever tools/component-test/detect-tier.sh reports for COMPONENT.
# When COMPONENT is not set, print an error plus usage line and fail with
# exit status 1 before the script is invoked.
component-detect: ## Show detected test tier for a component (COMPONENT=cert-manager)
ifndef COMPONENT
	@printf '%s\n' \
		"Error: COMPONENT is required" \
		"Usage: make component-detect COMPONENT=cert-manager"; \
	exit 1
endif
	@bash tools/component-test/detect-tier.sh $(COMPONENT)

.PHONY: component-cluster
# Runs tools/component-test/ensure-cluster.sh with TIER taken from the recipe
# shell's environment, falling back to "deploy" when TIER is unset or empty.
component-cluster: ## Create or reuse the component test Kind cluster
@TIER=$${TIER:-deploy} bash tools/component-test/ensure-cluster.sh

.PHONY: component-deploy
# Deploy one component through tools/component-test/deploy-component.sh.
#
#   COMPONENT       (required) component name, exported to the script.
#   HELM_NAMESPACE  (optional) forwarded from the environment; empty if unset.
#
# Depends on `build`. This target does not call ensure-cluster.sh itself.
# NOTE(review): presumably a cluster from `make component-cluster` must
# already exist — confirm against deploy-component.sh.
component-deploy: build ## Deploy a single component (COMPONENT=cert-manager)
ifndef COMPONENT
	@echo "Error: COMPONENT is required"
	@echo "Usage: make component-deploy COMPONENT=cert-manager"
	@exit 1
endif
	@COMPONENT="$(COMPONENT)" HELM_NAMESPACE=$${HELM_NAMESPACE:-} bash tools/component-test/deploy-component.sh

.PHONY: component-health
# Run tools/component-test/run-health-check.sh for one component.
#
#   COMPONENT  (required) component name, exported to the script.
#
# Has no prerequisites: it neither builds nor deploys anything itself.
component-health: ## Run health check for a deployed component (COMPONENT=cert-manager)
ifndef COMPONENT
	@echo "Error: COMPONENT is required"
	@echo "Usage: make component-health COMPONENT=cert-manager"
	@exit 1
endif
	@COMPONENT="$(COMPONENT)" bash tools/component-test/run-health-check.sh

.PHONY: component-cleanup
# Runs tools/component-test/cleanup.sh. All three inputs are read from the
# recipe shell's environment and are optional: COMPONENT defaults to empty,
# DELETE_CLUSTER and KEEP_CLUSTER default to "false". Unlike the other
# component-* targets there is no ifndef guard on COMPONENT here.
component-cleanup: ## Clean up component test resources (COMPONENT=cert-manager [DELETE_CLUSTER=true])
@COMPONENT=$${COMPONENT:-} DELETE_CLUSTER=$${DELETE_CLUSTER:-false} KEEP_CLUSTER=$${KEEP_CLUSTER:-false} bash tools/component-test/cleanup.sh

# =============================================================================
# Combined Development Targets
# =============================================================================
Expand Down
69 changes: 69 additions & 0 deletions pkg/bundler/deployer/helm/helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1378,6 +1378,75 @@ func TestGenerateDeployScript(t *testing.T) {
}
}

// TestGenerateDeployScript_EmptyVersionOmitsFlag checks that a chart component
// with an empty Version produces a deploy.sh that contains no "--version"
// text at all, while still containing the helm upgrade --install command for
// that component.
func TestGenerateDeployScript_EmptyVersionOmitsFlag(t *testing.T) {
	gen := NewGenerator()
	outDir := t.TempDir()

	// A single chart-backed component whose version is intentionally blank.
	unpinned := ComponentData{
		Name:       "gpu-operator",
		Namespace:  "gpu-operator",
		Repository: "https://helm.ngc.nvidia.com/nvidia",
		ChartName:  "gpu-operator",
		Version:    "",
		HasChart:   true,
	}

	scriptPath, _, err := gen.generateDeployScript(
		context.Background(),
		&GeneratorInput{Version: "v1.0.0"},
		[]ComponentData{unpinned},
		outDir,
	)
	if err != nil {
		t.Fatalf("generateDeployScript failed: %v", err)
	}

	raw, err := os.ReadFile(scriptPath)
	if err != nil {
		t.Fatalf("reading deploy.sh: %v", err)
	}
	script := string(raw)

	// The flag must be absent everywhere in the script, not just for this component.
	if hasVersionFlag := strings.Contains(script, "--version"); hasVersionFlag {
		t.Errorf("deploy.sh should not contain --version when Version is empty, got:\n%s", script)
	}

	if hasInstall := strings.Contains(script, "helm upgrade --install gpu-operator gpu-operator"); !hasInstall {
		t.Errorf("deploy.sh should contain helm install command for gpu-operator")
	}
}

// TestGenerateDeployScript_WithVersionIncludesFlag checks that a chart
// component with Version "v1.17.2" produces a deploy.sh containing the text
// "--version v1.17.2".
func TestGenerateDeployScript_WithVersionIncludesFlag(t *testing.T) {
	gen := NewGenerator()
	outDir := t.TempDir()

	// A single chart-backed component pinned to an explicit chart version.
	pinned := ComponentData{
		Name:       "cert-manager",
		Namespace:  "cert-manager",
		Repository: "https://charts.jetstack.io",
		ChartName:  "cert-manager",
		Version:    "v1.17.2",
		HasChart:   true,
	}

	scriptPath, _, err := gen.generateDeployScript(
		context.Background(),
		&GeneratorInput{Version: "v1.0.0"},
		[]ComponentData{pinned},
		outDir,
	)
	if err != nil {
		t.Fatalf("generateDeployScript failed: %v", err)
	}

	raw, err := os.ReadFile(scriptPath)
	if err != nil {
		t.Fatalf("reading deploy.sh: %v", err)
	}
	script := string(raw)

	if hasPinnedFlag := strings.Contains(script, "--version v1.17.2"); !hasPinnedFlag {
		t.Errorf("deploy.sh should contain --version v1.17.2, got:\n%s", script)
	}
}

func TestGenerateUndeployScript(t *testing.T) {
tests := []struct {
name string
Expand Down
6 changes: 4 additions & 2 deletions pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,8 @@ fi
{{ if .IsOCI -}}
retry "{{ .Name }} helm install" \
helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
--version {{ .Version }} \
{{ if .Version }}--version {{ .Version }} \
{{ end -}}
-n {{ .Namespace }} --create-namespace \
-f "${SCRIPT_DIR}/{{ .Name }}/values.yaml" \
${COMPONENT_WAIT_ARGS} \
Expand All @@ -291,7 +292,8 @@ retry "{{ .Name }} helm install" \
retry "{{ .Name }} helm install" \
helm upgrade --install {{ .Name }} {{ .ChartName }} \
--repo {{ .Repository }} \
--version {{ .Version }} \
{{ if .Version }}--version {{ .Version }} \
{{ end -}}
-n {{ .Namespace }} --create-namespace \
-f "${SCRIPT_DIR}/{{ .Name }}/values.yaml" \
${COMPONENT_WAIT_ARGS} \
Expand Down
2 changes: 2 additions & 0 deletions recipes/registry.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
# defaultSource: Git repository or OCI reference
# defaultPath: Path within the repository to the kustomization
# defaultTag: Git tag, branch, or commit
# testTier: Optional override for component test tier detection
# (scheduling, deploy, or gpu-aware). Used by tools/component-test/.
# nodeScheduling: Paths in Helm values where node selectors/tolerations are injected
#
# Note: A component must have either 'helm' OR 'kustomize' configuration, not both.
Expand Down
Loading
Loading