diff --git a/.coderabbit.yaml b/.coderabbit.yaml index 0dc708981..2188db328 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -3,3 +3,6 @@ reviews: profile: chill # assertive / chill profile auto_review: enabled: true # Enable auto-review for this repository + path_instructions: + - path: "src/**" + instructions: "If this PR adds, removes, or renames a service, module, or major component, check that AGENTS.md is updated accordingly." diff --git a/AGENTS.md b/AGENTS.md index acc1bcf81..521a3dece 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,6 +6,22 @@ This file provides guidance to AI agents when working with the OSMO codebase. OSMO is a workflow orchestration platform for Physical AI, managing heterogeneous Kubernetes clusters for training, simulation, and edge compute workloads. +## Workflow Requirements + +Before making any code changes in this repo, you MUST: + +1. **Explore first**: Use the Codebase Structure section below to orient yourself, then read relevant source files before proposing changes. Read existing implementations, tests, and related modules. Never modify code you haven't read. +2. **Plan before implementing**: For any non-trivial change (more than a simple one-line fix), create an explicit plan that identifies: + - Which files need to change and why + - How the change fits with existing patterns in the codebase + - What tests exist and what new tests are needed + - Any cross-cutting concerns (e.g., auth, storage backends, IPC protocols) + - A verification plan: how to confirm the change works (e.g., specific tests to run, build commands, manual checks) +3. **Check for downstream impact**: This is a multi-service platform — changes in shared libraries (`lib/`, `utils/`) can affect multiple services. Grep for usages before modifying shared code. +4. **Verify after implementation**: After completing changes, execute the verification plan — run the relevant tests/builds and confirm they pass before claiming the work is done. Never assert success without evidence. +5. **Simplify before committing**: Review your changes for unnecessary complexity, redundancy, and over-engineering before committing. Prefer the simplest solution that meets the requirements. +6. **Update documentation**: If adding, removing, or renaming a service, module, or major component, update the "Codebase Structure" section in this file as part of the same change. + ## Team Guidelines - Follow existing code patterns and conventions in the codebase @@ -15,15 +31,10 @@ OSMO is a workflow orchestration platform for Physical AI, managing heterogeneou - Copyright headers must keep "All rights reserved." on the same line as "NVIDIA CORPORATION & AFFILIATES" - If copyright lines exceed 100 characters, add `# pylint: disable=line-too-long` comment instead of breaking into multiple lines -## Tool Usage Preferences - -- Use specialized tools (Read, Edit, Write, Grep, Glob) instead of Bash commands whenever possible -- Bash tools require user intervention to allow and should only be used as a last resort -- Prefer Read over cat, Edit over sed, Write over echo/heredoc, Grep over grep, and Glob over find - -## Coding Standards +## Python Coding Standards ### Import Statements + - All imports must be at the top level of the module - Place all imports at the top of the file after the module docstring - **No exceptions**: Imports inside functions are not allowed @@ -35,12 +46,14 @@ OSMO is a workflow orchestration platform for Physical AI, managing heterogeneou - Use late binding or forward references for type hints (PEP 563) ### Variable Naming + - Do not use abbreviations in variable names unless they are well-understood abbreviations or common conventions - **Good**: `topology_key`, `config`, `i` (iterator), `x`, `y`, `z` (coordinates) - **Bad**: `tk` (for topology_key), `topo` (for topology), `req` (for requirement) - Use full, descriptive names that make code self-documenting ### Type Annotations and Data Structures + - **Use strict typing**: Add type annotations where they improve code clarity and catch errors - **Prefer dataclasses over dictionaries**: When passing structured data with multiple fields, use dataclasses instead of `Dict[str, Any]` - **Good**: `@dataclasses.dataclass class TaskTopology: name: str; requirements: List[...]` @@ -55,8 +68,178 @@ OSMO is a workflow orchestration platform for Physical AI, managing heterogeneou - **Bad**: `def process(items: List[str] = []) -> None:` - all callers share the same list instance! ### Assertions + - **Do not use `assert` statements in production code** - only in unit tests - **Reason**: Assertions can be disabled with Python's `-O` flag and should not be relied upon for runtime validation - **Use proper error handling instead**: Raise appropriate exceptions (ValueError, TypeError, etc.) for validation - **Good**: `if value is None: raise ValueError("Value cannot be None")` - **Bad**: `assert value is not None, "Value cannot be None"` + +## Codebase Structure (`src/`) + +All paths below are relative to `src/`. + +### Core Service (`service/core/`) — Main FastAPI Microservice + +Entry point: `service/core/service.py`. Framework: FastAPI + Uvicorn + OpenTelemetry. + +| Submodule | Purpose | +|-----------|---------| +| `auth/` | JWT token lifecycle, access token CRUD, user management, role assignment | +| `workflow/` | Workflow submit/list/cancel, resource quota, pool allocation, task coordination, credential management | +| `config/` | Service/workflow/dataset configuration CRUD with versioning and history. Pod templates, resource validation rules, pool/backend config | +| `data/` | Dataset/collection management, versioning with tags, multi-backend storage, streaming downloads | +| `app/` | Workflow app lifecycle (create, version, rename, delete), YAML spec validation | +| `profile/` | User profile/preferences, token identity, role/pool visibility | + +**Error types**: Defined in `lib/utils/` — see the `OSMOError` hierarchy for the full list. + +### Supporting Services + +| Service | Purpose | +|---------|---------| +| `service/router/` | Routes HTTP/WebSocket requests to backends. Sticky session routing. WebSocket endpoints for exec, portforward, rsync. | +| `service/worker/` | Kombu-based Redis job queue consumer. Deduplicates jobs. Executes `FrontendJob` subclasses. | +| `service/agent/` | Backend cluster integration via WebSocket. Receives node/pod/event/heartbeat streams from K8s clusters. | +| `service/logger/` | Receives structured logs from osmo-ctrl containers. Persists task metrics to PostgreSQL. Distributed barriers via Redis. | +| `service/delayed_job_monitor/` | Polls Redis for scheduled jobs, promotes to main queue when ready. | + +### Python Libraries (`lib/`) + +| Library | Key Classes | Purpose | +|---------|-------------|---------| +| `lib/data/storage/` | `Client`, `StorageBackend`, `ExecutorParameters`, `StoragePath` | Multi-cloud storage SDK (S3, Azure, GCS, Swift, TOS). Parallel multiprocess+multithread executor. Streaming upload/download. | +| `lib/data/dataset/` | `Manager` | Dataset lifecycle (upload, download, migrate) built on storage SDK. | +| `lib/utils/` | `LoginManager`, `ServiceClient`, `OSMOError` hierarchy | Client SDK for HTTP/WebSocket requests with JWT auth. Error types, logging, validation, credential management. | +| `lib/rsync/` | `RsyncClient` | File watch-based rsync with debounce/reconciliation. Port forwarding for remote access. | + +### Python Utilities (`utils/`) + +| Module | Key Classes | Purpose | +|--------|-------------|---------| +| `utils/job/` | `Task`, `FrontendJob`, `K8sObjectFactory`, `PodGroupTopologyBuilder` | Workflow execution framework. Task → K8s spec generation. Gang scheduling via PodGroup. Topology constraints. Backend job definitions. | +| `utils/connectors/` | `ClusterConnector`, `PostgresConnector`, `RedisConnector` | K8s API wrapper, PostgreSQL operations, Redis job queue management. | +| `utils/secret_manager/` | `SecretManager` | JWE-based secret encryption/decryption. MEK/UEK key management. | +| `utils/progress_check/` | — | Liveness/progress tracking for long-running services. | +| `utils/metrics/` | — | Prometheus metrics collection and export. | + +### CLI (`cli/`) + +Entry point: `cli.py` → `main_parser.py` (argparse). Subcommand modules: + + +| Module | Commands | +| -------------------------------------------------------------------------------------------------------------- | -------------------------------- | +| `workflow.py` | submit, list, cancel, exec, logs | +| `data.py` | upload, download, list, delete | +| `dataset.py` | Dataset management | +| `app.py` | App submission/management | +| `config.py` | Service configuration | +| `profile.py` | User profiles | +| `login.py` | Authentication | +| `pool.py`, `resources.py`, `user.py`, `credential.py`, `access_token.py`, `bucket.py`, `task.py`, `version.py` | Supporting commands | +| `backend.py` | Backend cluster management | + +Features: Tab completion (shtab), response formatting (`formatters.py`), spec editor (`editor.py`), PyInstaller packaging (`cli_builder.py`, `packaging/`). + +### Go Runtime Containers (`runtime/`) + +| Binary | Purpose | +|--------|---------| +| `runtime/cmd/ctrl/` | **osmo_ctrl** — Orchestrates workflow execution. WebSocket to workflow service. Unix socket to osmo_user. Manages data download/upload, barriers for multi-task sync, port forwarding. | +| `runtime/cmd/user/` | **osmo_user** — Executes user commands with PTY. Streams stdout/stderr to ctrl. Handles checkpointing (periodic uploads). | +| `runtime/cmd/rsync/` | **osmo_rsync** — Rsync daemon with bandwidth limiting. | + +### Go Runtime Packages (`runtime/pkg/`) + +| Package | Purpose | +|---------|---------| +| `args/` | CLI flag parsing for ctrl and user containers. | +| `messages/` | IPC message protocol between containers (exec lifecycle, log streaming, barriers). | +| `common/` | Shared utilities: command execution, file operations, circular buffer. | +| `data/` | Input/output data handling. Storage backend abstraction (S3, Swift, GCS, TOS). Mount/download/upload with retry and checkpointing. | +| `metrics/` | Execution timing and data transfer metrics collection. | +| `osmo_errors/` | Error handling with categorized exit codes and termination logging. | +| `rsync/` | Rsync daemon subprocess management with monitoring. | + +### Go Utilities (`utils/` — Go) + +| Package | Purpose | +|---------|---------| +| `roles/` | Semantic RBAC. Actions like `workflow:Create`, `dataset:Read`. LRU cache with TTL. Role sync from IDP. Pool access evaluation. | +| `postgres/` | PostgreSQL client with pgx connection pool and pgroll schema version support. | +| `redis/` | Redis client with optional TLS. | +| `logging/` | Structured slog handler compatible with Fluent Bit parsers. | +| `env.go` | Environment variable helpers with YAML config file fallback. | + +### Authorization Sidecar (`service/authz_sidecar/`) — Go gRPC + +- Implements external authorization for the API gateway +- Flow: Extract user/roles from request headers → sync roles from IDP → resolve role policies from cache/DB → evaluate semantic RBAC → return allow/deny with `x-osmo-user`, `x-osmo-roles`, `x-osmo-allowed-pools` headers + +### Frontend (`ui/`) + +- **Framework**: Next.js (App Router, Turbopack) + React + TypeScript (see `package.json` for versions) +- **Styling**: Tailwind CSS + shadcn/ui +- **State**: TanStack Query (data fetching), Zustand (UI state), nuqs (URL state) +- **Testing**: Vitest (unit), Playwright (E2E), MSW (API mocking) +- **API layer**: OpenAPI-generated types (`lib/api/generated.ts` — DO NOT EDIT) + adapter layer (`lib/api/adapter/`) that bridges backend quirks to UI expectations +- **Key routes**: pools, resources, workflows, datasets, occupancy, profile, log-viewer (under `app/(dashboard)/`) +- **Import rules**: Absolute imports only (`@/...`), no barrel exports, API types from adapter (not generated) + +### Operator (`operator/`) + +- `backend_listener.py` — WebSocket listener for backend cluster status +- `backend_worker.py` — Job execution engine for backend tasks +- `backend_test_runner/` — Test orchestration for backend validation +- `utils/node_validation_test/` — GPU validation (nvidia-smi, tflops benchmark, stuck pod detection) +### Tests + + +| Location | Framework | Scope | +| ------------------------------------------- | ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tests/common/` | pytest + testcontainers | Shared fixtures: PostgreSQL (`database/`), S3/Swift/Redis storage (`storage/`), Docker network/registry (`core/`, `registry/`), Envoy TLS proxy (`envoy/`) | +| `tests/common/database/testdata/schema.sql` | — | Database schema definition (source of truth for test DB) | +| `runtime/pkg/*_test.go` | Go testing + testcontainers-go | Runtime package unit/integration tests | +| `utils/*_test.go` | Go testing | Go utility tests (roles, postgres, redis) | +| `ui/src/**/*.test.ts` | Vitest | Frontend unit tests | +| `ui/e2e/` | Playwright | Frontend E2E tests with page object models | + +### Key Architecture Patterns + +- **Container runtime**: Three container types per workflow — ctrl (orchestrator), user (execution), data (rsync sidecar) +- **IPC**: WebSocket (ctrl↔workflow service), Unix sockets (ctrl↔user), gRPC (authz sidecar) +- **Auth**: API gateway → authz_sidecar (semantic RBAC with `x-osmo-user`, `x-osmo-roles`, `x-osmo-allowed-pools` headers) +- **Storage**: Multi-cloud abstraction (S3/Azure/GCS/Swift/TOS) with parallel multiprocess+multithread transfer +- **Job queue**: Redis-backed Kombu queue with deduplication. Delayed jobs via Redis ZSET. +- **Databases**: PostgreSQL (pgx), Redis (caching + job queue + event streams + barriers), pgroll for schema versioning +- **Monitoring**: Prometheus + Grafana + Loki. OpenTelemetry instrumentation on FastAPI. + +### Inter-Service Communication + +``` +Client → API Gateway → authz_sidecar (gRPC Check) → Core Service (FastAPI) + ├── PostgreSQL (state) + ├── Redis (cache, job queue, events) + ├── → Worker (job consumer) + ├── ↔ Agent (WebSocket backend events) + ├── ↔ Logger (WebSocket log streaming) + ├── → Router (HTTP/WS request routing) + └── → Delayed Job Monitor (scheduled jobs) + +Workflow Execution: + Core Service → K8s Backend → [osmo_ctrl ↔ osmo_user ↔ osmo_rsync] + osmo_ctrl ↔ Core Service (WebSocket) + osmo_ctrl → Logger (WebSocket logs/metrics) +``` + +### Build & Test + +- **Build system**: Bazel (`MODULE.bazel`, `.bazelrc`) — check `MODULE.bazel` for current version +- **Python**: ruff linter (`.ruff.toml`, Google style) — check `MODULE.bazel` for Python version +- **Go**: module `go.corp.nvidia.com/osmo` (single `go.mod` at `src/`) — check `go.mod` for Go version +- **Frontend**: Next.js + pnpm, TypeScript strict mode, ESLint + Prettier — check `ui/package.json` for versions +- **Tests**: Bazel test rules, pytest + testcontainers (Python), testcontainers-go (Go), Vitest + Playwright (frontend) +- **Container images**: Built via `rules_oci` (amd64, arm64), distroless base from NVIDIA NGC +- **API spec**: OpenAPI auto-generated from FastAPI via `bazel run //src/scripts:export_openapi` + diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..9319e0556 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,7 @@ +@AGENTS.md + +## Tool Usage Preferences + +- Use specialized tools (Read, Edit, Write, Grep, Glob) instead of Bash commands whenever possible +- Bash tools require user intervention to allow and should only be used as a last resort +- Prefer Read over cat, Edit over sed, Write over echo/heredoc, Grep over grep, and Glob over find diff --git a/README.md b/README.md index fac48bfc2..7f636d66b 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,7 @@ Select one of the deployment options below depending on your needs and environme +**Deploying on Microsoft Azure?** Get started with [Azure NVIDIA Reference Architecture](https://github.com/microsoft/physical-ai-toolchain). This jointly published architecture delivers a production-ready Physical AI pipeline on Microsoft Azure, integrating OSMO, Isaac Lab, and Isaac Sim with GPU-accelerated RL training, auto-scaling compute, and enterprise-grade Kubernetes security. ## Documentation diff --git a/bzl/py.bzl b/bzl/py.bzl index 05cbf1cb7..e5371c434 100644 --- a/bzl/py.bzl +++ b/bzl/py.bzl @@ -139,10 +139,14 @@ import glob pythonpath = ["{runfiles_dir}/_main"] local_runfiles_dir = "{runfiles_dir}" local_main_dir = "{runfiles_dir}/_main" + if os.path.isdir("/osmo_workspace+"): local_runfiles_dir = "/osmo_workspace+" + local_runfiles_dir local_main_dir = local_runfiles_dir + "/osmo_workspace+" - pythonpath.append(local_runfiles_dir + "/osmo_workspace+") + +osmo_ws_path = local_runfiles_dir + "/osmo_workspace+" +if os.path.isdir(osmo_ws_path): + pythonpath.append(osmo_ws_path) # Add all site-packages directories site_packages = glob.glob(local_runfiles_dir + "/rules_python++pip+*/site-packages") diff --git a/deployments/charts/router/templates/_sidecar-helpers.tpl b/deployments/charts/router/templates/_sidecar-helpers.tpl index e7982b8fc..b39b99c39 100644 --- a/deployments/charts/router/templates/_sidecar-helpers.tpl +++ b/deployments/charts/router/templates/_sidecar-helpers.tpl @@ -119,7 +119,7 @@ OAuth2 Proxy sidecar container ports: - name: http containerPort: {{ .Values.sidecars.oauth2Proxy.httpPort }} - - name: metrics + - name: oauth2-metrics containerPort: {{ .Values.sidecars.oauth2Proxy.metricsPort }} livenessProbe: httpGet: diff --git a/deployments/charts/service/templates/_sidecar-helpers.tpl b/deployments/charts/service/templates/_sidecar-helpers.tpl index 8fa9058ef..809293cc9 100644 --- a/deployments/charts/service/templates/_sidecar-helpers.tpl +++ b/deployments/charts/service/templates/_sidecar-helpers.tpl @@ -245,7 +245,7 @@ OAuth2 Proxy sidecar container ports: - name: http containerPort: {{ .Values.sidecars.oauth2Proxy.httpPort }} - - name: metrics + - name: oauth2-metrics containerPort: {{ .Values.sidecars.oauth2Proxy.metricsPort }} livenessProbe: httpGet: diff --git a/deployments/charts/service/templates/api-monitor.yaml b/deployments/charts/service/templates/api-monitor.yaml index 8a09218dd..a4f906a16 100644 --- a/deployments/charts/service/templates/api-monitor.yaml +++ b/deployments/charts/service/templates/api-monitor.yaml @@ -28,6 +28,11 @@ spec: port: envoy-admin path: /stats/prometheus {{- end }} + {{- if .Values.sidecars.oauth2Proxy.enabled }} + - interval: 15s + port: oauth2-metrics + path: /metrics + {{- end }} selector: matchExpressions: - { key: app, diff --git a/deployments/charts/web-ui/templates/_sidecar-helpers.tpl b/deployments/charts/web-ui/templates/_sidecar-helpers.tpl index fedeefc9c..10dd50c27 100644 --- a/deployments/charts/web-ui/templates/_sidecar-helpers.tpl +++ b/deployments/charts/web-ui/templates/_sidecar-helpers.tpl @@ -169,7 +169,7 @@ OAuth2 Proxy sidecar container ports: - name: http containerPort: {{ .Values.sidecars.oauth2Proxy.httpPort }} - - name: metrics + - name: oauth2-metrics containerPort: {{ .Values.sidecars.oauth2Proxy.metricsPort }} livenessProbe: httpGet: diff --git a/deployments/scripts/README.md b/deployments/scripts/README.md index a70c9b366..49e87dbc1 100644 --- a/deployments/scripts/README.md +++ b/deployments/scripts/README.md @@ -74,6 +74,7 @@ The main entry point for deploying OSMO. This script orchestrates: | `--destroy` | Destroy all resources | | `--dry-run` | Show what would be done without making changes | | `--non-interactive` | Fail if required parameters are missing (for CI/CD) | +| `--ngc-api-key` | NGC API key for pulling images and Helm charts from `nvcr.io` (optional) | | `-h, --help` | Show help message | #### Azure-specific Options @@ -221,11 +222,39 @@ The script will prompt for: --aws-region "us-west-2" \ --cluster-name "osmo-aws" \ --postgres-password "SecurePass123!" \ + --redis-password "SecureRedisToken123!" \ --non-interactive ``` > **Note:** Keep cluster names short (≤12 characters) to avoid AWS IAM role name length limits. +### Deployment with NGC Registry Credentials + +Required when pulling OSMO images and Helm charts from a private registry in `nvcr.io`. + +```bash +# Via flag +./deploy-osmo-minimal.sh --provider aws \ + --aws-region "us-west-2" \ + --cluster-name "osmo-aws" \ + --postgres-password "SecurePass123!" \ + --redis-password "SecureRedisToken123!" \ + --ngc-api-key "$NGC_API_KEY" + +# Via environment variable +export NGC_API_KEY="your-ngc-api-key" +./deploy-osmo-minimal.sh --provider aws \ + --aws-region "us-west-2" \ + --cluster-name "osmo-aws" \ + --postgres-password "SecurePass123!" \ + --redis-password "SecureRedisToken123!" +``` + +When an NGC API key is provided, the script: +1. Authenticates `helm repo add` with `--username='$oauthtoken' --password=` +2. Creates a `nvcr-secret` docker-registry secret in all three namespaces +3. Configures all Helm charts to use `nvcr-secret` as the image pull secret + ## Environment Variables | Variable | Description | Default | @@ -233,6 +262,7 @@ The script will prompt for: | `OSMO_IMAGE_REGISTRY` | OSMO Docker image registry | `nvcr.io/nvidia/osmo` | | `OSMO_IMAGE_TAG` | OSMO Docker image tag | `latest` | | `BACKEND_TOKEN_EXPIRY` | Backend operator token expiry | `2027-01-01` | +| `NGC_API_KEY` | NGC API key for `nvcr.io` image and Helm chart pulls | - | | `TF_SUBSCRIPTION_ID` | Azure subscription ID | - | | `TF_RESOURCE_GROUP` | Azure resource group | - | | `TF_POSTGRES_PASSWORD` | PostgreSQL password | - | diff --git a/deployments/scripts/aws/terraform.sh b/deployments/scripts/aws/terraform.sh index 3a3c68e2d..9f46e4345 100755 --- a/deployments/scripts/aws/terraform.sh +++ b/deployments/scripts/aws/terraform.sh @@ -204,7 +204,7 @@ node_group_max_size = 5 node_group_desired_size = 3 # RDS Configuration -rds_engine_version = "15.4" +rds_engine_version = "15.12" rds_instance_class = "db.t3.medium" rds_db_name = "osmo" rds_username = "postgres" diff --git a/deployments/scripts/deploy-k8s.sh b/deployments/scripts/deploy-k8s.sh index c5b71b074..0c52db26b 100755 --- a/deployments/scripts/deploy-k8s.sh +++ b/deployments/scripts/deploy-k8s.sh @@ -53,6 +53,8 @@ OSMO_WORKFLOWS_NAMESPACE="${OSMO_WORKFLOWS_NAMESPACE:-osmo-workflows}" OSMO_IMAGE_REGISTRY="${OSMO_IMAGE_REGISTRY:-nvcr.io/nvidia/osmo}" OSMO_IMAGE_TAG="${OSMO_IMAGE_TAG:-latest}" BACKEND_TOKEN_EXPIRY="${BACKEND_TOKEN_EXPIRY:-2027-01-01}" +NGC_API_KEY="${NGC_API_KEY:-}" +NGC_SECRET_NAME="nvcr-secret" # Provider-specific settings (set by loading provider script) PROVIDER="" @@ -94,6 +96,10 @@ parse_k8s_args() { DRY_RUN=true shift ;; + --ngc-api-key) + NGC_API_KEY="$2" + shift 2 + ;; *) shift ;; @@ -286,6 +292,34 @@ data: log_success "Secrets created" } +create_image_pull_secrets() { + if [[ -z "$NGC_API_KEY" ]]; then + log_info "NGC_API_KEY not set, skipping image pull secret creation" + return + fi + + log_info "Creating NGC image pull secrets..." + + if [[ "$DRY_RUN" == true ]]; then + log_info "[DRY-RUN] Would create $NGC_SECRET_NAME in namespaces: $OSMO_NAMESPACE, $OSMO_OPERATOR_NAMESPACE, $OSMO_WORKFLOWS_NAMESPACE" + return + fi + + for namespace in "$OSMO_NAMESPACE" "$OSMO_OPERATOR_NAMESPACE" "$OSMO_WORKFLOWS_NAMESPACE"; do + local secret_yaml + secret_yaml=$(kubectl create secret docker-registry "$NGC_SECRET_NAME" \ + --docker-server=nvcr.io \ + --docker-username='$oauthtoken' \ + --docker-password="$NGC_API_KEY" \ + --namespace "$namespace" \ + --dry-run=client -o yaml) + $RUN_KUBECTL_APPLY_STDIN "$secret_yaml" + log_info " Applied $NGC_SECRET_NAME in namespace $namespace" + done + + log_success "NGC image pull secrets created" +} + ############################################################################### # Helm Functions ############################################################################### @@ -301,7 +335,12 @@ add_helm_repos() { if [[ "$IS_PRIVATE_CLUSTER" == "true" ]]; then $RUN_HELM "repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo && helm repo update" else - helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo || true + if [[ -n "$NGC_API_KEY" ]]; then + helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo \ + --username='$oauthtoken' --password="$NGC_API_KEY" || true + else + helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo || true + fi helm repo update fi @@ -311,12 +350,18 @@ add_helm_repos() { create_helm_values() { log_info "Creating OSMO Helm values files..." + local ngc_pull_secret_yaml="" + if [[ -n "$NGC_API_KEY" ]]; then + ngc_pull_secret_yaml=" imagePullSecret: ${NGC_SECRET_NAME}" + fi + # Service values cat > "$VALUES_DIR/service_values.yaml" </dev/null | jq -r '.token' || echo "") if [[ -n "$backend_token" && "$backend_token" != "null" ]]; then - kubectl create secret generic osmo-operator-token \ + local token_secret_yaml + token_secret_yaml=$(kubectl create secret generic osmo-operator-token \ --from-literal=token="$backend_token" \ --namespace "$OSMO_OPERATOR_NAMESPACE" \ - --dry-run=client -o yaml | kubectl apply -f - + --dry-run=client -o yaml) + $RUN_KUBECTL_APPLY_STDIN "$token_secret_yaml" log_success "Backend token created" token_created=true @@ -585,6 +635,10 @@ cleanup_osmo() { $RUN_HELM "uninstall router-minimal --namespace $OSMO_NAMESPACE" 2>/dev/null || true $RUN_HELM "uninstall osmo-operator --namespace $OSMO_OPERATOR_NAMESPACE" 2>/dev/null || true + for namespace in "$OSMO_NAMESPACE" "$OSMO_OPERATOR_NAMESPACE" "$OSMO_WORKFLOWS_NAMESPACE"; do + $RUN_KUBECTL "delete secret $NGC_SECRET_NAME --namespace $namespace --ignore-not-found=true" 2>/dev/null || true + done + $RUN_KUBECTL "delete namespace $OSMO_NAMESPACE" 2>/dev/null || true $RUN_KUBECTL "delete namespace $OSMO_OPERATOR_NAMESPACE" 2>/dev/null || true $RUN_KUBECTL "delete namespace $OSMO_WORKFLOWS_NAMESPACE" 2>/dev/null || true @@ -667,16 +721,17 @@ deploy_k8s_main() { add_helm_repos create_database create_secrets + create_image_pull_secrets create_helm_values deploy_osmo_service deploy_osmo_ui deploy_osmo_router - wait_for_pods "$OSMO_NAMESPACE" 300 "" "kubectl" + wait_for_pods "$OSMO_NAMESPACE" 300 "" "$RUN_KUBECTL" setup_backend_operator - wait_for_pods "$OSMO_OPERATOR_NAMESPACE" 180 "" "kubectl" + wait_for_pods "$OSMO_OPERATOR_NAMESPACE" 180 "" "$RUN_KUBECTL" verify_deployment print_access_instructions diff --git a/deployments/scripts/deploy-osmo-minimal.sh b/deployments/scripts/deploy-osmo-minimal.sh index ea6a228f1..acb788d94 100755 --- a/deployments/scripts/deploy-osmo-minimal.sh +++ b/deployments/scripts/deploy-osmo-minimal.sh @@ -76,6 +76,9 @@ SKIP_OSMO=false DESTROY=false DRY_RUN=false NON_INTERACTIVE=false +NGC_API_KEY="${NGC_API_KEY:-}" +TF_POSTGRES_PASSWORD="${TF_POSTGRES_PASSWORD:-}" +TF_REDIS_PASSWORD="${TF_REDIS_PASSWORD:-}" # Output files OUTPUTS_FILE="" @@ -103,6 +106,7 @@ General Options: --destroy Destroy all resources --dry-run Show what would be done without making changes --non-interactive Fail if required parameters are missing (for CI/CD) + --ngc-api-key KEY NGC API key for pulling images and Helm charts from nvcr.io -h, --help Show this help message Azure-specific Options: @@ -125,6 +129,7 @@ Environment Variables: OSMO_IMAGE_REGISTRY OSMO image registry (default: nvcr.io/nvidia/osmo) OSMO_IMAGE_TAG OSMO image tag (default: latest) BACKEND_TOKEN_EXPIRY Backend token expiry date (default: 2027-01-01) + NGC_API_KEY NGC API key (alternative to --ngc-api-key flag) Examples: # Interactive Azure deployment @@ -185,14 +190,33 @@ while [[ $# -gt 0 ]]; do NON_INTERACTIVE=true shift ;; + --ngc-api-key) + NGC_API_KEY="$2"; shift 2 ;; -h|--help) show_help exit 0 ;; - # Pass through other arguments (handled by provider scripts) - --subscription-id|--resource-group|--postgres-password|--cluster-name|--region|--environment|--k8s-version|--aws-region|--aws-profile) - shift 2 - ;; + # Provider-specific arguments - set TF_ variables used by provider scripts + --subscription-id) + TF_SUBSCRIPTION_ID="$2"; shift 2 ;; + --resource-group) + TF_RESOURCE_GROUP="$2"; shift 2 ;; + --postgres-password) + TF_POSTGRES_PASSWORD="$2"; shift 2 ;; + --redis-password) + TF_REDIS_PASSWORD="$2"; shift 2 ;; + --cluster-name) + TF_CLUSTER_NAME="$2"; shift 2 ;; + --region) + TF_REGION="$2"; shift 2 ;; + --aws-region) + TF_AWS_REGION="$2"; shift 2 ;; + --aws-profile) + TF_AWS_PROFILE="$2"; shift 2 ;; + --environment) + TF_ENVIRONMENT="$2"; shift 2 ;; + --k8s-version) + TF_K8S_VERSION="$2"; shift 2 ;; *) shift ;; @@ -344,7 +368,35 @@ verify_provider_config() { handle_configuration() { local tfvars_file="$TERRAFORM_DIR/terraform.tfvars" - if [[ ! -f "$tfvars_file" ]]; then + # Determine whether all required values were supplied via flags/env vars. + # If so, always regenerate tfvars so that flag values (e.g. a new postgres + # version or region) are never silently overridden by a stale file. + local has_all_required=false + case "$PROVIDER" in + azure) + if [[ -n "$TF_POSTGRES_PASSWORD" ]]; then + has_all_required=true + fi + ;; + aws) + if [[ -n "$TF_POSTGRES_PASSWORD" && -n "$TF_REDIS_PASSWORD" ]]; then + has_all_required=true + fi + ;; + esac + + if [[ "$has_all_required" == true ]]; then + # All required values provided — regenerate so flags always win. + # Skip interactive configuration entirely; go straight to tfvars generation. + case "$PROVIDER" in + azure) + azure_generate_tfvars "$tfvars_file" + ;; + aws) + aws_generate_tfvars "$tfvars_file" + ;; + esac + elif [[ ! -f "$tfvars_file" ]]; then log_info "terraform.tfvars not found." if [[ "$NON_INTERACTIVE" == true ]]; then @@ -418,16 +470,17 @@ deploy_osmo() { add_helm_repos create_database create_secrets + create_image_pull_secrets create_helm_values deploy_osmo_service deploy_osmo_ui deploy_osmo_router - wait_for_pods "$OSMO_NAMESPACE" 300 "" "kubectl" + wait_for_pods "$OSMO_NAMESPACE" 300 "" "$RUN_KUBECTL" setup_backend_operator - wait_for_pods "$OSMO_OPERATOR_NAMESPACE" 180 "" "kubectl" + wait_for_pods "$OSMO_OPERATOR_NAMESPACE" 180 "" "$RUN_KUBECTL" verify_deployment print_access_instructions @@ -503,6 +556,12 @@ main() { run_terraform_apply fi + # Dry-run ends after the plan - no state exists to query + if [[ "$DRY_RUN" == true ]]; then + log_success "Dry-run complete. No resources were created." + exit 0 + fi + # Get Terraform outputs get_terraform_outputs diff --git a/docs/deployment_guide/appendix/authentication/authentication_flow.rst b/docs/deployment_guide/appendix/authentication/authentication_flow.rst index 37a4360c0..533bb7587 100644 --- a/docs/deployment_guide/appendix/authentication/authentication_flow.rst +++ b/docs/deployment_guide/appendix/authentication/authentication_flow.rst @@ -132,7 +132,7 @@ Troubleshooting Verify IdP configuration (redirect URIs, client ID/secret), Envoy OAuth2 and JWT provider settings (issuer, audience, JWKS URI), and that the IdP is reachable from the cluster. **User has no permissions (403)** -Check that the user has roles in OSMO (via ``osmo user roles list `` or IdP mapping). Verify ``x-osmo-user`` and ``x-osmo-roles`` in Envoy logs. Ensure the role has policies that allow the requested action (see :doc:`roles_policies`). +Check that the user has roles in OSMO (via ``osmo user get `` or IdP mapping). Verify ``x-osmo-user`` and ``x-osmo-roles`` in Envoy logs. Ensure the role has policies that allow the requested action (see :doc:`roles_policies`). **Token validation failures** Ensure issuer and audience in Envoy match the JWT. Check JWKS URI connectivity from Envoy. For access tokens, ensure the token exists and is not expired. diff --git a/docs/deployment_guide/appendix/authentication/identity_provider_setup.rst b/docs/deployment_guide/appendix/authentication/identity_provider_setup.rst index 0838bb837..f121d4646 100644 --- a/docs/deployment_guide/appendix/authentication/identity_provider_setup.rst +++ b/docs/deployment_guide/appendix/authentication/identity_provider_setup.rst @@ -187,14 +187,14 @@ Verification ============ - **Browser:** Open ``https://`` in a private window. You should be redirected to the IdP, then back to OSMO with a session. -- **CLI:** After logging in, run ``osmo user roles list `` to confirm the user has the expected roles. +- **CLI:** After logging in, run ``osmo user get `` to confirm the user has the expected roles. Troubleshooting =============== - **Invalid token / 401:** Check issuer and audience in Envoy match the JWT. Ensure the IdP’s JWKS URI is reachable from the cluster and the signing key is present. - **Redirect fails:** Ensure the redirect URI in the IdP exactly matches (scheme, host, path, no trailing slash). -- **User has no roles / 403:** Ensure the user exists in OSMO and has roles (via ``osmo user roles list `` or IdP mapping). Verify the user claim (e.g. ``preferred_username``, ``email``) matches what OSMO expects. +- **User has no roles / 403:** Ensure the user exists in OSMO and has roles (via ``osmo user get `` or IdP mapping). Verify the user claim (e.g. ``preferred_username``, ``email``) matches what OSMO expects. .. seealso:: diff --git a/docs/deployment_guide/appendix/authentication/roles_policies.rst b/docs/deployment_guide/appendix/authentication/roles_policies.rst index 1fc35541e..7fdd9e6ec 100644 --- a/docs/deployment_guide/appendix/authentication/roles_policies.rst +++ b/docs/deployment_guide/appendix/authentication/roles_policies.rst @@ -481,7 +481,7 @@ Pool Access Issues -------------------- 1. **Check role policies**: Ensure the role has a policy allowing ``workflow:Create`` scoped to the target pool (e.g., ``resources: ["pool/my-pool"]``) -2. **Check role assignment**: Ensure the user has the role in OSMO (via ``osmo user roles list `` or IdP role mapping) +2. **Check role assignment**: Ensure the user has the role in OSMO (via ``osmo user get `` or IdP role mapping) 3. **Review resource scope**: Verify the policy's ``resources`` field matches the pool name (e.g., ``pool/my-pool`` or ``pool/*``) .. _actions_resources_reference: diff --git a/run/start_backend_bazel.py b/run/start_backend_bazel.py index ba7e4b5a0..56c8c0a38 100644 --- a/run/start_backend_bazel.py +++ b/run/start_backend_bazel.py @@ -138,8 +138,7 @@ def _start_backend_operator(service_type: Literal['listener', 'worker'], emoji: '--serviceURL', f'http://{host_ip}:8002', '--backend', 'default', '--namespace', 'default', - '--progressDir', '/tmp/osmo/operator/', - '--metrics_prometheus_port', '9467' + '--progressDir', '/tmp/osmo/operator/' ] else: cmd = [ @@ -150,8 +149,7 @@ def _start_backend_operator(service_type: Literal['listener', 'worker'], emoji: '--backend', 'default', '--namespace', 'default', '--username', 'testuser', - '--progress_folder_path', '/tmp/osmo/operator', - '--metrics_prometheus_port', '9468' + '--progress_folder_path', '/tmp/osmo/operator' ] process = run_command_with_logging( diff --git a/run/start_service_bazel.py b/run/start_service_bazel.py index 3305f5724..f78ade632 100644 --- a/run/start_service_bazel.py +++ b/run/start_service_bazel.py @@ -315,8 +315,7 @@ def _start_service_worker(): 'bazel', 'run', '@osmo_workspace//src/service/worker:worker_binary', '--', '--method=dev', - '--progress_file', '/tmp/osmo/service/last_progress_worker', - '--metrics_prometheus_port', '9465' + '--progress_file', '/tmp/osmo/service/last_progress_worker' ] process = run_command_with_logging( @@ -388,8 +387,7 @@ def _start_delayed_job_monitor(): '@osmo_workspace//src/service/delayed_job_monitor:delayed_job_monitor_binary', '--', '--method=dev', - '--progress_file', '/tmp/osmo/service/last_progress_delayed_job_monitor', - '--metrics_prometheus_port', '9466' + '--progress_file', '/tmp/osmo/service/last_progress_delayed_job_monitor' ] process = run_command_with_logging( diff --git a/src/cli/config.py b/src/cli/config.py index 9a7a632c8..cb5ca7699 100644 --- a/src/cli/config.py +++ b/src/cli/config.py @@ -324,12 +324,17 @@ def _fetch_data_from_config(config_info: Any) -> Any: return config_info -def _get_current_config(service_client: client.ServiceClient, config_type: str) -> Any: +def _get_current_config( + service_client: client.ServiceClient, + config_type: str, + params: Dict[str, Any] | None = None, +) -> Any: """ Get the current config Args: service_client: The service client instance config_type: The string config type from parsed arguments + params: Optional query parameters to include in the request """ if config_type not in [t.value for t in config_history.ConfigHistoryType]: raise osmo_errors.OSMOUserError( @@ -337,7 +342,7 @@ def _get_current_config(service_client: client.ServiceClient, config_type: str) f'Available types: {CONFIG_TYPES_STRING}' ) return service_client.request( - client.RequestMethod.GET, f'api/configs/{config_type.lower()}' + client.RequestMethod.GET, f'api/configs/{config_type.lower()}', params=params ) @@ -351,6 +356,10 @@ def _run_show_command(service_client: client.ServiceClient, args: argparse.Names # Parse the config identifier if ':' in args.config: # Format is : + if args.verbose: + raise osmo_errors.OSMOUserError( + '--verbose is not supported for historical revisions' + ) revision = config_history.ConfigHistoryRevision(args.config) params: Dict[str, Any] = { 'config_types': [revision.config_type.value], @@ -369,7 +378,12 @@ def _run_show_command(service_client: client.ServiceClient, args: argparse.Names data = result['configs'][0]['data'] else: # Format is - data = _get_current_config(service_client, args.config) + if args.verbose and args.config != config_history.ConfigHistoryType.POOL.value: + raise osmo_errors.OSMOUserError( + f'--verbose is only supported for POOL configs, not {args.config}' + ) + request_params: Dict[str, Any] | None = {'verbose': True} if args.verbose else None + data = _get_current_config(service_client, args.config, params=request_params) # Handle multiple name arguments for indexing if args.names: @@ -897,6 +911,12 @@ def setup_parser(parser: argparse._SubParsersAction): Show the ``user_workflow_limits`` workflow configuration in a previous revision:: osmo config show WORKFLOW:3 user_workflow_limits + +Show a pool configuration with parsed pod templates, group templates, and resource validations:: + + osmo config show POOL --verbose + + osmo config show POOL my-pool --verbose ''' ) show_parser.add_argument( @@ -909,6 +929,12 @@ def setup_parser(parser: argparse._SubParsersAction): nargs='*', help='Optional names/indices to index into the config. Can be used to show a named config.' ) + show_parser.add_argument( + '--verbose', '-v', + action='store_true', + help='Show verbose output including parsed pod templates, group templates, and resource ' + 'validations. Only applicable when CONFIG_TYPE is POOL.', + ) show_parser.set_defaults(func=_run_show_command) diff --git a/src/cli/workflow.py b/src/cli/workflow.py index dcc9bb136..71e901f0f 100644 --- a/src/cli/workflow.py +++ b/src/cli/workflow.py @@ -259,7 +259,13 @@ def setup_parser(parser: argparse._SubParsersAction): list_parser.add_argument('--count', '-c', default=20, type=validation.positive_integer, - help='Display the given count of workflows. Default value is 20.') + help='Display the given count of workflows. Default value is 20. ' + 'Use --offset to skip results for pagination.') + list_parser.add_argument('--offset', '-f', + default=0, + type=validation.non_negative_integer, + help='Skip the first N workflows (newest first, server-side order). ' + 'Use with --count to paginate results. Default is 0.') list_parser.add_argument('--name', '-n', type=str, help='Display workflows which contains the string.') @@ -855,7 +861,8 @@ def _workflow_logs(service_client: client.ServiceClient, args: argparse.Namespac except requests.exceptions.ChunkedEncodingError as error: # Check if this is specifically the timeout case with InvalidChunkLength error_str = str(error) - if 'InvalidChunkLength' in error_str and "got length b''" in error_str: + if ('InvalidChunkLength' in error_str and "got length b''" in error_str) or \ + ('Response ended prematurely' in error_str): print('\nLog stream has timed out or failed. ' 'Please run the command again to continue viewing logs.') return @@ -885,7 +892,8 @@ def _workflow_events(service_client: client.ServiceClient, args: argparse.Namesp except requests.exceptions.ChunkedEncodingError as error: # Check if this is specifically the timeout case with InvalidChunkLength error_str = str(error) - if 'InvalidChunkLength' in error_str and "got length b''" in error_str: + if ('InvalidChunkLength' in error_str and "got length b''" in error_str) or \ + ('Response ended prematurely' in error_str): print('\nEvent stream has timed out or failed. ' 'Please run the command again to continue viewing events.') return @@ -1028,7 +1036,7 @@ def _list_workflows(service_client: client.ServiceClient, args: argparse.Namespa while True: count = min(args.count - current_count, 1000) params['limit'] = count - params['offset'] = current_count + params['offset'] = args.offset + current_count workflow_result = service_client.request( client.RequestMethod.GET, diff --git a/src/operator/backend_listener.py b/src/operator/backend_listener.py index e1f952f99..40a306b08 100644 --- a/src/operator/backend_listener.py +++ b/src/operator/backend_listener.py @@ -930,7 +930,7 @@ def calculate_pod_status(pod: Any) -> Tuple[task.TaskGroupStatus, str, Optional[ # When a container fails to create, the pod will not be Ready. # The lastTransitionTime of this condition is the closest timestamp. if condition.type == 'Ready' and condition.status == 'False': - now = datetime.datetime.now() + now = datetime.datetime.now(datetime.timezone.utc) last_transition_time = condition.last_transition_time if last_transition_time: time_diff = now - last_transition_time diff --git a/src/operator/backend_test_runner/daemonset_manager.py b/src/operator/backend_test_runner/daemonset_manager.py index 0a26874f7..6d0759d60 100644 --- a/src/operator/backend_test_runner/daemonset_manager.py +++ b/src/operator/backend_test_runner/daemonset_manager.py @@ -269,7 +269,7 @@ def delete_daemonset(self) -> None: name=pod.metadata.name, namespace=self.namespace, body=kb_client.V1DeleteOptions( - grace_period_seconds=0, + grace_period_seconds=30, propagation_policy='Background' ) ) @@ -281,6 +281,30 @@ def delete_daemonset(self) -> None: logging.error('Error during cleanup: %s', e) raise + def _wait_for_daemonset_deletion(self, timeout: int = 120) -> None: + """Poll until the daemonset is fully removed from the API server. + + After delete_daemonset() the object may still exist with a deletionTimestamp + while Kubernetes garbage-collects dependents. Attempting to create a new + daemonset with the same name before it is gone causes a 409 Conflict. + + Args: + timeout: Maximum seconds to wait for the daemonset to disappear. + """ + start = time.time() + while time.time() - start < timeout: + try: + self.apps_v1.read_namespaced_daemon_set( + name=self.name, namespace=self.namespace) + logging.info('Waiting for daemonset %s to be fully deleted...', self.name) + time.sleep(2) + except kb_client.rest.ApiException as e: + if e.status == 404: + logging.info('Daemonset %s fully deleted', self.name) + return + raise + logging.warning('Timed out waiting for daemonset %s deletion after %ds', self.name, timeout) + def deploy_and_wait(self) -> bool: """Deploy daemonset and wait for condition on all nodes. @@ -291,6 +315,7 @@ def deploy_and_wait(self) -> bool: try: # Clean up any existing resources with this name if exists self.delete_daemonset() + self._wait_for_daemonset_deletion() # Create daemonset self.create_daemonset() logging.info('Waiting for daemonset and conditions at %s', time.time()) diff --git a/src/operator/tests/test_pod_status_calculator.py b/src/operator/tests/test_pod_status_calculator.py index 00275ab8c..d6d3b9e58 100644 --- a/src/operator/tests/test_pod_status_calculator.py +++ b/src/operator/tests/test_pod_status_calculator.py @@ -34,18 +34,19 @@ def parse_time_string(time_str: str) -> datetime.datetime: - """Parse time strings like 'now', 'now-5m', 'now-15m' into datetime objects""" + """Parse time strings like 'now', 'now-5m', 'now-15m' into timezone-aware UTC datetimes.""" + utc_now = datetime.datetime.now(datetime.timezone.utc) if time_str == 'now': - return datetime.datetime.now() + return utc_now elif time_str.startswith('now-'): parts = time_str[4:] if parts.endswith('m'): minutes = int(parts[:-1]) - return datetime.datetime.now() - datetime.timedelta(minutes=minutes) + return utc_now - datetime.timedelta(minutes=minutes) elif parts.endswith('h'): hours = int(parts[:-1]) - return datetime.datetime.now() - datetime.timedelta(hours=hours) - return datetime.datetime.now() + return utc_now - datetime.timedelta(hours=hours) + return utc_now def create_pod_from_json(test_input: Dict) -> V1Pod: diff --git a/src/operator/utils/node_validation_test/connection_validator.py b/src/operator/utils/node_validation_test/connection_validator.py index ff2e52271..f3196c871 100644 --- a/src/operator/utils/node_validation_test/connection_validator.py +++ b/src/operator/utils/node_validation_test/connection_validator.py @@ -37,10 +37,12 @@ class URLTestConfig(pydantic.BaseModel): method: str = pydantic.Field(default='GET', description='HTTP method to use') timeout: int = pydantic.Field( default=30, description='Timeout in seconds for the connection test') - expected_status_code: int = pydantic.Field( - default=200, description='Expected HTTP status code') + expected_status_code: Optional[int] = pydantic.Field( + default=None, description='Expected HTTP status code (None means any non-5xx is success)') condition_name: Optional[str] = pydantic.Field( default='ServiceConnectionTestFailure', description='Custom condition name for this URL') + retriable_status_codes: List[int] = pydantic.Field( + default=[429, 503], description='Status codes that should trigger retry') class ConnectionTestConfig(test_base.NodeTestConfig): @@ -119,6 +121,13 @@ def _connection_test(self, url_config: URLTestConfig) -> test_base.NodeCondition Returns: NodeCondition on success, None on failure (to trigger retry/backoff). + + Status code handling: + - If expected_status_code is set, only that code is considered success + - If expected_status_code is None (default): + - Retriable codes (429, 503) trigger retry + - Any other 5xx indicates service is down + - All other codes (2xx, 3xx, 4xx) indicate service is reachable """ try: logging.info('Testing URL: %s', url_config.url) @@ -128,21 +137,50 @@ def _connection_test(self, url_config: URLTestConfig) -> test_base.NodeCondition timeout=url_config.timeout, ) - if response.status_code != url_config.expected_status_code: - logging.error( - 'Unexpected status code from %s: %s != %s', + status_code = response.status_code + + # If expected_status_code is explicitly set, use strict matching + if url_config.expected_status_code is not None: + if status_code != url_config.expected_status_code: + logging.error( + 'Unexpected status code from %s: %s != %s', + url_config.url, + status_code, + url_config.expected_status_code, + ) + return None + else: + # Check if status code is retriable (e.g., 429 rate limiting, 503 unavailable) + if status_code in url_config.retriable_status_codes: + logging.warning( + 'Retriable status code from %s: %s, will retry', + url_config.url, + status_code, + ) + return None + + # Any 5xx not already caught by retriable_status_codes is a service failure + if status_code >= 500: + logging.error( + 'Service failure status code from %s: %s', + url_config.url, + status_code, + ) + return None + + # Any other status code (2xx, 3xx, 4xx) means service is reachable + logging.info( + 'Service reachable at %s with status code %s', url_config.url, - response.status_code, - url_config.expected_status_code, + status_code, ) - return None logging.info('URL test passed: %s (%s)', url_config.url, url_config.condition_name) return test_base.NodeCondition( type=url_config.condition_name or self.config.condition_name, status='False', reason='ServiceConnectionSuccess', - message=f'Connection test passed: {url_config.url}', + message=f'Connection test passed: {url_config.url} (status: {status_code})', ) except requests.RequestException as e: logging.error('Connection test failed for %s: %s', url_config.url, str(e)) diff --git a/src/operator/utils/node_validation_test/test_base.py b/src/operator/utils/node_validation_test/test_base.py index 6504d7212..16d559efc 100644 --- a/src/operator/utils/node_validation_test/test_base.py +++ b/src/operator/utils/node_validation_test/test_base.py @@ -19,6 +19,7 @@ from datetime import datetime import logging +import signal import time from typing import Any, Dict, List, Optional @@ -28,6 +29,23 @@ from src.lib.utils import logging as logging_utils, osmo_errors from src.utils import static_config + +def _sigterm_handler(signum: int, frame: Any) -> None: # pylint: disable=unused-argument + """Convert SIGTERM into SystemExit so that finally blocks execute during pod termination.""" + logging.info('Received SIGTERM (signal %d), raising SystemExit for graceful cleanup', signum) + raise SystemExit(128 + signum) + + +def register_graceful_shutdown() -> None: + """Register a SIGTERM handler that triggers finally-block cleanup. + + Kubernetes sends SIGTERM before SIGKILL during pod termination. + Python's default SIGTERM handler terminates without running finally blocks. + This converts SIGTERM into SystemExit, which does trigger finally blocks, + allowing validators to clean up resources (e.g. benchmark pods) on shutdown. + """ + signal.signal(signal.SIGTERM, _sigterm_handler) + DEFAULT_NODE_CONDITION_PREFIX = 'osmo.nvidia.com/' @@ -138,6 +156,8 @@ def __init__(self, node_name: str, Args: node_name: Optional node name. If not provided, will be read from NODE_NAME env var. """ + register_graceful_shutdown() + # Load in-cluster config try: kb_config.load_incluster_config() diff --git a/src/service/agent/objects.py b/src/service/agent/objects.py index fc81ea343..b95c6383e 100644 --- a/src/service/agent/objects.py +++ b/src/service/agent/objects.py @@ -239,7 +239,8 @@ async def handle_job(self, job_spec: Dict): start_time=time.time()) # Do not Create the Group unless the status is Scheduling - pre_complete, message = job.prepare_execute( + pre_complete, message = await asyncio.to_thread( + job.prepare_execute, self.context, self._progress_writer, self._progress_iter_freq) if not pre_complete: result = jobs.JobResult( diff --git a/src/service/authz_sidecar/server/authz_server.go b/src/service/authz_sidecar/server/authz_server.go index 238c61de3..d31e0ffe6 100644 --- a/src/service/authz_sidecar/server/authz_server.go +++ b/src/service/authz_sidecar/server/authz_server.go @@ -40,6 +40,8 @@ const ( // Header names headerOsmoUser = "x-osmo-user" headerOsmoRoles = "x-osmo-roles" + headerOsmoTokenName = "x-osmo-token-name" + headerOsmoWorkflowID = "x-osmo-workflow-id" headerOsmoAllowedPools = "x-osmo-allowed-pools" // Default role added to all users @@ -134,9 +136,11 @@ func (s *AuthzServer) Check(ctx context.Context, req *envoy_service_auth_v3.Chec method := httpAttrs.GetMethod() headers := httpAttrs.GetHeaders() - // Extract user and roles from headers + // Extract user, roles, and token/workflow identifiers from headers user := headers[headerOsmoUser] rolesHeader := headers[headerOsmoRoles] + tokenName := headers[headerOsmoTokenName] + workflowID := headers[headerOsmoWorkflowID] // Parse roles (comma-separated) var roleNames []string @@ -153,7 +157,9 @@ func (s *AuthzServer) Check(ctx context.Context, req *envoy_service_auth_v3.Chec // complete set of assigned OSMO roles in a single atomic operation. // This maps external role names (from the JWT) to OSMO roles via // role_external_mappings and applies sync_mode logic (import/force). - if user != "" { + // Skip sync for access tokens and workflow-originated requests, as their + // roles are already resolved from the access_token_roles table. + if user != "" && tokenName == "" && workflowID == "" { dbRoleNames, err := roles.SyncUserRoles(ctx, s.pgClient, user, roleNames, s.logger) if err != nil { s.logger.Error("failed to sync user roles", @@ -173,6 +179,8 @@ func (s *AuthzServer) Check(ctx context.Context, req *envoy_service_auth_v3.Chec slog.String("user", user), slog.String("path", path), slog.String("method", method), + slog.String("token_name", tokenName), + slog.String("workflow_id", workflowID), slog.Any("roles", roleNames), ) @@ -181,6 +189,8 @@ func (s *AuthzServer) Check(ctx context.Context, req *envoy_service_auth_v3.Chec if err != nil { s.logger.Error("error resolving roles", slog.String("user", user), + slog.String("token_name", tokenName), + slog.String("workflow_id", workflowID), slog.String("error", err.Error()), slog.Any("roles", roleNames), ) @@ -199,6 +209,8 @@ func (s *AuthzServer) Check(ctx context.Context, req *envoy_service_auth_v3.Chec slog.String("user", user), slog.String("path", path), slog.String("method", method), + slog.String("token_name", tokenName), + slog.String("workflow_id", workflowID), slog.Any("roles", roleNames), slog.Duration("parse", parseDone.Sub(checkStart)), slog.Duration("sync_roles", syncDone.Sub(parseDone)), @@ -213,6 +225,8 @@ func (s *AuthzServer) Check(ctx context.Context, req *envoy_service_auth_v3.Chec slog.String("user", user), slog.String("path", path), slog.String("method", method), + slog.String("token_name", tokenName), + slog.String("workflow_id", workflowID), ) responseHeaders := map[string]string{} @@ -229,6 +243,8 @@ func (s *AuthzServer) Check(ctx context.Context, req *envoy_service_auth_v3.Chec slog.String("user", user), slog.String("path", path), slog.String("method", method), + slog.String("token_name", tokenName), + slog.String("workflow_id", workflowID), slog.Duration("parse", parseDone.Sub(checkStart)), slog.Duration("sync_roles", syncDone.Sub(parseDone)), slog.Duration("resolve_roles", resolveDone.Sub(syncDone)), diff --git a/src/service/core/tests/BUILD b/src/service/core/tests/BUILD index 3ef3737d3..f0163dd45 100644 --- a/src/service/core/tests/BUILD +++ b/src/service/core/tests/BUILD @@ -40,6 +40,25 @@ osmo_py_test( visibility = ["//visibility:public"], ) +osmo_py_test( + name = "test_group_templates", + srcs = ["test_group_templates.py"], + deps = [ + ":fixture", + "//src/lib/utils:common", + "//src/lib/utils:osmo_errors", + "//src/lib/utils:priority", + "//src/service/core:service_lib", + "//src/tests/common", + "//src/tests/common/storage", + ], + size = "large", + tags = [ + "requires-network", + ], + visibility = ["//visibility:public"], +) + osmo_py_library( name = "fixture", srcs = ["fixture.py"], diff --git a/src/service/core/tests/fixture.py b/src/service/core/tests/fixture.py index fd73ea345..70a284188 100644 --- a/src/service/core/tests/fixture.py +++ b/src/service/core/tests/fixture.py @@ -19,6 +19,7 @@ import logging import io import time +from typing import Any, Dict from fastapi import testclient @@ -117,15 +118,19 @@ def tearDown(self): super().tearDown() - def create_test_backend(self, backend_name='test_backend'): + def create_test_backend(self, database=None, backend_name='test_backend'): """Helper function to create a test backend. Args: + database: Database connector instance. If None, gets the current instance. backend_name: Name of the backend to create. Returns: The created backend configuration """ + if database is None: + database = connectors.postgres.PostgresConnector.get_instance() + backend = { 'k8s_uid': 'test_uid', 'k8s_namespace': 'test_namespace', @@ -133,11 +138,25 @@ def create_test_backend(self, backend_name='test_backend'): 'node_condition_prefix': 'test.osmo.nvidia.com/', } agent_helpers.create_backend( - backend_name, backend_messages.InitBody(**backend)) + database, backend_name, backend_messages.InitBody(**backend)) + + def create_test_group_template(self, name: str, group_template: Dict[str, Any]) -> None: + """Helper function to create a group template. + + Args: + name: Name of the group template + group_template: The group template dict (must contain apiVersion, kind, metadata.name) + """ + config_service.put_group_template( + name=name, + request=config_objects.PutGroupTemplateRequest(configs=group_template), + username='test@nvidia.com', + ) def create_test_pool(self, pool_name='test_pool', description='test_description', default_platform='test_platform', backend='test_backend', - common_pod_template=None, enable_maintenance=False): + common_pod_template=None, common_group_templates=None, + enable_maintenance=False): """Helper function to create a test pool with configurable parameters. Args: @@ -146,6 +165,7 @@ def create_test_pool(self, pool_name='test_pool', description='test_description' default_platform: Default platform for the pool backend: Backend for the pool common_pod_template: List of pod templates to use (defaults to None) + common_group_templates: List of group template names to use (defaults to None) enable_maintenance: Whether maintenance mode is enabled Returns: @@ -165,6 +185,9 @@ def create_test_pool(self, pool_name='test_pool', description='test_description' if common_pod_template: pool_config['common_pod_template'] = common_pod_template + if common_group_templates: + pool_config['common_group_templates'] = common_group_templates + config_service.put_pool( name=pool_name, request=config_objects.PutPoolRequest( @@ -174,10 +197,8 @@ def create_test_pool(self, pool_name='test_pool', description='test_description' ) return pool_config - def create_task_group(self): + def create_task_group(self, database): """Helper function to create a task group for token substitution testing.""" - context = objects.WorkflowServiceContext.get() - database = context.database # Create workflow record in database workflow_uuid = common.generate_unique_id() cmd = ''' diff --git a/src/service/core/tests/test_group_templates.py b/src/service/core/tests/test_group_templates.py new file mode 100644 index 000000000..a7037a761 --- /dev/null +++ b/src/service/core/tests/test_group_templates.py @@ -0,0 +1,483 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +SPDX-License-Identifier: Apache-2.0 +""" + +import copy +import datetime +import os +import tempfile +import unittest +from typing import Any, Dict + +from src.lib.utils import common, osmo_errors, priority as wf_priority +from src.service.core.config import config_service, objects as config_objects +from src.service.core.tests import fixture as service_fixture +from src.utils import connectors +from src.utils.job import common as task_common, task +from src.utils.progress_check import progress +from src.tests.common import runner + + +# A minimal valid ComputeDomain group template used across tests +_COMPUTE_DOMAIN_TEMPLATE: Dict[str, Any] = { + 'apiVersion': 'resource.nvidia.com/v1beta1', + 'kind': 'ComputeDomain', + 'metadata': { + 'name': 'compute-domain-{{WF_GROUP_UUID}}', + }, + 'spec': { + 'channel': { + 'resourceClaimTemplate': { + 'name': 'compute-domain-{{WF_GROUP_UUID}}', + } + } + }, +} + + +class GroupTemplateRenderTest(unittest.TestCase): + """Unit tests for render_group_templates (no DB required).""" + + def _compute_domain_template(self, name_token: str = '{{WF_GROUP_UUID}}') -> Dict[str, Any]: + return { + 'apiVersion': 'resource.nvidia.com/v1beta1', + 'kind': 'ComputeDomain', + 'metadata': { + 'name': f'compute-domain-{name_token}', + }, + 'spec': { + 'channel': { + 'resourceClaimTemplate': { + 'name': f'compute-domain-{name_token}', + } + } + } + } + + def test_basic_variable_substitution(self): + """Variables in template fields are substituted with provided values.""" + group_uuid = 'abc-123' + templates = [self._compute_domain_template()] + result = task.render_group_templates(templates, {'WF_GROUP_UUID': group_uuid}, {}) + + self.assertEqual(result[0]['metadata']['name'], f'compute-domain-{group_uuid}') + self.assertEqual( + result[0]['spec']['channel']['resourceClaimTemplate']['name'], + f'compute-domain-{group_uuid}', + ) + + def test_namespace_stripped(self): + """metadata.namespace is removed from the rendered output.""" + templates = [{ + 'apiVersion': 'v1', + 'kind': 'ConfigMap', + 'metadata': {'name': 'my-cm', 'namespace': 'user-namespace'}, + }] + result = task.render_group_templates(templates, {}, {}) + self.assertNotIn('namespace', result[0]['metadata']) + + def test_osmo_labels_injected(self): + """OSMO labels are added into metadata.labels of the rendered resource.""" + templates = [{'apiVersion': 'v1', 'kind': 'ConfigMap', 'metadata': {'name': 'my-cm'}}] + labels = {'osmo.group_uuid': 'grp-1', 'osmo.workflow_uuid': 'wf-1'} + result = task.render_group_templates(templates, {}, labels) + + self.assertEqual(result[0]['metadata']['labels']['osmo.group_uuid'], 'grp-1') + self.assertEqual(result[0]['metadata']['labels']['osmo.workflow_uuid'], 'wf-1') + + def test_existing_labels_preserved_and_merged(self): + """Pre-existing metadata.labels on the template are kept alongside injected OSMO labels.""" + templates = [{ + 'apiVersion': 'v1', + 'kind': 'ConfigMap', + 'metadata': {'name': 'my-cm', 'labels': {'custom-key': 'custom-value'}}, + }] + labels = {'osmo.group_uuid': 'grp-1'} + result = task.render_group_templates(templates, {}, labels) + + self.assertEqual(result[0]['metadata']['labels']['custom-key'], 'custom-value') + self.assertEqual(result[0]['metadata']['labels']['osmo.group_uuid'], 'grp-1') + + def test_input_templates_not_mutated(self): + """The original templates list and its contents are unchanged after rendering.""" + templates = [self._compute_domain_template()] + original = copy.deepcopy(templates) + task.render_group_templates(templates, {'WF_GROUP_UUID': 'xyz'}, {'osmo.group_uuid': 'g'}) + self.assertEqual(templates, original) + + def test_multiple_templates_all_rendered(self): + """All templates in the input list are rendered and returned.""" + group_uuid = 'grp-42' + templates = [ + self._compute_domain_template(), + { + 'apiVersion': 'v1', + 'kind': 'Secret', + 'metadata': {'name': 'secret-{{WF_GROUP_UUID}}'}, + }, + ] + result = task.render_group_templates(templates, {'WF_GROUP_UUID': group_uuid}, {}) + + self.assertEqual(len(result), 2) + self.assertEqual(result[0]['metadata']['name'], f'compute-domain-{group_uuid}') + self.assertEqual(result[1]['metadata']['name'], f'secret-{group_uuid}') + + def test_empty_templates_returns_empty_list(self): + """An empty template list returns an empty list without error.""" + result = task.render_group_templates([], {'WF_GROUP_UUID': 'grp-1'}, {}) + self.assertEqual(result, []) + + +class GroupTemplateTest(service_fixture.ServiceTestFixture): + """DB-backed tests for group template CRUD, pool assignment, and KB spec generation.""" + + def setUp(self): + super().setUp() + self.database = connectors.PostgresConnector.get_instance() + + # --- CRUD tests --- + + def test_put_and_get_single_template(self): + """PUT a group template via API, then GET it and verify the body matches.""" + self.create_test_group_template('compute-domain', _COMPUTE_DOMAIN_TEMPLATE) + + response = self.client.get('/api/configs/group_template/compute-domain') + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json(), _COMPUTE_DOMAIN_TEMPLATE) + + def test_list_group_templates_returns_all(self): + """PUT two group templates, then LIST returns both by name.""" + second_template = { + 'apiVersion': 'v1', + 'kind': 'ConfigMap', + 'metadata': {'name': 'shared-config'}, + } + self.create_test_group_template('compute-domain', _COMPUTE_DOMAIN_TEMPLATE) + self.create_test_group_template('shared-config', second_template) + + response = self.client.get('/api/configs/group_template') + self.assertEqual(response.status_code, 200) + body = response.json() + self.assertIsInstance(body, dict) + self.assertIn('compute-domain', body) + self.assertIn('shared-config', body) + + def test_put_bulk_group_templates(self): + """PUT multiple templates in one request via the bulk endpoint.""" + second_template = { + 'apiVersion': 'v1', + 'kind': 'ConfigMap', + 'metadata': {'name': 'shared-config'}, + } + config_service.put_group_templates( + request=config_objects.PutGroupTemplatesRequest(configs={ + 'compute-domain': _COMPUTE_DOMAIN_TEMPLATE, + 'shared-config': second_template, + }), + username='test@nvidia.com', + ) + + response = self.client.get('/api/configs/group_template') + self.assertEqual(response.status_code, 200) + body = response.json() + self.assertIn('compute-domain', body) + self.assertIn('shared-config', body) + + def test_put_overwrites_existing_template(self): + """PUT an existing template name replaces it with the new body.""" + self.create_test_group_template('compute-domain', _COMPUTE_DOMAIN_TEMPLATE) + + updated = copy.deepcopy(_COMPUTE_DOMAIN_TEMPLATE) + updated['metadata']['name'] = 'compute-domain-updated' + self.create_test_group_template('compute-domain', updated) + + response = self.client.get('/api/configs/group_template/compute-domain') + self.assertEqual(response.json()['metadata']['name'], 'compute-domain-updated') + + def test_delete_unused_template_succeeds(self): + """DELETE a template that is not assigned to any pool removes it from the DB.""" + self.create_test_group_template('compute-domain', _COMPUTE_DOMAIN_TEMPLATE) + + config_service.delete_group_template( + name='compute-domain', + request=config_objects.ConfigsRequest(), + username='test@nvidia.com', + ) + + response = self.client.get('/api/configs/group_template/compute-domain') + self.assertEqual(response.status_code, 400) + + def test_delete_template_in_use_raises_error(self): + """DELETE a template referenced by a pool raises OSMOUserError with pool name in message.""" + self.create_test_backend(self.database) + self.create_test_group_template('compute-domain', _COMPUTE_DOMAIN_TEMPLATE) + self.create_test_pool( + pool_name='nvlink-pool', + backend='test_backend', + common_group_templates=['compute-domain'], + ) + + with self.assertRaises(osmo_errors.OSMOUserError) as context: + config_service.delete_group_template( + name='compute-domain', + request=config_objects.ConfigsRequest(), + username='test@nvidia.com', + ) + self.assertIn('nvlink-pool', str(context.exception)) + + def test_template_missing_api_version_raises_error(self): + """A template without apiVersion raises OSMOUserError on insert.""" + invalid = {'kind': 'ComputeDomain', 'metadata': {'name': 'cd-1'}} + with self.assertRaises(osmo_errors.OSMOUserError): + connectors.GroupTemplate(group_template=invalid).insert_into_db( + self.database, 'invalid-template') + + def test_template_missing_kind_raises_error(self): + """A template without kind raises OSMOUserError on insert.""" + invalid = {'apiVersion': 'v1', 'metadata': {'name': 'cd-1'}} + with self.assertRaises(osmo_errors.OSMOUserError): + connectors.GroupTemplate(group_template=invalid).insert_into_db( + self.database, 'invalid-template') + + def test_template_missing_metadata_name_raises_error(self): + """A template without metadata.name raises OSMOUserError on insert.""" + invalid = {'apiVersion': 'v1', 'kind': 'ConfigMap', 'metadata': {}} + with self.assertRaises(osmo_errors.OSMOUserError): + connectors.GroupTemplate(group_template=invalid).insert_into_db( + self.database, 'invalid-template') + + def test_template_with_namespace_raises_error(self): + """A template with metadata.namespace raises OSMOUserError on insert.""" + invalid = copy.deepcopy(_COMPUTE_DOMAIN_TEMPLATE) + invalid['metadata']['namespace'] = 'user-namespace' + with self.assertRaises(osmo_errors.OSMOUserError) as context: + connectors.GroupTemplate(group_template=invalid).insert_into_db( + self.database, 'namespaced-template') + self.assertIn('namespace', str(context.exception).lower()) + + # --- Pool group template assignment tests --- + + def test_pool_with_single_group_template(self): + """Pool assigned one group template has it in parsed_group_templates.""" + self.create_test_backend(self.database) + self.create_test_group_template('compute-domain', _COMPUTE_DOMAIN_TEMPLATE) + self.create_test_pool( + pool_name='nvlink-pool', + backend='test_backend', + common_group_templates=['compute-domain'], + ) + + pool = connectors.Pool.fetch_from_db(self.database, 'nvlink-pool') + self.assertEqual(len(pool.parsed_group_templates), 1) + self.assertEqual( + pool.parsed_group_templates[0]['metadata']['name'], + _COMPUTE_DOMAIN_TEMPLATE['metadata']['name'], + ) + + def test_pool_with_multiple_distinct_templates(self): + """Pool assigned two templates with different kinds has both in parsed_group_templates.""" + second_template = { + 'apiVersion': 'v1', + 'kind': 'ConfigMap', + 'metadata': {'name': 'shared-config'}, + } + self.create_test_backend(self.database) + self.create_test_group_template('compute-domain', _COMPUTE_DOMAIN_TEMPLATE) + self.create_test_group_template('shared-config', second_template) + self.create_test_pool( + pool_name='nvlink-pool', + backend='test_backend', + common_group_templates=['compute-domain', 'shared-config'], + ) + + pool = connectors.Pool.fetch_from_db(self.database, 'nvlink-pool') + self.assertEqual(len(pool.parsed_group_templates), 2) + kinds = {t['kind'] for t in pool.parsed_group_templates} + self.assertIn('ComputeDomain', kinds) + self.assertIn('ConfigMap', kinds) + + def test_pool_group_templates_merged_on_same_resource_key(self): + """Two templates with the same (apiVersion, kind, metadata.name) are merged.""" + base_template = { + 'apiVersion': 'resource.nvidia.com/v1beta1', + 'kind': 'ComputeDomain', + 'metadata': {'name': 'compute-domain-shared'}, + 'spec': {'channel': {'mode': 'single'}}, + } + patch_template = { + 'apiVersion': 'resource.nvidia.com/v1beta1', + 'kind': 'ComputeDomain', + 'metadata': {'name': 'compute-domain-shared'}, + 'spec': {'extra': 'value'}, + } + self.create_test_backend(self.database) + self.create_test_group_template('base-cd', base_template) + self.create_test_group_template('patch-cd', patch_template) + self.create_test_pool( + pool_name='nvlink-pool', + backend='test_backend', + common_group_templates=['base-cd', 'patch-cd'], + ) + + pool = connectors.Pool.fetch_from_db(self.database, 'nvlink-pool') + self.assertEqual(len(pool.parsed_group_templates), 1) + merged = pool.parsed_group_templates[0] + self.assertEqual(merged['spec']['channel']['mode'], 'single') + self.assertEqual(merged['spec']['extra'], 'value') + + def test_pool_parsed_templates_updated_when_template_changes(self): + """After updating a group template, pool's parsed_group_templates reflects the change.""" + self.create_test_backend(self.database) + self.create_test_group_template('compute-domain', _COMPUTE_DOMAIN_TEMPLATE) + self.create_test_pool( + pool_name='nvlink-pool', + backend='test_backend', + common_group_templates=['compute-domain'], + ) + + updated = copy.deepcopy(_COMPUTE_DOMAIN_TEMPLATE) + updated['spec']['channel']['resourceClaimTemplate']['name'] = 'updated-name' + self.create_test_group_template('compute-domain', updated) + + pool = connectors.Pool.fetch_from_db(self.database, 'nvlink-pool') + self.assertEqual( + pool.parsed_group_templates[0]['spec']['channel']['resourceClaimTemplate']['name'], + 'updated-name', + ) + + def test_pool_with_no_group_templates(self): + """Pool with empty common_group_templates has an empty parsed_group_templates.""" + self.create_test_backend(self.database) + self.create_test_pool(pool_name='plain-pool', backend='test_backend') + + pool = connectors.Pool.fetch_from_db(self.database, 'plain-pool') + self.assertEqual(pool.parsed_group_templates, []) + + def test_pool_nonexistent_group_template_raises_error(self): + """Assigning a non-existent group template name to a pool raises an error.""" + self.create_test_backend(self.database) + with self.assertRaises(osmo_errors.OSMOUsageError): + self.create_test_pool( + pool_name='bad-pool', + backend='test_backend', + common_group_templates=['does-not-exist'], + ) + + # --- KB spec generation tests --- + + def _setup_for_kb_specs(self): + """Set up backend, workflow config, group template, pool, and task group.""" + self.create_test_backend(self.database) + + config_service.put_workflow_configs( + request=config_objects.PutWorkflowRequest(configs=connectors.WorkflowConfig( + workflow_data={ + 'credential': { + 'endpoint': 's3://bucket.io/AUTH_test/workflows', + 'access_key_id': 'test', + 'access_key': 'test_key', + 'region': 'us-east-1', + }, + }, + )), + username='test@nvidia.com', + ) + + tmpdir = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with + self.addCleanup(tmpdir.cleanup) + self._progress_writer = progress.ProgressWriter( + os.path.join(tmpdir.name, 'progress.txt')) + + self.create_test_group_template('compute-domain', _COMPUTE_DOMAIN_TEMPLATE) + self.create_test_pool( + pool_name='nvlink-pool', + backend='test_backend', + common_group_templates=['compute-domain'], + ) + self.task_group = self.create_task_group(self.database) + + def _run_get_kb_specs(self, pool_name: str, task_group: task.TaskGroup): + """Invoke get_kb_specs with standard test arguments.""" + workflow_config = self.database.get_workflow_configs() + backend_config_cache = connectors.BackendConfigCache() + return task_group.get_kb_specs( + workflow_uuid=common.generate_unique_id(), + user='test@nvidia.com', + workflow_config=workflow_config, + backend_config_cache=backend_config_cache, + backend_name='test_backend', + pool=pool_name, + progress_writer=self._progress_writer, + progress_iter_freq=datetime.timedelta(minutes=1), + workflow_plugins=task_common.WorkflowPlugins(), + priority=wf_priority.WorkflowPriority.NORMAL, + ) + + def test_group_template_resources_prepended(self): + """Group template resources appear before pod/secret resources in kb_resources.""" + self._setup_for_kb_specs() + kb_resources, _ = self._run_get_kb_specs('nvlink-pool', self.task_group) + + self.assertGreater(len(kb_resources), 0) + first_resource = kb_resources[0] + self.assertEqual(first_resource['apiVersion'], _COMPUTE_DOMAIN_TEMPLATE['apiVersion']) + self.assertEqual(first_resource['kind'], _COMPUTE_DOMAIN_TEMPLATE['kind']) + + def test_group_template_variable_substitution_in_kb_specs(self): + """WF_GROUP_UUID token in the template name is replaced with the actual group UUID.""" + self._setup_for_kb_specs() + kb_resources, _ = self._run_get_kb_specs('nvlink-pool', self.task_group) + + rendered_name = kb_resources[0]['metadata']['name'] + self.assertNotIn('{{', rendered_name) + self.assertIn(self.task_group.group_uuid, rendered_name) + + def test_group_template_osmo_labels_on_rendered_resource(self): + """OSMO labels (osmo.group_uuid, osmo.workflow_uuid, etc.) are present on the + rendered resource.""" + self._setup_for_kb_specs() + kb_resources, _ = self._run_get_kb_specs('nvlink-pool', self.task_group) + + rendered_labels = kb_resources[0]['metadata']['labels'] + self.assertIn('osmo.group_uuid', rendered_labels) + self.assertIn('osmo.workflow_uuid', rendered_labels) + self.assertTrue(rendered_labels['osmo.group_uuid']) + + def test_group_template_resource_types_recorded_on_task_group(self): + """After get_kb_specs, group_template_resource_types on the TaskGroup is populated.""" + self._setup_for_kb_specs() + self._run_get_kb_specs('nvlink-pool', self.task_group) + + self.assertEqual(len(self.task_group.group_template_resource_types), 1) + recorded = self.task_group.group_template_resource_types[0] + self.assertEqual(recorded['apiVersion'], _COMPUTE_DOMAIN_TEMPLATE['apiVersion']) + self.assertEqual(recorded['kind'], _COMPUTE_DOMAIN_TEMPLATE['kind']) + + def test_no_group_templates_no_prepended_resources(self): + """A pool with no group templates does not prepend any extra resources.""" + self._setup_for_kb_specs() + self.create_test_pool(pool_name='plain-pool', backend='test_backend') + + kb_resources, _ = self._run_get_kb_specs('plain-pool', self.task_group) + + self.assertEqual(self.task_group.group_template_resource_types, []) + for resource in kb_resources: + self.assertNotEqual(resource.get('kind'), 'ComputeDomain') + + +if __name__ == '__main__': + runner.run_test() diff --git a/src/service/core/tests/test_service.py b/src/service/core/tests/test_service.py index d5ef19d34..8ad9ba47c 100644 --- a/src/service/core/tests/test_service.py +++ b/src/service/core/tests/test_service.py @@ -38,6 +38,7 @@ class ServiceTestCase(service_fixture.ServiceTestFixture): """ def test_get_default_pool(self): + database = connectors.postgres.PostgresConnector.get_instance() response = self.client.get('/api/configs/pool?verbose=true') pools = response.json()['pools'] @@ -61,7 +62,7 @@ def test_get_default_pool(self): config_objects.DEFAULT_VARIABLES, pools['default']['common_default_variables']) # Patch the pool with new backend name - self.create_test_backend(backend_name='my_backend') + self.create_test_backend(database, backend_name='my_backend') config_service.patch_pool( name='default', request=config_objects.PatchPoolRequest( @@ -229,8 +230,10 @@ def test_update_pool_labels(self): Test that the pool labels are updated when the pod spec with different node selector is updated. ''' + database = connectors.postgres.PostgresConnector.get_instance() + # Create test backend - self.create_test_backend() + self.create_test_backend(database) resource_spec = { 'hostname': 'test_host', @@ -244,7 +247,7 @@ def test_update_pool_labels(self): }, } agent_helpers.update_resource( - 'test_backend', backend_messages.UpdateNodeBody(**resource_spec)) + database, 'test_backend', backend_messages.ResourceBody(**resource_spec)) pod_template = { 'spec': { 'nodeSelector': { @@ -299,7 +302,8 @@ def test_patch_pool_config(self): ''' Simple test for patching the pool config. ''' - self.create_test_backend(backend_name='test_backend') + database = connectors.postgres.PostgresConnector.get_instance() + self.create_test_backend(database, backend_name='test_backend') # Use the helper function to create the pool self.create_test_pool(pool_name='test_pool', backend='test_backend') @@ -338,9 +342,10 @@ def test_substitute_tokens(self): Test that the tokens are substituted correctly in the pod spec. ''' # Create a dummy TaskGroup object for testing token substitution + database = connectors.PostgresConnector.get_instance() # Setup backend - self.create_test_backend() + self.create_test_backend(database) # Setup workflow configs workflow_configs = connectors.WorkflowConfig(**{ @@ -413,7 +418,7 @@ def test_substitute_tokens(self): ) # Create task group and convert to pod specs - task_group = self.create_task_group() + task_group = self.create_task_group(database) pods, _, _ = task_group.convert_all_pod_specs( common.generate_unique_id(), 'test@nvidia.com', diff --git a/src/service/core/workflow/tests/BUILD b/src/service/core/workflow/tests/BUILD index 27ee6af42..a1052bb15 100644 --- a/src/service/core/workflow/tests/BUILD +++ b/src/service/core/workflow/tests/BUILD @@ -46,3 +46,11 @@ py_test( ], ) +py_test( + name = "test_redact_secrets", + srcs = ["test_redact_secrets.py"], + deps = [ + "//src/service/core/workflow", + ], +) + diff --git a/src/service/core/workflow/tests/test_calculate_pool_quotas.py b/src/service/core/workflow/tests/test_calculate_pool_quotas.py index 6ee0af949..e5d87cbfd 100644 --- a/src/service/core/workflow/tests/test_calculate_pool_quotas.py +++ b/src/service/core/workflow/tests/test_calculate_pool_quotas.py @@ -454,6 +454,31 @@ def test_pools_with_no_resources(self): self.assertEqual(usage.total_free, '0') self.assertEqual(usage.quota_limit, '0') + def test_multiple_pools_with_no_resources_same_backend(self): + """Multiple pools on the same backend with no nodes should each appear + as their own entry in the response, not silently overwrite each other.""" + pool_configs = { + 'pool-a': make_pool_config('pool-a', 'k8s-1'), + 'pool-b': make_pool_config('pool-b', 'k8s-1'), + 'pool-c': make_pool_config('pool-c', 'k8s-1'), + } + + response = calculate_pool_quotas(pool_configs, [], []) + + # All three pools should be present in the response + pool_names = { + pool.name + for node_set in response.node_sets + for pool in node_set.pools + } + self.assertEqual(pool_names, {'pool-a', 'pool-b', 'pool-c'}) + + for pool_name in ('pool-a', 'pool-b', 'pool-c'): + usage = get_pool_usage(response, pool_name) + self.assertEqual(usage.total_capacity, '0') + self.assertEqual(usage.total_free, '0') + self.assertEqual(usage.quota_limit, '0') + def test_no_tasks(self): """No running tasks should show zero usage.""" pool_configs = {'pool-a': make_pool_config('pool-a', 'k8s-1')} diff --git a/src/service/core/workflow/tests/test_redact_secrets.py b/src/service/core/workflow/tests/test_redact_secrets.py new file mode 100644 index 000000000..5351c6dea --- /dev/null +++ b/src/service/core/workflow/tests/test_redact_secrets.py @@ -0,0 +1,98 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +SPDX-License-Identifier: Apache-2.0 +""" +import base64 +import textwrap +import unittest + +from src.service.core.workflow.workflow_service import redact_secrets + + +# The AWS keys used below are the well-known example credentials from the AWS documentation +# and pose no security risk. +_AWS_ACCESS_KEY = 'AKIAIOSFODNN7EXAMPLE' +_AWS_SECRET_KEY = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + +_SPEC_WITH_SECRETS = textwrap.dedent(f'''\ + workflow: + name: "test" + resources: + default: + gpu: 0 + cpu: 1 + storage: 1Gi + memory: 1Gi + tasks: + - name: task + image: amazon/aws-cli + command: [bash] + args: [/tmp/run.sh] + files: + - path: /tmp/run.sh + contents: | + AWS_ACCESS_KEY_ID={_AWS_ACCESS_KEY} AWS_SECRET_ACCESS_KEY={_AWS_SECRET_KEY} aws s3 cp s3://testbucket +''') + + +def _redact(spec: str) -> str: + return ''.join(redact_secrets(spec.splitlines(keepends=True))) + + +class TestRedactSecretsPlaintext(unittest.TestCase): + """redact_secrets correctly handles plaintext key=value secrets.""" + + def test_redacts_aws_access_key_id(self): + redacted = _redact(_SPEC_WITH_SECRETS) + self.assertNotIn(_AWS_ACCESS_KEY, redacted) + self.assertIn('[MASKED]', redacted) + + def test_redacts_aws_secret_access_key(self): + redacted = _redact(_SPEC_WITH_SECRETS) + self.assertNotIn(_AWS_SECRET_KEY, redacted) + self.assertIn('[MASKED]', redacted) + + def test_preserves_non_secret_content(self): + redacted = _redact(_SPEC_WITH_SECRETS) + self.assertIn('name: "test"', redacted) + self.assertIn('image: amazon/aws-cli', redacted) + self.assertIn('s3://testbucket', redacted) + + +class TestRedactSecretsBase64(unittest.TestCase): + """redact_secrets detects and redacts secrets hidden inside base64 blobs.""" + + def test_redacts_base64_encoded_secret(self): + encoded = base64.b64encode(f'AWS_ACCESS_KEY_ID={_AWS_ACCESS_KEY}'.encode()).decode() + spec = f'workflow:\n name: test\n config: {encoded}\n' + + redacted = _redact(spec) + + self.assertNotIn(encoded, redacted) + self.assertIn('[MASKED]', redacted) + + def test_leaves_safe_base64_untouched(self): + safe_text = 'this is completely safe content with no credentials at all' + encoded = base64.b64encode(safe_text.encode()).decode() + spec = f'workflow:\n name: test\n data: {encoded}\n' + + redacted = _redact(spec) + + self.assertIn(encoded, redacted) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/service/core/workflow/workflow_service.py b/src/service/core/workflow/workflow_service.py index 1d6179789..5c2035d1f 100644 --- a/src/service/core/workflow/workflow_service.py +++ b/src/service/core/workflow/workflow_service.py @@ -17,6 +17,7 @@ SPDX-License-Identifier: Apache-2.0 """ +import base64 import collections import dataclasses import datetime @@ -25,7 +26,7 @@ import json import logging import re -from typing import Any, AsyncGenerator, Dict, Generator, List, Optional +from typing import Any, AsyncGenerator, Dict, Generator, Iterable, List, Optional import urllib.parse import yaml @@ -48,6 +49,48 @@ FETCH_TASK_LIMIT = 1000 +# Regex to match secrets in the spec. While this is not a perfect solution, it solves the majority +# of cases. +# Regex from: https://lookingatcomputer.substack.com/p/regex-is-almost-all-you-need +# Proper secret management: +# https://nvidia.github.io/OSMO/main/user_guide/getting_started/credentials.html +SECRET_REDACTION_RE = re.compile( + r'''(?i)[\w.-]{0,50}?(?:access|auth|(?-i:[Aa]pi|API)|credential|creds|key|passw(?:or)?d|secret|token)(?:[ \t\w.-]{0,20})[\s'"]{0,3}(?:=|>|:{1,3}=|\|\||:|=>|\?=|,)[\x60'"\s=]{0,5}([\w.=-]{10,150}|[a-z0-9][a-z0-9+/]{11,}={0,3})(?:[\x60'"\s;]|\\[nr]|$)''' # pylint: disable=line-too-long +) + +# Matches base64-encoded fragments: at least 16 chars of base64 alphabet with optional padding, +# not adjacent to other base64 characters (to capture complete tokens). +_BASE64_FRAGMENT_RE = re.compile(r'(? Generator[str, None, None]: + """ Yield lines with secrets in the spec redacted. """ + def redact_base64_fragments(line: str) -> str: + """ + Find base64-encoded fragments in a line, decode them, redact any secrets found inside, + and replace the whole fragment with [MASKED]. + """ + def replace_if_secrets(m: re.Match) -> str: + fragment = m.group(0) + try: + padded = fragment + '=' * (-len(fragment) % 4) + decoded = base64.b64decode(padded, validate=True).decode('utf-8') + except (ValueError, UnicodeDecodeError): + return fragment + redacted = SECRET_REDACTION_RE.sub( + lambda sm: sm.group(0).replace(sm.group(1), '[MASKED]'), + decoded, + ) + if redacted == decoded: + return fragment + return '[MASKED]' + return _BASE64_FRAGMENT_RE.sub(replace_if_secrets, line) + + for line in lines: + line = redact_base64_fragments(line) + yield SECRET_REDACTION_RE.sub( + lambda m: m.group(0).replace(m.group(1), '[MASKED]'), line) + class ActionType(enum.Enum): EXEC = 'exec' @@ -210,7 +253,11 @@ def calculate_pool_quotas( merged_nodes: frozenset[str] = frozenset().union( *(node_sets[pool].nodes for pool in component_pools) ) - pools_by_nodeset[NodeSet(start_nodeset.backend, merged_nodes)] = component_pools + nodeset_key = NodeSet(start_nodeset.backend, merged_nodes) + if nodeset_key in pools_by_nodeset: + pools_by_nodeset[nodeset_key].extend(component_pools) + else: + pools_by_nodeset[nodeset_key] = component_pools gpu_usage_by_nodeset = { nodeset: sum((node_gpu_usage.get(node, NodeGpuUsage()) for node in nodeset.nodes), @@ -881,8 +928,10 @@ def download_workflow_spec(workflow_id: str, use_template: bool = False): @router.get('/api/workflow/{name}/spec', response_class=fastapi.responses.PlainTextResponse) def get_workflow_spec(name: str, use_template: bool = False) -> Any: """ Returns the workflow spec. """ - return fastapi.responses.StreamingResponse(download_workflow_spec(name, use_template)) - + return fastapi.responses.StreamingResponse( + redact_secrets(download_workflow_spec(name, use_template)), + media_type='application/yaml' + ) @router.post('/api/workflow/{name}/tag') def tag_workflow(name: str, diff --git a/src/ui/CLAUDE.md b/src/ui/CLAUDE.md index 163180517..de492339e 100644 --- a/src/ui/CLAUDE.md +++ b/src/ui/CLAUDE.md @@ -22,6 +22,52 @@ cd external/src/ui && pnpm type-check && pnpm lint && pnpm test --run pnpm format ``` +## Reasoning Integrity: Avoiding Systematic Failure Modes + +These principles guard against how Claude tends to reason incorrectly about code changes. + +### Fix at the source — never by convergence + +When two things are inconsistent, identify which is *correct* and fix the other. Do not flatten both to a common (often weaker) state to create false consistency. + +```text +❌ A strips time from a datetime-local input → downgrade input to type="date" for "consistency" +✅ A strips time from a datetime-local input → remove the stripping; the datetime was correct +``` + +**The pattern to catch:** "These two things are inconsistent, so I'll make them both match the simpler one." This is always wrong. One side is the bug; find it. + +### Capability loss is a regression — always + +Reducing precision, expressiveness, or user capability requires explicit user approval even when it creates consistency or simplifies the implementation. This includes: + +- Input type downgrades (`datetime-local` → `date`, `number` → `text`) +- Data truncation (full ISO datetime → date-only string, float → int) +- Feature removal (range picker → single value, multi-select → single-select) +- API parameter removal or narrowing + +If the rationale for a change involves "simpler" at the cost of capability, stop and ask. + +### Apply the reversal test before citing evidence + +Before using a fact to justify a change, ask: *"Does this same evidence equally support the opposite conclusion?"* + +```text +Fact: backend accepts datetime.datetime +→ Wrong: "date-only strings work too, so use type='date'" +→ Right: "the backend supports full precision — use datetime-local and pass full timestamps" +``` + +If evidence supports both a conclusion and its opposite, it is not justifying the change — it is post-hoc rationalization. Recognizing this pattern should trigger a full re-examination of the decision. + +### Design intent vs. implementation bug: assume capability when uncertain + +When implementation looks inconsistent (e.g., `datetime-local` input but time is then stripped), one side is correct intent and the other is the bug. **Default assumption: the richer/more capable side is the intent, the lossy transformation is the bug.** If genuinely uncertain, ask the user — do not silently resolve the ambiguity by picking the simpler option. + +### Post-hoc rationalization: when evidence confirms a prior decision, be suspicious + +Evidence found *after* a decision that *perfectly supports* it is a red flag. When you notice you are building a case for something already decided, explicitly argue the opposite before proceeding. + ## Development Commands ```bash @@ -51,7 +97,7 @@ pnpm generate-api # Regenerate from backend OpenAPI spec ## Architecture: The Critical Layer Pattern -``` +```text Page → Headless Hook → Adapter Hook → Generated API → Backend ↓ Themed Components diff --git a/src/ui/next.config.ts b/src/ui/next.config.ts index 630f1f456..d65685c4e 100644 --- a/src/ui/next.config.ts +++ b/src/ui/next.config.ts @@ -204,6 +204,9 @@ const nextConfig: NextConfig = { // Dataset file proxy route - alias to production version (zero mock code) "@/app/proxy/dataset/file/route.impl": "@/app/proxy/dataset/file/route.impl.production", + + // Auth server utilities - alias to production version (zero env fallbacks) + "@/lib/auth/server": "@/lib/auth/server.production", } : {}, }, diff --git a/src/ui/package.json b/src/ui/package.json index 89085889b..42f3a8c6c 100644 --- a/src/ui/package.json +++ b/src/ui/package.json @@ -65,7 +65,7 @@ "@replit/codemirror-indentation-markers": "^6.5.3", "@tanstack/react-query": "^5.90.21", "@tanstack/react-table": "^8.21.3", - "@tanstack/react-virtual": "3.13.12", + "@tanstack/react-virtual": "^3.13.21", "@uiw/react-codemirror": "^4.25.4", "@use-gesture/react": "^10.3.1", "@xterm/addon-fit": "^0.11.0", diff --git a/src/ui/pnpm-lock.yaml b/src/ui/pnpm-lock.yaml index 47ff7bbf0..f4b8bd662 100644 --- a/src/ui/pnpm-lock.yaml +++ b/src/ui/pnpm-lock.yaml @@ -108,8 +108,8 @@ importers: specifier: ^8.21.3 version: 8.21.3(react-dom@19.2.3(react@19.2.3))(react@19.2.3) '@tanstack/react-virtual': - specifier: 3.13.12 - version: 3.13.12(react-dom@19.2.3(react@19.2.3))(react@19.2.3) + specifier: ^3.13.21 + version: 3.13.21(react-dom@19.2.3(react@19.2.3))(react@19.2.3) '@uiw/react-codemirror': specifier: ^4.25.4 version: 4.25.4(@babel/runtime@7.28.6)(@codemirror/autocomplete@6.20.0)(@codemirror/language@6.12.1)(@codemirror/lint@6.9.4)(@codemirror/search@6.6.0)(@codemirror/state@6.5.4)(@codemirror/theme-one-dark@6.1.3)(@codemirror/view@6.39.15)(codemirror@6.0.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3) @@ -2052,8 +2052,8 @@ packages: react: '>=16.8' react-dom: '>=16.8' - '@tanstack/react-virtual@3.13.12': - resolution: {integrity: sha512-Gd13QdxPSukP8ZrkbgS2RwoZseTTbQPLnQEn7HY/rqtM+8Zt95f7xKC7N0EsKs7aoz0WzZ+fditZux+F8EzYxA==} + '@tanstack/react-virtual@3.13.21': + resolution: {integrity: sha512-SYXFrmrbPgXBvf+HsOsKhFgqSe4M6B29VHOsX9Jih9TlNkNkDWx0hWMiMLUghMEzyUz772ndzdEeCEBx+3GIZw==} peerDependencies: react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 react-dom: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 @@ -2062,8 +2062,8 @@ packages: resolution: {integrity: sha512-ldZXEhOBb8Is7xLs01fR3YEc3DERiz5silj8tnGkFZytt1abEvl/GhUmCE0PMLaMPTa3Jk4HbKmRlHmu+gCftg==} engines: {node: '>=12'} - '@tanstack/virtual-core@3.13.12': - resolution: {integrity: sha512-1YBOJfRHV4sXUmWsFSf5rQor4Ss82G8dQWLRbnk3GA4jeP8hQt1hxXh0tmflpC0dz3VgEv/1+qwPyLeWkQuPFA==} + '@tanstack/virtual-core@3.13.21': + resolution: {integrity: sha512-ww+fmLHyCbPSf7JNbWZP3g7wl6SdNo3ah5Aiw+0e9FDErkVHLKprYUrwTm7dF646FtEkN/KkAKPYezxpmvOjxw==} '@tybys/wasm-util@0.10.1': resolution: {integrity: sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg==} @@ -6972,15 +6972,15 @@ snapshots: react: 19.2.3 react-dom: 19.2.3(react@19.2.3) - '@tanstack/react-virtual@3.13.12(react-dom@19.2.3(react@19.2.3))(react@19.2.3)': + '@tanstack/react-virtual@3.13.21(react-dom@19.2.3(react@19.2.3))(react@19.2.3)': dependencies: - '@tanstack/virtual-core': 3.13.12 + '@tanstack/virtual-core': 3.13.21 react: 19.2.3 react-dom: 19.2.3(react@19.2.3) '@tanstack/table-core@8.21.3': {} - '@tanstack/virtual-core@3.13.12': {} + '@tanstack/virtual-core@3.13.21': {} '@tybys/wasm-util@0.10.1': dependencies: diff --git a/src/ui/src/app/(dashboard)/occupancy/occupancy-page-content.tsx b/src/ui/src/app/(dashboard)/occupancy/occupancy-page-content.tsx index be7cabacf..1f57ea943 100644 --- a/src/ui/src/app/(dashboard)/occupancy/occupancy-page-content.tsx +++ b/src/ui/src/app/(dashboard)/occupancy/occupancy-page-content.tsx @@ -14,15 +14,6 @@ //SPDX-License-Identifier: Apache-2.0 -/** - * Occupancy Page Content (Client Component) - * - * Layout: Toolbar → Summary cards → Collapsible-row table - * - * Data source: GET /api/task?summary=true → aggregated by user or pool. - * All aggregation is client-side (shim) until backend ships group_by pagination (Issue #23). - */ - "use client"; import { useMemo, useCallback, useState } from "react"; @@ -30,7 +21,8 @@ import { useQueryState, parseAsStringLiteral } from "nuqs"; import { InlineErrorBoundary } from "@/components/error/inline-error-boundary"; import { usePage } from "@/components/chrome/page-context"; import { useResultsCount } from "@/components/filter-bar/hooks/use-results-count"; -import { useUrlChips } from "@/components/filter-bar/hooks/use-url-chips"; +import { useDefaultFilter } from "@/components/filter-bar/hooks/use-default-filter"; +import { TASK_STATE_CATEGORIES } from "@/lib/task-group-status-presets"; import { OccupancyToolbar } from "@/features/occupancy/components/occupancy-toolbar"; import { OccupancySummary } from "@/features/occupancy/components/occupancy-summary"; import { OccupancyDataTable } from "@/features/occupancy/components/occupancy-data-table"; @@ -38,43 +30,26 @@ import { useOccupancyData } from "@/features/occupancy/hooks/use-occupancy-data" import { useOccupancyTableStore } from "@/features/occupancy/stores/occupancy-table-store"; import type { OccupancyGroupBy, OccupancySortBy } from "@/lib/api/adapter/occupancy"; -// ============================================================================= -// GroupBy parser for URL state -// ============================================================================= - -const GROUP_BY_VALUES = ["user", "pool"] as const; +const GROUP_BY_VALUES = ["pool", "user"] as const; const parseAsGroupBy = parseAsStringLiteral(GROUP_BY_VALUES); -// ============================================================================= -// Component -// ============================================================================= - export function OccupancyPageContent() { usePage({ title: "Occupancy" }); - // ========================================================================== - // URL State - // ========================================================================== - const [groupBy, setGroupBy] = useQueryState( "groupBy", - parseAsGroupBy.withDefault("user").withOptions({ shallow: true, history: "replace", clearOnDefault: true }), + parseAsGroupBy.withDefault("pool").withOptions({ shallow: true, history: "replace", clearOnDefault: true }), ); - const { searchChips, setSearchChips } = useUrlChips(); - - // ========================================================================== - // Sort state from table store - // ========================================================================== + const { effectiveChips: searchChips, handleChipsChange: setSearchChips } = useDefaultFilter({ + field: "status", + defaultValue: TASK_STATE_CATEGORIES.running, + }); const sortState = useOccupancyTableStore((s) => s.sort); const sortBy: OccupancySortBy = (sortState?.column as OccupancySortBy) ?? "gpu"; const order: "asc" | "desc" = sortState?.direction ?? "desc"; - // ========================================================================== - // Data - // ========================================================================== - const { groups, totals, isLoading, error, refetch, truncated } = useOccupancyData({ groupBy, sortBy, @@ -82,10 +57,6 @@ export function OccupancyPageContent() { searchChips, }); - // ========================================================================== - // Toolbar props - // ========================================================================== - const resultsCount = useResultsCount({ total: groups.length, filteredTotal: groups.length, @@ -99,11 +70,7 @@ export function OccupancyPageContent() { [setGroupBy], ); - // ========================================================================== - // Expand/collapse state — lifted here so toolbar can drive expand-all/collapse-all. - // Reset when groupBy changes (stale keys from old view are meaningless). - // ========================================================================== - + // Expand/collapse state resets when groupBy changes (stale keys are meaningless) const [expandedState, setExpandedState] = useState<{ groupBy: OccupancyGroupBy; keys: Set }>({ groupBy, keys: new Set(), @@ -136,13 +103,8 @@ export function OccupancyPageContent() { const allExpanded = groups.length > 0 && expandedKeys.size === groups.length; - // ========================================================================== - // Render - // ========================================================================== - return (
- {/* Toolbar */}
- {/* KPI summary cards */}
- {/* Scale limit warning */} {truncated && (
Results may be incomplete — reached the 10,000 row fetch limit. Backend group_by pagination (Issue #23) is @@ -185,7 +145,6 @@ export function OccupancyPageContent() {
)} - {/* Main table */}
{ onRowDoubleClick?: (row: TData) => void; /** For middle-click: returns URL for new tab, or undefined to call onRowClick */ getRowHref?: (row: TData) => string | undefined; + /** Native tooltip (title attribute) for a row, shown on hover */ + getRowTitle?: (row: TData) => string | undefined; selectedRowId?: string; /** Override default header cell padding/styling for all columns (default: "px-4 py-3") */ headerClassName?: string; /** Extra classes applied to the element (e.g. "file-browser-thead" for a shadow-based bottom divider) */ theadClassName?: string; - rowClassName?: string | ((item: TData) => string); + rowClassName?: string | ((item: TData, index: number) => string); sectionClassName?: string | ((section: Section) => string); + /** Whether a given row should show hover/pointer styles. Defaults to !!onRowClick for all rows. */ + isRowInteractive?: (row: TData) => boolean; columnSizeConfigs?: readonly ColumnSizeConfig[]; columnSizingPreferences?: ColumnSizingPreferences; onColumnSizingPreferenceChange?: (columnId: string, preference: ColumnSizingPreference) => void; @@ -128,7 +132,6 @@ function DataTableInner({ onLoadMore, isFetchingNextPage, totalCount, - // Row heights use canonical constants from @/lib/config rowHeight = TABLE_ROW_HEIGHTS.NORMAL, sectionHeight = TABLE_ROW_HEIGHTS.SECTION, className, @@ -140,6 +143,7 @@ function DataTableInner({ onFocusedRowChange, onRowDoubleClick, getRowHref, + getRowTitle, selectedRowId, headerClassName: tableHeaderClassName, theadClassName, @@ -151,6 +155,7 @@ function DataTableInner({ suspendResize, resizeCompleteEvent, registerLayoutStableCallback, + isRowInteractive, }: DataTableProps) { const scrollRef = useRef(null); const tableElementRef = useRef(null); @@ -178,7 +183,6 @@ function DataTableInner({ return columns .map((c) => { if (typeof c.id === "string") return c.id; - // AccessorKeyColumnDef has accessorKey property if ("accessorKey" in c && c.accessorKey) return String(c.accessorKey); return ""; }) @@ -199,38 +203,20 @@ function DataTableInner({ const visibleColumnCount = visibleColumnIds.length; - const columnMinSizes = useMemo(() => { - const sizes: Record = {}; - for (const col of columns) { - const colId = col.id ?? ("accessorKey" in col && col.accessorKey ? String(col.accessorKey) : ""); - if (colId && col.minSize != null) { - sizes[colId] = col.minSize; - } - } - return sizes; - }, [columns]); + const { columnMinSizes, columnInitialSizes, columnResizability } = useMemo(() => { + const mins: Record = {}; + const initials: Record = {}; + const resizability: Record = {}; - const columnInitialSizes = useMemo(() => { - const sizes: Record = {}; for (const col of columns) { const colId = col.id ?? ("accessorKey" in col && col.accessorKey ? String(col.accessorKey) : ""); - if (colId && col.size != null) { - sizes[colId] = col.size; - } + if (!colId) continue; + if (col.minSize != null) mins[colId] = col.minSize; + if (col.size != null) initials[colId] = col.size; + resizability[colId] = col.enableResizing !== false; } - return sizes; - }, [columns]); - const columnResizability = useMemo(() => { - const resizability: Record = {}; - for (const col of columns) { - const colId = col.id ?? ("accessorKey" in col && col.accessorKey ? String(col.accessorKey) : ""); - if (colId) { - // enableResizing defaults to true if not specified - resizability[colId] = col.enableResizing !== false; - } - } - return resizability; + return { columnMinSizes: mins, columnInitialSizes: initials, columnResizability: resizability }; }, [columns]); const showSkeleton = isLoading && allItems.length === 0; @@ -252,19 +238,34 @@ function DataTableInner({ registerLayoutStableCallback, }); - // Track previous data length to detect empty → populated transitions + // Toggle `is-scrolling` class to suppress row-position transitions during scroll. + // Removed 150ms after the last scroll event so expand/collapse animations work. + useEffect(() => { + const scrollEl = scrollRef.current; + if (!scrollEl) return; + let timeoutId: ReturnType; + const onScroll = () => { + scrollEl.classList.add("is-scrolling"); + clearTimeout(timeoutId); + timeoutId = setTimeout(() => scrollEl.classList.remove("is-scrolling"), 150); + }; + scrollEl.addEventListener("scroll", onScroll, { passive: true }); + return () => { + scrollEl.removeEventListener("scroll", onScroll); + clearTimeout(timeoutId); + }; + }, []); + + // Recalculate column widths when data first arrives (empty -> populated) const prevDataLength = usePrevious(allItems.length); - - // When transitioning from empty (0 items) to populated (>0 items), - // trigger column recalculation to ensure columns fill available space + const { recalculate } = columnSizingHook; useEffect(() => { if (prevDataLength === 0 && allItems.length > 0) { - // Use RAF to ensure DOM has updated before recalculating requestAnimationFrame(() => { - columnSizingHook.recalculate(); + recalculate(); }); } - }, [prevDataLength, allItems.length, columnSizingHook]); + }, [prevDataLength, allItems.length, recalculate]); // eslint-disable-next-line react-hooks/incompatible-library -- TanStack Table returns unstable functions by design const table = useReactTable({ @@ -316,7 +317,6 @@ function DataTableInner({ useVirtualizedTable({ items: sections ? undefined : data, sections, - getRowId, scrollRef, rowHeight, sectionHeight, @@ -369,7 +369,9 @@ function DataTableInner({ const rowNavigation = useRowNavigation({ rowCount: virtualItemCount, - visibleRowCount: Math.floor(600 / rowHeight), + visibleRowCount: scrollRef.current + ? Math.max(1, Math.floor(scrollRef.current.clientHeight / rowHeight)) + : Math.floor(600 / rowHeight), onRowActivate: useCallback( (virtualIndex: number) => { const item = getItem(virtualIndex); @@ -511,7 +513,6 @@ function DataTableInner({ const colIndex = headerIndex + 1; - // Get custom header className from column meta (dependency injection) const headerClassName = header.column.columnDef.meta?.headerClassName; if (isFixed) { @@ -567,6 +568,7 @@ function DataTableInner({ onRowClick={onRowClick} onRowDoubleClick={onRowDoubleClick} getRowHref={getRowHref} + getRowTitle={getRowTitle} selectedRowId={selectedRowId} getRowId={getRowId} rowClassName={rowClassName} @@ -577,6 +579,7 @@ function DataTableInner({ onRowKeyDown={rowNavigation.handleRowKeyDown} measureElement={measureElement} compact={compact} + isRowInteractive={isRowInteractive} /> @@ -635,5 +638,4 @@ function DataTableInner({ ); } -// Memoize with generic type preservation export const DataTable = memo(DataTableInner) as typeof DataTableInner; diff --git a/src/ui/src/components/data-table/hooks/use-column-sizing.ts b/src/ui/src/components/data-table/hooks/use-column-sizing.ts index 01d7f8d3d..43a5ca9dc 100644 --- a/src/ui/src/components/data-table/hooks/use-column-sizing.ts +++ b/src/ui/src/components/data-table/hooks/use-column-sizing.ts @@ -269,6 +269,8 @@ export function useColumnSizing({ // Track pending idle callback for cleanup const pendingIdleCallbackRef = useRef | null>(null); + // Track which scheduler was used so cleanup uses the correct cancel API + const useRicRef = useRef(typeof requestIdleCallback !== "undefined"); // Remeasure when a column becomes NO_TRUNCATE useEffect(() => { @@ -305,7 +307,11 @@ export function useColumnSizing({ } }; - if (typeof requestIdleCallback !== "undefined") { + // Capture the scheduler flag at effect setup time so the cleanup function + // can use the same value (lint: ref values may change by cleanup time). + const useRic = useRicRef.current; + + if (useRic) { pendingIdleCallbackRef.current = requestIdleCallback(measureNewColumns, { timeout: 500 }); } else { // Safari fallback: Use RAF to ensure DOM measurement happens at optimal time @@ -317,13 +323,11 @@ export function useColumnSizing({ return () => { if (pendingIdleCallbackRef.current !== null) { - if (typeof cancelIdleCallback !== "undefined" && typeof pendingIdleCallbackRef.current === "number") { - cancelIdleCallback(pendingIdleCallbackRef.current); - } else if (typeof cancelAnimationFrame !== "undefined") { + if (useRic) { + cancelIdleCallback(pendingIdleCallbackRef.current as number); + } else { // Safari fallback uses RAF, so cancel with cancelAnimationFrame cancelAnimationFrame(pendingIdleCallbackRef.current as number); - } else { - clearTimeout(pendingIdleCallbackRef.current as ReturnType); } pendingIdleCallbackRef.current = null; } diff --git a/src/ui/src/components/data-table/hooks/use-virtualized-table.ts b/src/ui/src/components/data-table/hooks/use-virtualized-table.ts index 3a553f8ee..f9c636ab3 100644 --- a/src/ui/src/components/data-table/hooks/use-virtualized-table.ts +++ b/src/ui/src/components/data-table/hooks/use-virtualized-table.ts @@ -14,28 +14,14 @@ // // SPDX-License-Identifier: Apache-2.0 -/** - * Hook for virtualized table rendering. - * - * Wraps TanStack Virtual to work with native elements. - * Provides row indices, positions, and infinite scroll detection. - * - * Note: Requires @tanstack/react-virtual 3.13.12 (not 3.13.13+) to avoid - * flushSync warnings during render. See https://github.com/TanStack/virtual/issues/1094 - */ - "use client"; -import { useCallback, useEffect, useMemo, useRef } from "react"; -import { useVirtualizer } from "@tanstack/react-virtual"; +import { useCallback, useEffect, useMemo, useReducer, useRef } from "react"; +import { useVirtualizer, type Virtualizer } from "@tanstack/react-virtual"; import { useSyncedRef, usePrevious } from "@react-hookz/web"; import type { Section } from "@/components/data-table/types"; import { VirtualItemTypes } from "@/components/data-table/constants"; -// ============================================================================= -// Types -// ============================================================================= - export interface VirtualizedRow { index: number; start: number; @@ -48,10 +34,8 @@ export interface UseVirtualizedTableOptions { items?: T[]; /** Sectioned data (mutually exclusive with items) */ sections?: Section[]; - getRowId: (item: T) => string; scrollRef: React.RefObject; rowHeight: number; - /** Section header height (required if using sections) */ sectionHeight?: number; overscan?: number; hasNextPage?: boolean; @@ -62,9 +46,7 @@ export interface UseVirtualizedTableOptions { export interface UseVirtualizedTableResult { virtualRows: VirtualizedRow[]; totalHeight: number; - /** Total data row count (excluding section headers, for aria-rowcount) */ totalRowCount: number; - /** Total virtual item count (sections + data rows, for navigation) */ virtualItemCount: number; getItem: ( index: number, @@ -74,18 +56,12 @@ export interface UseVirtualizedTableResult { | null; measure: () => void; scrollToIndex: (index: number, options?: { align?: "start" | "center" | "end" | "auto" }) => void; - /** Ref callback for dynamic row measurement - attach to row elements */ measureElement: (node: Element | null) => void; } -// ============================================================================= -// Hook -// ============================================================================= - export function useVirtualizedTable({ items, sections, - getRowId, scrollRef, rowHeight, sectionHeight = 36, @@ -94,7 +70,6 @@ export function useVirtualizedTable({ onLoadMore, isFetchingNextPage = false, }: UseVirtualizedTableOptions): UseVirtualizedTableResult { - // Build flat list of virtual items (sections + rows or just rows) const virtualItems = useMemo(() => { if (sections && sections.length > 0) { const result: Array< @@ -103,8 +78,7 @@ export function useVirtualizedTable({ > = []; for (const section of sections) { - // Use height 0 for sections with skipGroupRow (e.g., single-task groups) - // so they don't allocate vertical space in the virtual list + // Height 0 for skipped headers so they don't allocate vertical space const metadata = section.metadata as { skipGroupRow?: boolean } | undefined; const shouldSkipHeader = metadata?.skipGroupRow === true; const headerHeight = shouldSkipHeader ? 0 : sectionHeight; @@ -129,31 +103,30 @@ export function useVirtualizedTable({ return []; }, [items, sections, rowHeight, sectionHeight]); - // Stable refs for accessing changing data in stable callbacks - // Note: Use useSyncedRef (not useEventCallback) for getRowId because it's called - // during render by getItemKey. useEventCallback throws when called during render. const virtualItemsRef = useSyncedRef(virtualItems); - const getRowIdRef = useSyncedRef(getRowId); + const virtualizerRef = useRef | null>(null); + + // Dispatched from TV's onChange(sync=false); captured by flushSync in makeRowRef + // to commit row positions synchronously before paint. + const [, forceSyncTick] = useReducer((x: number) => x + 1, 0); - // Estimate size function - stable callback using ref const estimateSize = useCallback( (index: number) => virtualItemsRef.current[index]?.height ?? rowHeight, [virtualItemsRef, rowHeight], ); - // Get item key - stable callback using refs - // Called during render by virtualizer, so must use refs (not useEventCallback) + // Uses virtual index (not item ID) so duplicate items (e.g. a resource in + // multiple pools) get independent entries in TV's itemSizeCache. const getItemKey = useCallback( (index: number) => { const item = virtualItemsRef.current[index]; if (!item) return index; if (item.type === VirtualItemTypes.SECTION) return `section-${item.section.id}`; - return getRowIdRef.current(item.item); + return index; }, - [virtualItemsRef, getRowIdRef], + [virtualItemsRef], ); - // Create virtualizer with stable callbacks // eslint-disable-next-line react-hooks/incompatible-library -- TanStack Virtual returns unstable functions by design. React Compiler skips optimization. See: https://github.com/facebook/react/issues/33057 const virtualizer = useVirtualizer({ count: virtualItems.length, @@ -161,30 +134,36 @@ export function useVirtualizedTable({ estimateSize, overscan, getItemKey, + onChange: (_instance, sync) => { + // Only for resize events (sync=false), not scroll updates + if (!sync) { + forceSyncTick(); + } + }, }); - // Re-measure when heights change + virtualizerRef.current = virtualizer; + + const measure = useCallback(() => { + virtualizerRef.current?.measure(); + }, []); + useEffect(() => { virtualizer.measure(); }, [rowHeight, sectionHeight, virtualizer]); - // Get virtual items - TanStack Virtual handles reactivity internally - const virtualRows: VirtualizedRow[] = virtualizer.getVirtualItems().map((row) => ({ + const rawVirtualItems = virtualizer.getVirtualItems(); + const virtualRows: VirtualizedRow[] = rawVirtualItems.map((row) => ({ index: row.index, start: row.start, size: row.size, key: String(row.key), })); - // Stable ref for optional load more callback const onLoadMoreRef = useSyncedRef(onLoadMore); - - // Track if we've already triggered load more to prevent duplicate calls const loadMoreTriggeredRef = useRef(false); - // Reset the trigger flag when new data arrives (item count changes). - // This single mechanism works for both slow network responses and instant - // cached responses, making the logic agnostic of data source timing. + // Reset when new data arrives so another page can be requested const currentItemCount = items?.length ?? 0; const prevItemCount = usePrevious(currentItemCount); useEffect(() => { @@ -199,20 +178,16 @@ export function useVirtualizedTable({ const scrollElement = scrollRef.current; if (!scrollElement) return; - // Track if effect is still active to prevent stale callback execution let isActive = true; const checkLoadMore = () => { - // Guard against stale execution after unmount if (!isActive) return; - if (loadMoreTriggeredRef.current) return; const rows = virtualizer.getVirtualItems(); const lastRow = rows.at(-1); if (!lastRow) return; - // Trigger load when within threshold items of end (balances UX with network efficiency) const LOAD_MORE_THRESHOLD = 10; if (lastRow.index >= virtualItems.length - LOAD_MORE_THRESHOLD) { loadMoreTriggeredRef.current = true; @@ -222,7 +197,6 @@ export function useVirtualizedTable({ scrollElement.addEventListener("scroll", checkLoadMore, { passive: true }); - // Initial check after layout settles (100ms is typical for initial render) const INITIAL_CHECK_DELAY_MS = 100; const timeoutId = setTimeout(checkLoadMore, INITIAL_CHECK_DELAY_MS); @@ -233,7 +207,6 @@ export function useVirtualizedTable({ }; }, [scrollRef, virtualItems.length, hasNextPage, isFetchingNextPage, virtualizer, onLoadMoreRef]); - // Get item for a virtual row index const getItem = useCallback( (index: number) => { const item = virtualItems[index]; @@ -246,7 +219,6 @@ export function useVirtualizedTable({ [virtualItems], ); - // Count total data rows (excluding section headers) const totalRowCount = useMemo(() => { if (sections) { return sections.reduce((sum, s) => sum + s.items.length, 0); @@ -254,12 +226,11 @@ export function useVirtualizedTable({ return items?.length ?? 0; }, [items, sections]); - // Scroll to a specific index const scrollToIndex = useCallback( (index: number, options?: { align?: "start" | "center" | "end" | "auto" }) => { virtualizer.scrollToIndex(index, { align: options?.align ?? "auto", - behavior: "auto", // instant for keyboard nav + behavior: "auto", }); }, [virtualizer], @@ -271,7 +242,7 @@ export function useVirtualizedTable({ totalRowCount, virtualItemCount: virtualItems.length, getItem, - measure: () => virtualizer.measure(), + measure, scrollToIndex, measureElement: virtualizer.measureElement, }; diff --git a/src/ui/src/components/data-table/styles.css b/src/ui/src/components/data-table/styles.css index 0dca5213c..336a11311 100644 --- a/src/ui/src/components/data-table/styles.css +++ b/src/ui/src/components/data-table/styles.css @@ -57,6 +57,10 @@ align-items: stretch; will-change: transform; backface-visibility: hidden; + /* Smooth repositioning when a preceding row changes height (e.g. ExpandableChips expand). + Disabled during active scroll (.is-scrolling) to prevent virtualized rows from animating + as they enter the viewport — only expand/collapse-driven position changes are animated. */ + transition: transform var(--duration-normal) var(--ease-out); /* Subtle fade-in prevents jarring "pop" when rows appear during fast scroll */ animation: row-fade-in var(--duration-quick) var(--ease-out); @@ -101,6 +105,21 @@ padding: 0; } +/* Suppress position transition during active scroll — virtualized rows must snap to position + when entering/leaving the viewport. The .is-scrolling class is toggled by DataTable. */ +.is-scrolling .data-table-row { + transition: none; +} + +/* Interactive rows: pointer cursor + hover highlight */ +.data-table-row[data-interactive] { + cursor: pointer; +} + +.data-table-row[data-interactive]:hover { + @apply bg-zinc-50 dark:bg-zinc-900; +} + /* Focus state for group rows */ .data-table-section-row:focus-visible { @apply outline-ring z-1 outline-2; diff --git a/src/ui/src/components/data-table/virtual-table-body.tsx b/src/ui/src/components/data-table/virtual-table-body.tsx index 25c6150bb..758ebfe04 100644 --- a/src/ui/src/components/data-table/virtual-table-body.tsx +++ b/src/ui/src/components/data-table/virtual-table-body.tsx @@ -14,16 +14,11 @@ // // SPDX-License-Identifier: Apache-2.0 -/** - * VirtualTableBody - * - * Virtualized using native table elements. - * Renders only visible rows with absolute positioning for performance. - */ - "use client"; import { memo, useCallback, useRef } from "react"; +import { flushSync } from "react-dom"; +import { useSyncedRef } from "@react-hookz/web"; import { flexRender, type Row } from "@tanstack/react-table"; import { cn } from "@/lib/utils"; import type { VirtualizedRow } from "@/components/data-table/hooks/use-virtualized-table"; @@ -31,60 +26,34 @@ import type { Section } from "@/components/data-table/types"; import { getColumnCSSValue } from "@/components/data-table/utils/column-sizing"; import { VirtualItemTypes } from "@/components/data-table/constants"; -// ============================================================================= -// Types -// ============================================================================= - export interface VirtualTableBodyProps { - /** Virtual rows to render */ virtualRows: VirtualizedRow[]; - /** Total height of all rows */ totalHeight: number; - /** Get table row by virtual index */ getTableRow: (index: number) => Row | undefined; - /** Get item info by virtual index (for sections) */ getItem: ( index: number, ) => { type: "section"; section: Section } | { type: "row"; item: TData } | null; - /** Number of columns (for section header colSpan) */ columnCount: number; - /** Row click handler */ onRowClick?: (item: TData, index: number) => void; - /** Row double-click handler */ onRowDoubleClick?: (item: TData, index: number) => void; - /** - * Get the href for a row (if clicking navigates to a page). - * Used for middle-click behavior: - * - If getRowHref returns a URL → middle-click opens in new tab - * - If getRowHref returns undefined or is not provided → middle-click calls onRowClick (shows overlay) - */ + /** Returns URL for middle-click "open in new tab", or undefined to fall back to onRowClick */ getRowHref?: (item: TData) => string | undefined; - /** Selected row ID */ + /** Native tooltip (title attribute) for a row, shown on hover */ + getRowTitle?: (item: TData) => string | undefined; selectedRowId?: string; - /** Get row ID for comparison */ getRowId?: (item: TData) => string; - /** Custom row class name */ - rowClassName?: string | ((item: TData) => string); - /** Custom section row class name (for zebra striping, borders, etc.) */ + rowClassName?: string | ((item: TData, index: number) => string); sectionClassName?: string | ((section: Section) => string); - /** Render custom section header */ renderSectionHeader?: (section: Section) => React.ReactNode; - /** Get tabIndex for a row (roving tabindex pattern) */ getRowTabIndex?: (index: number) => 0 | -1; - /** Row focus handler */ onRowFocus?: (index: number) => void; - /** Row keydown handler */ onRowKeyDown?: (e: React.KeyboardEvent, index: number) => void; - /** Ref callback for dynamic row measurement */ measureElement?: (node: Element | null) => void; - /** Compact mode - reduces cell padding */ compact?: boolean; + /** Whether a given row should show hover/pointer styles. Defaults to !!onRowClick for all rows. */ + isRowInteractive?: (row: TData) => boolean; } -// ============================================================================= -// Component -// ============================================================================= - function VirtualTableBodyInner({ virtualRows, totalHeight, @@ -94,6 +63,7 @@ function VirtualTableBodyInner({ onRowClick, onRowDoubleClick, getRowHref, + getRowTitle, selectedRowId, getRowId, rowClassName, @@ -104,20 +74,32 @@ function VirtualTableBodyInner({ onRowKeyDown, measureElement, compact = false, + isRowInteractive, }: VirtualTableBodyProps) { - /** - * Stores the row item resolved at mousedown time (first click of a potential - * double-click). Used as a fallback in handleTbodyDoubleClick when the - * virtualizer has remounted rows between the first click and the dblclick - * event (e.g. because opening a slideout panel narrowed the table). - */ + // Fallback for dblclick when virtualizer remounts rows between clicks const lastMouseDownItemRef = useRef<{ item: TData; index: number } | null>(null); - /** - * Resolve row data from a delegated event target. - * Walks up the DOM to find the row element, parses its index, - * and returns the row item if it's a data row (not a section header). - */ + // Stable ref: TV recreates measureElement every render by design + const measureElementRef = useSyncedRef(measureElement); + + // Registers row with TV and attaches a supplemental ResizeObserver that + // wraps measureElement in flushSync. This captures TV's onChange(sync=false) + // dispatch synchronously, so row positions are correct before paint. + const makeRowRef = useCallback( + (node: HTMLTableRowElement | null): void | (() => void) => { + if (!node) return; + measureElementRef.current?.(node); + const observer = new ResizeObserver(() => { + flushSync(() => { + measureElementRef.current?.(node); + }); + }); + observer.observe(node, { box: "border-box" }); + return () => observer.disconnect(); + }, + [measureElementRef], + ); + const resolveRowItem = useCallback( (target: HTMLElement): { item: TData; index: number } | null => { const row = target.closest('[role="row"][data-index]'); @@ -134,13 +116,13 @@ function VirtualTableBodyInner({ (e: React.MouseEvent) => { if (!onRowClick && !getRowHref) return; - // When a double-click handler is registered, skip the second click - // (detail >= 2) of a multi-click sequence. Without this guard the second - // click fires onRowClick → startViewTransition a second time, and the - // resulting document.startViewTransition competes with the one triggered - // by the immediately-following dblclick event. The browser cannot - // reliably run two overlapping view-transition update callbacks, so the - // dblclick's router.push never executes. + // If the user dragged to select text, don't trigger navigation. + const selection = window.getSelection(); + if (selection && selection.toString().length > 0) return; + + // Skip the second click of a multi-click sequence when dblclick is handled, + // otherwise two competing startViewTransition calls cause the dblclick's + // router.push to never execute. if (onRowDoubleClick && e.detail >= 2) return; const resolved = resolveRowItem(e.target as HTMLElement); @@ -203,9 +185,6 @@ function VirtualTableBodyInner({ const handleTbodyMouseDown = useCallback( (e: React.MouseEvent) => { if (!onRowDoubleClick) return; - // Only capture on the first press of a potential double-click (detail=1). - // This is used as a fallback in handleTbodyDoubleClick in case the - // virtualizer remounts rows between the first click and the dblclick event. if (e.detail !== 1) return; lastMouseDownItemRef.current = resolveRowItem(e.target as HTMLElement); }, @@ -215,8 +194,7 @@ function VirtualTableBodyInner({ const handleTbodyDoubleClick = useCallback( (e: React.MouseEvent) => { if (!onRowDoubleClick) return; - // Fall back to the item captured at mousedown if the target element is no - // longer in the DOM (e.g. virtualizer remounted rows while panel opened). + // Fall back to mousedown capture if virtualizer remounted rows between clicks const resolved = resolveRowItem(e.target as HTMLElement) ?? lastMouseDownItemRef.current; if (!resolved) return; onRowDoubleClick(resolved.item, resolved.index); @@ -242,7 +220,6 @@ function VirtualTableBodyInner({ if (!item) return null; if (item.type === VirtualItemTypes.SECTION) { - // Render section header content const sectionContent = renderSectionHeader ? ( renderSectionHeader(item.section) ) : ( @@ -258,17 +235,14 @@ function VirtualTableBodyInner({ ); - // Skip rendering entire row if renderSectionHeader returns null - // (e.g., for single-task groups that don't need a section header) + // renderSectionHeader can return null to skip headers (e.g. single-item groups) if (sectionContent === null) { return null; } - // Calculate custom class name for section row (zebra striping, borders) const customSectionClassName = typeof sectionClassName === "function" ? sectionClassName(item.section) : sectionClassName; - // Use index in key to guarantee uniqueness in virtualized list return ( ({ className={cn("data-table-section-row sticky", customSectionClassName)} style={{ height: virtualRow.size, - // translate3d triggers GPU compositor layer for smoother animation transform: `translate3d(0, ${virtualRow.start}px, 0)`, }} > @@ -294,25 +267,26 @@ function VirtualTableBodyInner({ const rowId = getRowId?.(rowData); const isSelected = selectedRowId && rowId === selectedRowId; - const customClassName = typeof rowClassName === "function" ? rowClassName(rowData) : rowClassName; + const customClassName = + typeof rowClassName === "function" ? rowClassName(rowData, virtualRow.index) : rowClassName; - // Keyboard navigation support const tabIndex = getRowTabIndex?.(virtualRow.index) ?? (onRowClick ? 0 : undefined); + const isInteractive = isRowInteractive ? isRowInteractive(rowData) : !!onRowClick; - // Use virtual index in key to guarantee uniqueness even with duplicate data return ( ({ }} > {row.getVisibleCells().map((cell, cellIndex) => { - // Cache CSS variable string to avoid duplicate function calls const cssWidth = getColumnCSSValue(cell.column.id); - - // Get cell className from column meta (if provided). - // This allows columns to inject their styling requirements - // without VirtualTableBody needing specific knowledge of column types. const cellClassName = cell.column.columnDef.meta?.cellClassName; return ( @@ -339,13 +308,9 @@ function VirtualTableBodyInner({ style={{ width: cssWidth, minWidth: cssWidth, - flexShrink: 0, // Prevent shrinking below specified width + flexShrink: 0, }} - className={cn( - "flex items-center", - // Use custom className if provided, otherwise apply default padding - cellClassName ?? (compact ? "px-4 py-1.5" : "px-4 py-3"), - )} + className={cn("flex items-center", cellClassName ?? (compact ? "px-4 py-1.5" : "px-4 py-3"))} > {flexRender(cell.column.columnDef.cell, cell.getContext())} @@ -358,5 +323,4 @@ function VirtualTableBodyInner({ ); } -// Memo with generic support export const VirtualTableBody = memo(VirtualTableBodyInner) as typeof VirtualTableBodyInner; diff --git a/src/ui/src/components/filter-bar/filter-bar-date-picker.tsx b/src/ui/src/components/filter-bar/filter-bar-date-picker.tsx new file mode 100644 index 000000000..1fa9d6dfe --- /dev/null +++ b/src/ui/src/components/filter-bar/filter-bar-date-picker.tsx @@ -0,0 +1,219 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * FilterBarDatePicker - Date/range picker panel rendered inside the FilterBar dropdown. + * + * B1 "Split Rail" layout: preset list on the left with active indicator, + * stacked From/To date inputs on the right. + * + * Selecting a preset or applying custom dates calls onCommit(value) where value + * is either a preset label ("last 7 days"), a single ISO date ("2026-03-11"), + * or an ISO range ("2026-03-01..2026-03-11"). + */ + +"use client"; + +import { useState, useCallback, useMemo, memo, useRef, useEffect } from "react"; +import { DATE_RANGE_PRESETS } from "@/lib/date-range-utils"; +import { DATE_CUSTOM_FROM, DATE_CUSTOM_TO, DATE_CUSTOM_APPLY } from "@/components/filter-bar/lib/types"; +import { MONTHS_SHORT } from "@/lib/format-date"; + +interface FilterBarDatePickerProps { + /** Called when a date or range is committed. Value is preset label, ISO date, or ISO range. */ + onCommit: (value: string) => void; + /** Preset label currently highlighted via keyboard navigation (shows active indicator). */ + highlightedLabel?: string; + /** Called when Tab/Shift-Tab should wrap the cycle (e.g. Tab past Apply, Shift-Tab on From). */ + onCycleStep?: (direction: "forward" | "backward", fromValue: string) => void; +} + +/** Format a UTC YYYY-MM-DD string as "Mar 4" or "Mar 4 '25" (if year differs from currentYear). */ +function fmtUtcDate(isoDate: string, currentYear: number): string { + const d = new Date(`${isoDate}T00:00:00Z`); + const mon = MONTHS_SHORT[d.getUTCMonth()]; + const day = d.getUTCDate(); + const year = d.getUTCFullYear(); + return year !== currentYear ? `${mon} ${day} '${String(year).slice(2)}` : `${mon} ${day}`; +} + +/** + * Build hint text from the raw preset value (before next-midnight adjustment). + * Single date → "Mar 11"; range → "Mar 4 – Mar 11". + */ +function buildPresetHint(rawValue: string, currentYear: number): string { + if (rawValue.includes("..")) { + const sep = rawValue.indexOf(".."); + return `${fmtUtcDate(rawValue.slice(0, sep), currentYear)} – ${fmtUtcDate(rawValue.slice(sep + 2), currentYear)}`; + } + return fmtUtcDate(rawValue, currentYear); +} + +export const FilterBarDatePicker = memo(function FilterBarDatePicker({ + onCommit, + highlightedLabel, + onCycleStep, +}: FilterBarDatePickerProps) { + const [fromDate, setFromDate] = useState(""); + const [toDate, setToDate] = useState(""); + + const fromRef = useRef(null); + const toRef = useRef(null); + const applyRef = useRef(null); + + // When keyboard navigation highlights a custom input sentinel, move DOM focus there. + // For the To input, also open the calendar picker — programmatic focus() always lands + // at the first sub-field (MM), but entering backward should start at the calendar end. + useEffect(() => { + if (highlightedLabel === DATE_CUSTOM_FROM) { + fromRef.current?.focus(); + } else if (highlightedLabel === DATE_CUSTOM_TO) { + toRef.current?.focus(); + } else if (highlightedLabel === DATE_CUSTOM_APPLY) { + applyRef.current?.focus(); + } + }, [highlightedLabel]); + + // Compute once per render (client-only component, only mounted on interaction). + const currentYear = useMemo(() => new Date().getUTCFullYear(), []); + + const presetHints = useMemo( + () => Object.fromEntries(DATE_RANGE_PRESETS.map((p) => [p.label, buildPresetHint(p.getValue(), currentYear)])), + [currentYear], + ); + + // toDate must be strictly after fromDate (same minute = zero-second window after +1min adjustment) + const rangeError = !!fromDate && !!toDate && toDate <= fromDate; + + const handleApply = useCallback(() => { + if (!fromDate || rangeError) return; + if (toDate) { + onCommit(`${fromDate}..${toDate}`); + } else { + onCommit(fromDate); + } + }, [fromDate, toDate, rangeError, onCommit]); + + const handleFromChange = useCallback((value: string) => { + setFromDate(value); + // Clear "to" if it's now at or before "from" (equal = invalid range after +1min adjustment) + setToDate((prev) => (prev && prev <= value ? "" : prev)); + }, []); + + return ( +
e.stopPropagation()} + > +
+ {/* Left rail: presets with right-aligned date hints */} +
+
Presets
+ {DATE_RANGE_PRESETS.map((preset) => ( + + ))} +
+ + {/* Right: custom range */} +
+
Custom range
+
+ + handleFromChange(e.target.value)} + className="fb-date-input" + /> +
+
+ + setToDate(e.target.value)} + min={fromDate || undefined} + className="fb-date-input" + data-error={rangeError ? "" : undefined} + aria-invalid={rangeError} + aria-describedby={rangeError ? "fb-date-range-error" : undefined} + /> + {rangeError && ( + + “To” must be after “From” + + )} +
+ +
+
+ {/* Focus sentinel: the last focusable element inside the picker. + When Tab exits Apply (or To when Apply is disabled), the browser naturally + focuses this sentinel. onFocus immediately redirects back into the cycle, + keeping focus trapped inside the filter bar. It must live inside the picker + (inside the container) so handleBlur sees relatedTarget as within-container. */} +
+ ); +}); diff --git a/src/ui/src/components/filter-bar/filter-bar-dropdown.tsx b/src/ui/src/components/filter-bar/filter-bar-dropdown.tsx index ac77c3767..47281c2de 100644 --- a/src/ui/src/components/filter-bar/filter-bar-dropdown.tsx +++ b/src/ui/src/components/filter-bar/filter-bar-dropdown.tsx @@ -36,7 +36,14 @@ import { memo, useRef, useMemo, useEffect } from "react"; import { Loader2 } from "lucide-react"; import { CommandList, CommandItem, CommandGroup } from "@/components/shadcn/command"; import { useVirtualizerCompat } from "@/hooks/use-virtualizer-compat"; -import type { FieldSuggestion, PresetSuggestion, SearchPreset, Suggestion } from "@/components/filter-bar/lib/types"; +import type { + FieldSuggestion, + PresetSuggestion, + SearchPreset, + Suggestion, + DateRangeSearchField, +} from "@/components/filter-bar/lib/types"; +import { FilterBarDatePicker } from "@/components/filter-bar/filter-bar-date-picker"; // --------------------------------------------------------------------------- // Constants @@ -82,6 +89,12 @@ interface FilterBarDropdownProps { loadingFieldLabel?: string; /** Currently highlighted cmdk value — drives scroll-into-view */ highlightedSuggestionValue?: string; + /** Active date-range field, if any — renders date picker instead of suggestions */ + activeDateRangeField?: DateRangeSearchField | null; + /** Callback when a date value is committed from the date picker */ + onDateCommit?: (value: string) => void; + /** Advance the date picker cycle from a sentinel when Tab/Shift-Tab fires on a picker element */ + onDateCycleStep?: (direction: "forward" | "backward", fromValue: string) => void; } // --------------------------------------------------------------------------- @@ -99,6 +112,9 @@ function FilterBarDropdownInner({ isFieldLoading, loadingFieldLabel, highlightedSuggestionValue, + activeDateRangeField, + onDateCommit, + onDateCycleStep, }: FilterBarDropdownProps) { const listRef = useRef(null); @@ -166,37 +182,57 @@ function FilterBarDropdownInner({ overflow-hidden (not overflow-y-auto) means CommandList itself never scrolls; the inner .fb-suggestions-scroll is the sole scroll container. */} - {/* Presets — present in selectables when input is empty */} - {presetGroups.length > 0 && ( - - )} - - {/* Hints (non-interactive, shown above suggestions) */} - {hints.length > 0 && } - - {/* Async field loading state */} - {isFieldLoading ? ( - ) : ( - /* Suggestions - virtualized when large */ - fieldSelectables.length > 0 && ( - - ) + <> + {/* Presets — present in selectables when input is empty */} + {presetGroups.length > 0 && ( + + )} + + {/* Hints (non-interactive, shown above suggestions) */} + {hints.length > 0 && } + + {/* Async field loading state */} + {isFieldLoading ? ( + + ) : ( + /* Suggestions - virtualized when large */ + fieldSelectables.length > 0 && ( + + ) + )} + )} {/* Footer */}
- ↑↓ Tab fill{" "} - Enter accept Esc undo + {activeDateRangeField ? ( + <> + Tab navigate Enter apply{" "} + Esc cancel + + ) : ( + <> + ↑↓ Tab fill{" "} + Enter accept Esc undo + + )}
@@ -446,7 +482,10 @@ function SuggestionItemInner({ suggestion, onSelect }: SuggestionItemProps {suggestion.type === "field" ? ( {suggestion.label} ) : ( - {suggestion.label} + + {suggestion.field.prefix} + {suggestion.label.slice(suggestion.field.prefix.length)} + )} {suggestion.hint && {suggestion.hint}} diff --git a/src/ui/src/components/filter-bar/filter-bar.css b/src/ui/src/components/filter-bar/filter-bar.css index aa8122a25..ea86e646e 100644 --- a/src/ui/src/components/filter-bar/filter-bar.css +++ b/src/ui/src/components/filter-bar/filter-bar.css @@ -40,6 +40,10 @@ /* Dropdown max height */ --fb-dropdown-max-height: 380px; + + /* Date picker column sizing */ + --fb-date-right-min: 185px; + --fb-date-left-max: 270px; } /* Dark mode blue accents */ @@ -355,6 +359,239 @@ border-bottom: 1px solid var(--border); } +/* ============================================================================= + Date Picker Panel — B1 "Split Rail" layout + Left: preset list (label + right-aligned date hint) with active border indicator. + Right: stacked From/To inputs + Apply. + ============================================================================= */ + +.fb-date-picker { + overflow-y: auto; + overscroll-behavior: contain; +} + +/* + Right column minimum: + datetime-local input content ≈ 160px (mm/dd/yyyy, --:-- -- at 0.75rem + calendar icon) + .fb-date-custom h-padding = 24px (0.75rem × 2) + → --fb-date-right-min = 184px (rounded up to 185px) + + Left rail maximum (cap so extra space flows to the right column): + Row left-padding = 14px (0.875rem) + Longest label "last 365 days" ≈ 100px + Minimum gap (padding-left) = 16px (1rem, enforced on hint) + Longest hint "MMM DD 'YY – MMM DD" ≈ 130px (19 chars × ~6.6px/ch at 0.6875rem monospace) + Row right-padding = 10px (0.625rem) + ───────────────────────────────────────────────────── + Total ≈ 270px → --fb-date-left-max: 270px + + Grid: left capped at --fb-date-left-max, right gets all remaining space (min --fb-date-right-min). + + Hint hide threshold (= max content without right-padding): + 14px + 100px + 16px + 130px = 260px + Container query hides hints at rail < 260px so they are always fully shown or fully hidden. +*/ +.fb-date-split { + display: grid; + grid-template-columns: minmax(0, var(--fb-date-left-max)) minmax(var(--fb-date-right-min), 1fr); + min-height: 12rem; +} + +/* Left rail */ +.fb-date-rail { + border-right: 1px solid var(--border); + padding: 0.5rem 0; + display: flex; + flex-direction: column; + container-type: inline-size; + container-name: date-rail; +} + +.fb-date-section-label { + padding: 0 0.75rem 0.375rem; + font-size: 0.6875rem; + line-height: 1rem; + font-weight: 600; + letter-spacing: 0.08em; + text-transform: uppercase; + color: var(--muted-foreground); + pointer-events: none; + user-select: none; + opacity: 0.55; +} + +.fb-date-preset-row { + display: flex; + width: 100%; + align-items: center; + gap: 0.375rem; + padding: 0.4375rem 0.625rem 0.4375rem 0.875rem; + font-size: 0.8125rem; + line-height: 1.25rem; + color: var(--fb-accent); + cursor: pointer; + text-align: left; + border-left: 2px solid transparent; + transition: + background-color 150ms, + color 150ms, + border-color 150ms; +} + +.fb-date-preset-row:hover { + background-color: var(--accent); +} + +.fb-date-preset-row[data-active] { + border-left-color: var(--fb-accent); + background-color: color-mix(in oklch, var(--fb-accent) 10%, transparent); + color: var(--fb-accent); + font-weight: 500; + padding-left: 0.75rem; /* compensate for 2px border so text stays aligned */ +} + +.fb-date-preset-row:focus-visible { + outline: 2px solid var(--ring); + outline-offset: -2px; +} + +/* Label: never wraps or shrinks — always takes its natural width */ +.fb-date-preset-label { + flex-shrink: 0; + white-space: nowrap; +} + +/* Hint: right-aligned, minimum gap from label via padding-left. + Collapses to nothing when there is no room (min-width: 0 + overflow: hidden). + Fully hidden via container query before any clipping can occur. */ +.fb-date-preset-hint { + flex: 1; + min-width: 0; + overflow: hidden; + white-space: nowrap; + text-align: right; + /* padding-left enforces the minimum visual gap between label and hint */ + padding-left: 1rem; + font-size: 0.6875rem; + font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; + color: var(--foreground); + opacity: 0.5; +} + +.fb-date-preset-row[data-active] .fb-date-preset-hint { + opacity: 0.7; + color: currentColor; +} + +/* Hide hints when the rail is narrower than the space needed to show them fully. + Threshold = row-padding + longest-label + min-gap + longest-hint = 14+100+16+130 = 260px. + This guarantees hints are always fully visible or completely absent. */ +@container date-rail (max-width: 259px) { + .fb-date-preset-hint { + display: none; + } +} + +/* Right column: custom range */ +.fb-date-custom { + padding: 0.5rem 0.75rem 0.625rem; + display: flex; + flex-direction: column; + gap: 0.5rem; +} + +.fb-date-custom .fb-date-section-label { + padding-left: 0; + padding-right: 0; +} + +.fb-date-field { + display: flex; + flex-direction: column; + gap: 0.25rem; +} + +.fb-date-label { + font-size: 0.6875rem; + line-height: 1rem; + color: var(--foreground); + opacity: 0.7; +} + +.fb-date-input { + width: 100%; + border-radius: 0.375rem; + border: 1px solid var(--border); + background-color: var(--background); + color: var(--foreground); + padding: 0.25rem 0.5rem; + font-size: 0.75rem; + line-height: 1.25rem; + transition: border-color 150ms; +} + +.fb-date-input:focus { + outline: none; + border-color: var(--ring); + box-shadow: 0 0 0 1px var(--ring); +} + +.fb-date-input[data-error] { + border-color: var(--destructive); + box-shadow: 0 0 0 1px var(--destructive); +} + +.fb-date-error { + font-size: 0.6875rem; + line-height: 1rem; + color: var(--destructive); +} + +.fb-date-input::-webkit-calendar-picker-indicator { + opacity: 0.45; + cursor: pointer; +} + +.fb-date-input::-webkit-calendar-picker-indicator:hover { + opacity: 1; +} + +.fb-date-apply { + display: flex; + align-items: center; + justify-content: center; + gap: 0.375rem; + border-radius: 0.375rem; + border: 1px solid var(--border); + background-color: var(--secondary); + color: var(--fb-accent); + padding: 0.375rem 0.75rem; + font-size: 0.8125rem; + line-height: 1.25rem; + font-weight: 500; + cursor: pointer; + transition: + background-color 150ms, + color 150ms; + width: 100%; + margin-top: auto; +} + +.fb-date-apply:hover:not(:disabled) { + background-color: var(--accent); + color: var(--accent-foreground); +} + +.fb-date-apply:disabled { + opacity: 0.4; + cursor: not-allowed; +} + +.fb-date-apply:focus-visible { + outline: 2px solid var(--ring); + outline-offset: 2px; +} + /* ============================================================================= Animations ============================================================================= */ diff --git a/src/ui/src/components/filter-bar/filter-bar.tsx b/src/ui/src/components/filter-bar/filter-bar.tsx index c6708a6a5..596f7b1db 100644 --- a/src/ui/src/components/filter-bar/filter-bar.tsx +++ b/src/ui/src/components/filter-bar/filter-bar.tsx @@ -83,6 +83,9 @@ function FilterBarInner( handleBackdropDismiss, handleKeyDown, isPresetActive, + activeDateRangeField, + handleDateCommit, + stepDateCycle, setInputRefCallbacks, } = useFilterState({ chips, @@ -183,6 +186,9 @@ function FilterBarInner( isFieldLoading={isFieldLoading} loadingFieldLabel={loadingFieldLabel} highlightedSuggestionValue={highlightedSuggestionValue} + activeDateRangeField={activeDateRangeField} + onDateCommit={handleDateCommit} + onDateCycleStep={stepDateCycle} /> diff --git a/src/ui/src/components/filter-bar/hooks/use-default-filter.ts b/src/ui/src/components/filter-bar/hooks/use-default-filter.ts index a8c553fff..c536678b9 100644 --- a/src/ui/src/components/filter-bar/hooks/use-default-filter.ts +++ b/src/ui/src/components/filter-bar/hooks/use-default-filter.ts @@ -22,15 +22,18 @@ import type { SearchChip } from "@/stores/types"; import { useUrlChips } from "@/components/filter-bar/hooks/use-url-chips"; /** - * Adds a default filter chip when no chip for `field` is present in the URL, + * Adds default filter chip(s) when no chip for `field` is present in the URL, * unless the user has explicitly opted out via `?{optOutParam}=true`. * + * Accepts a single `defaultValue` string or an array of strings (for multi-chip defaults + * like defaulting to all running-category statuses). + * * - effectiveChips is computed synchronously so the first render uses correct params (no double-fetch). * - The chip is NOT written to the URL on mount — only when the user interacts with the filter bar. * Writing it on mount caused a race with nuqs: the effect fired before nuqs had fully parsed all * repeated `f=` URL params, so it overwrote the URL with a partial chip set (losing all but the first). * - Removing all chips for `field` sets ?{optOutParam}=true; adding one clears it. - * - After nuqs has parsed the URL (post-paint), an effect writes the default chip to the URL + * - After nuqs has parsed the URL (post-paint), an effect writes the default chip(s) to the URL * using history:"replace" so that refresh/share reflects the active filter. */ export function useDefaultFilter({ @@ -40,7 +43,7 @@ export function useDefaultFilter({ optOutParam = "all", }: { field: string; - defaultValue: string | null | undefined; + defaultValue: string | string[] | null | undefined; label?: string; optOutParam?: string; }): { @@ -50,6 +53,11 @@ export function useDefaultFilter({ } { const { searchChips, setSearchChips } = useUrlChips(); + const normalizedDefaults = useMemo( + () => (defaultValue == null ? [] : Array.isArray(defaultValue) ? defaultValue : [defaultValue]), + [defaultValue], + ); + const [optOut, setOptOut] = useQueryState( optOutParam, parseAsBoolean.withDefault(false).withOptions({ @@ -61,20 +69,40 @@ export function useDefaultFilter({ const hasDefaultInUrl = useMemo(() => searchChips.some((c) => c.field === field), [searchChips, field]); - const shouldPrePopulate = !optOut && !hasDefaultInUrl && !!defaultValue; + // When both optOut and explicit chips are present, explicit chips win. + // Compute this synchronously so callers never see a transient contradictory state. + const effectiveOptOut = optOut && !hasDefaultInUrl; + + const shouldPrePopulate = !optOut && !hasDefaultInUrl && normalizedDefaults.length > 0; const effectiveChips = useMemo((): SearchChip[] => { if (!shouldPrePopulate) return searchChips; - const chipLabel = label ?? `${field}: ${defaultValue}`; - return [...searchChips, { field, value: defaultValue!, label: chipLabel }]; - }, [searchChips, shouldPrePopulate, field, defaultValue, label]); + const prefix = label ?? field; + const defaults: SearchChip[] = normalizedDefaults.map((v) => ({ + field, + value: v, + label: normalizedDefaults.length > 1 ? `${prefix}: ${v}` : (label ?? `${field}: ${v}`), + })); + return [...searchChips, ...defaults]; + }, [searchChips, shouldPrePopulate, field, normalizedDefaults, label]); + + // If the URL has the opt-out flag set but also has explicit chips for this field, + // the two are contradictory. The explicit chips win — clear the opt-out so downstream + // consumers (e.g. showAllUsers) don't ignore the manual filter. + useEffect(() => { + if (optOut && hasDefaultInUrl) void setOptOut(false); + }, [optOut, hasDefaultInUrl, setOptOut]); useEffect(() => { if (!shouldPrePopulate) return; - const chipLabel = label ?? `${field}: ${defaultValue}`; - const defaultChip: SearchChip = { field, value: defaultValue!, label: chipLabel }; - void setSearchChips([...searchChips, defaultChip], { history: "replace" }); - }, [shouldPrePopulate, searchChips, field, defaultValue, label, setSearchChips]); + const prefix = label ?? field; + const defaults: SearchChip[] = normalizedDefaults.map((v) => ({ + field, + value: v, + label: normalizedDefaults.length > 1 ? `${prefix}: ${v}` : (label ?? `${field}: ${v}`), + })); + void setSearchChips([...searchChips, ...defaults], { history: "replace" }); + }, [shouldPrePopulate, searchChips, field, normalizedDefaults, label, setSearchChips]); const handleChipsChange = useCallback( (newChips: SearchChip[]) => { @@ -91,5 +119,5 @@ export function useDefaultFilter({ [effectiveChips, field, optOut, setOptOut, setSearchChips], ); - return { effectiveChips, handleChipsChange, optOut }; + return { effectiveChips, handleChipsChange, optOut: effectiveOptOut }; } diff --git a/src/ui/src/components/filter-bar/hooks/use-filter-keyboard.ts b/src/ui/src/components/filter-bar/hooks/use-filter-keyboard.ts index 1721831ff..ae01cf8ee 100644 --- a/src/ui/src/components/filter-bar/hooks/use-filter-keyboard.ts +++ b/src/ui/src/components/filter-bar/hooks/use-filter-keyboard.ts @@ -28,6 +28,12 @@ import { useState, useCallback, useMemo } from "react"; import type { FieldSuggestion, ParsedInput, SearchField, Suggestion } from "@/components/filter-bar/lib/types"; +import { + isDateRangeField, + DATE_CUSTOM_FROM, + DATE_CUSTOM_TO, + DATE_CUSTOM_APPLY, +} from "@/components/filter-bar/lib/types"; // --------------------------------------------------------------------------- // External interfaces @@ -74,6 +80,12 @@ interface UseFilterKeyboardReturn { navigationLevel: "field" | "value" | null; /** Reset all navigation state */ resetNavigation: () => void; + /** + * Advance the cycle one step from a known sentinel position. + * Used when Tab/Shift-Tab fires on a date-picker DOM element (not the filter bar input) + * so the stale highlightedIndex is bypassed. + */ + stepCycle: (direction: "forward" | "backward", fromValue: string) => void; } // --------------------------------------------------------------------------- @@ -132,6 +144,28 @@ export function useFilterKeyboard( [navState, resetNavigation], ); + const stepCycle = useCallback( + (direction: "forward" | "backward", fromValue: string) => { + actions.focusInput(); + if (nav.state.level === null || nav.state.items.length === 0) { + enterNavigationMode(state, nav, actions, direction); + return; + } + const fromIdx = nav.state.items.findIndex((item) => item.value === fromValue); + const startIdx = fromIdx >= 0 ? fromIdx : nav.state.highlightedIndex; + const nextIdx = + direction === "forward" + ? (startIdx + 1) % nav.state.items.length + : startIdx <= 0 + ? nav.state.items.length - 1 + : startIdx - 1; + nav.setState({ ...nav.state, highlightedIndex: nextIdx }); + const item = nav.state.items[nextIdx]; + if (item) actions.fillInput(item.inputValue); + }, + [state, nav, actions], + ); + const handleKeyDown = useCallback( (e: React.KeyboardEvent) => { switch (e.key) { @@ -173,6 +207,7 @@ export function useFilterKeyboard( displaySelectables, navigationLevel: navState.level, resetNavigation, + stepCycle, }; } @@ -365,6 +400,26 @@ function onEnter( } if (nav.state.level === "value" && nav.state.highlightedIndex >= 0) { + const current = nav.state.items[nav.state.highlightedIndex]; + if ( + current && + current.suggestion.type !== "preset" && + current.suggestion.type !== "hint" && + isDateRangeField(current.suggestion.field) + ) { + // Sentinel items (From/To/Apply): focus is already moved by the date picker's useEffect. + // Don't commit — let the focused element handle subsequent interaction. + if ( + current.value === DATE_CUSTOM_FROM || + current.value === DATE_CUSTOM_TO || + current.value === DATE_CUSTOM_APPLY + ) { + return; + } + // Date-range preset: select directly (parsedInput.query is empty, input held at prefix) + actions.selectSuggestion(current.value); + return; + } if (parsedInput.hasPrefix && parsedInput.field && parsedInput.query.trim()) { if (actions.addChipFromParsedInput()) { actions.resetInput(); @@ -436,7 +491,9 @@ function buildValueCycleItems(selectables: Suggestion[]): CycleItem[] { .filter((s): s is FieldSuggestion => s.type !== "preset") .map((s) => ({ value: s.value, - inputValue: s.field.prefix ? `${s.field.prefix}${s.value}` : s.value, + // Date-range presets: keep input at just the prefix so the picker stays open + // and shows all presets; the highlighted preset is indicated via the picker's UI. + inputValue: isDateRangeField(s.field) ? s.field.prefix : s.field.prefix ? `${s.field.prefix}${s.value}` : s.value, suggestion: s, })); } diff --git a/src/ui/src/components/filter-bar/hooks/use-filter-state.ts b/src/ui/src/components/filter-bar/hooks/use-filter-state.ts index 4f29c359d..34a36761f 100644 --- a/src/ui/src/components/filter-bar/hooks/use-filter-state.ts +++ b/src/ui/src/components/filter-bar/hooks/use-filter-state.ts @@ -31,8 +31,14 @@ */ import { useState, useCallback, useMemo, useRef, useEffect } from "react"; -import type { SearchChip, SearchField, SearchPreset, Suggestion } from "@/components/filter-bar/lib/types"; -import { isAsyncField } from "@/components/filter-bar/lib/types"; +import type { + SearchChip, + SearchField, + SearchPreset, + Suggestion, + DateRangeSearchField, +} from "@/components/filter-bar/lib/types"; +import { isAsyncField, isDateRangeField } from "@/components/filter-bar/lib/types"; import { useChips } from "@/components/filter-bar/hooks/use-chips"; import { useSuggestions } from "@/components/filter-bar/hooks/use-suggestions"; import { useFilterKeyboard } from "@/components/filter-bar/hooks/use-filter-keyboard"; @@ -83,6 +89,14 @@ interface UseFilterStateReturn { // Preset state isPresetActive: (preset: SearchPreset) => boolean; + // Date range + /** Active date-range field (when user has typed its prefix), or null */ + activeDateRangeField: DateRangeSearchField | null; + /** Commit a date value for the active date-range field */ + handleDateCommit: (value: string) => void; + /** Advance the date picker cycle from a sentinel position (for Tab/Shift-Tab wrap) */ + stepDateCycle: (direction: "forward" | "backward", fromValue: string) => void; + // Ref setup setInputRefCallbacks: (callbacks: InputRefCallbacks) => void; } @@ -246,6 +260,17 @@ export function useFilterState({ clearValidationError(); }, [clearValidationError]); + const handleDateCommit = useCallback( + (value: string) => { + if (parsedInput.field && isDateRangeField(parsedInput.field)) { + addChip(parsedInput.field, value); + resetInput(); + inputCallbacksRef.current.focus(); + } + }, + [parsedInput, addChip, resetInput], + ); + // ========== Keyboard hook ========== const keyboardState = useMemo>( @@ -287,7 +312,7 @@ export function useFilterState({ [removeChip, resetInput, parsedInput, addChip, handleSelect, setValidationError, addTextChip], ); - const { handleKeyDown, highlightedSuggestionValue, displaySelectables, navigationLevel, resetNavigation } = + const { handleKeyDown, highlightedSuggestionValue, displaySelectables, navigationLevel, resetNavigation, stepCycle } = useFilterKeyboard(keyboardState, keyboardActions); // Wire up the bridge ref so resetInput/handleInputChange/handleClearAll can reset navigation @@ -312,9 +337,23 @@ export function useFilterState({ // with a prefix, hints flow naturally from the matched field. const visibleHints = navigationLevel === "field" ? [] : hints; + // Derive active date-range field. + // Suppress while navigationLevel === "field": the user is still cycling through field + // suggestions (input is temporarily filled with each prefix as a preview). Only activate + // the date picker once the user has committed — either by pressing Enter (transitions to + // "value" level) or by typing the prefix themselves (level === null). + const activeDateRangeField = useMemo((): DateRangeSearchField | null => { + if (navigationLevel === "field") return null; + if (parsedInput.hasPrefix && parsedInput.field && isDateRangeField(parsedInput.field)) { + return parsedInput.field; + } + return null; + }, [parsedInput, navigationLevel]); + const hasContent = displaySelectables.length > 0 || visibleHints.length > 0; // Preset suggestions are in displaySelectables when input is empty, so hasContent covers them. - const showDropdown = (isOpen && hasContent) || !!validationError || isFieldLoading; + // Also show dropdown when a date-range field is active (to show the date picker). + const showDropdown = (isOpen && (hasContent || !!activeDateRangeField)) || !!validationError || isFieldLoading; // ========== Return ========== @@ -337,6 +376,9 @@ export function useFilterState({ handleBackdropDismiss, handleKeyDown, isPresetActive, + activeDateRangeField, + handleDateCommit, + stepDateCycle: stepCycle, setInputRefCallbacks, }; } diff --git a/src/ui/src/components/filter-bar/hooks/use-suggestions.ts b/src/ui/src/components/filter-bar/hooks/use-suggestions.ts index 3b1bc1740..2097a9f02 100644 --- a/src/ui/src/components/filter-bar/hooks/use-suggestions.ts +++ b/src/ui/src/components/filter-bar/hooks/use-suggestions.ts @@ -23,8 +23,15 @@ import { useMemo } from "react"; import type { SearchField, SearchChip, SearchPreset, Suggestion, ParsedInput } from "@/components/filter-bar/lib/types"; -import { getFieldValues } from "@/components/filter-bar/lib/types"; +import { + getFieldValues, + isDateRangeField, + DATE_CUSTOM_FROM, + DATE_CUSTOM_TO, + DATE_CUSTOM_APPLY, +} from "@/components/filter-bar/lib/types"; import { parseInput, getFieldHint } from "@/components/filter-bar/lib/parse-input"; +import { DATE_RANGE_PRESETS } from "@/lib/date-range-utils"; interface UseSuggestionsOptions { /** Current input value */ @@ -105,6 +112,29 @@ function generateSuggestions( } if (parsedInput.hasPrefix && parsedInput.field) { + // Date-range fields show a picker in the dropdown — surface presets and custom inputs + // as value suggestions so keyboard navigation (Tab/↑↓/Enter) works like other fields. + if (isDateRangeField(parsedInput.field)) { + const lowerQuery = parsedInput.query.toLowerCase(); + for (const preset of DATE_RANGE_PRESETS) { + if (!lowerQuery || preset.label.toLowerCase().includes(lowerQuery)) { + items.push({ + type: "value", + field: parsedInput.field, + value: preset.label, + label: preset.label, + }); + } + } + // Custom range inputs are always shown (only when not filtering presets) + if (!lowerQuery) { + items.push({ type: "value", field: parsedInput.field, value: DATE_CUSTOM_FROM, label: "From date" }); + items.push({ type: "value", field: parsedInput.field, value: DATE_CUSTOM_TO, label: "To date" }); + items.push({ type: "value", field: parsedInput.field, value: DATE_CUSTOM_APPLY, label: "Apply" }); + } + return items; + } + // Show values for the selected field const field = parsedInput.field; const currentPrefix = field.prefix; diff --git a/src/ui/src/components/filter-bar/hooks/use-url-chips.ts b/src/ui/src/components/filter-bar/hooks/use-url-chips.ts index bed7f7cb6..c86e8c5a2 100644 --- a/src/ui/src/components/filter-bar/hooks/use-url-chips.ts +++ b/src/ui/src/components/filter-bar/hooks/use-url-chips.ts @@ -17,7 +17,7 @@ "use client"; import { useMemo, useCallback } from "react"; -import { useQueryState, parseAsArrayOf, parseAsString } from "nuqs"; +import { useQueryState, createMultiParser } from "nuqs"; import type { SearchChip } from "@/stores/types"; import { parseUrlChips } from "@/lib/url-utils"; @@ -25,14 +25,25 @@ export interface SetSearchChipsOptions { history?: "push" | "replace"; } -/** - * URL-synced search chips. Parses "field:value" from repeated URL params (?f=field:value) - * into SearchChip[] and writes changes back to the URL for shareable filtered views. - */ +// Uses repeated query params (?f=pool:X&f=user:Y) for filter chips. +// type:"multi" makes nuqs call searchParams.getAll(), collecting repeated params. +// Each param value is one chip string — no secondary separator that could corrupt +// values containing commas. +const parseAsChipStrings = createMultiParser({ + parse: (values: readonly string[]) => values.filter(Boolean), + serialize: (values: readonly string[]) => Array.from(values), + eq: (a: string[], b: string[]) => { + if (a.length !== b.length) return false; + const sortedA = [...a].sort(); + const sortedB = [...b].sort(); + return sortedA.every((v, i) => v === sortedB[i]); + }, +}); + export function useUrlChips({ paramName = "f" }: { paramName?: string } = {}) { const [filterStrings, setFilterStrings] = useQueryState( paramName, - parseAsArrayOf(parseAsString).withOptions({ + parseAsChipStrings.withOptions({ shallow: true, history: "push", clearOnDefault: true, diff --git a/src/ui/src/components/filter-bar/lib/preset-pill.ts b/src/ui/src/components/filter-bar/lib/preset-pill.ts new file mode 100644 index 000000000..9a812a743 --- /dev/null +++ b/src/ui/src/components/filter-bar/lib/preset-pill.ts @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +import { cn } from "@/lib/utils"; + +export function presetPillClasses( + bgClass: string, + active: boolean, + activeRingClass = "ring-black/15 ring-inset dark:ring-white/20", +): string { + return cn( + "inline-flex items-center gap-1.5 rounded px-2 py-0.5 transition-all", + bgClass, + active && `ring-2 ${activeRingClass}`, + "group-data-[selected=true]:scale-105 group-data-[selected=true]:shadow-lg", + !active && "opacity-70 group-data-[selected=true]:opacity-100 hover:opacity-100", + ); +} diff --git a/src/ui/src/components/filter-bar/lib/types.ts b/src/ui/src/components/filter-bar/lib/types.ts index 800d76a65..b3cf410c4 100644 --- a/src/ui/src/components/filter-bar/lib/types.ts +++ b/src/ui/src/components/filter-bar/lib/types.ts @@ -136,16 +136,27 @@ export interface AsyncSearchField extends BaseSearchField { } /** - * A search field that is either sync (derives values from parent data) - * or async (loads its own data from an API). + * Date-range search field: shows a date picker in the dropdown instead of text suggestions. + * Selecting a date or range creates a chip with an ISO date string or range string. + * @template T - The data item type (kept for union compatibility) + */ +export interface DateRangeSearchField extends BaseSearchField { + /** Discriminant: date-range fields render a date picker in the dropdown */ + type: "date-range"; +} + +/** + * A search field that is either sync (derives values from parent data), + * async (loads its own data from an API), or date-range (renders a date picker). * * Use the `type` discriminant to narrow: * - `type: undefined | 'sync'` -> SyncSearchField (default, backward compatible) * - `type: 'async'` -> AsyncSearchField (self-loading, has isLoading) + * - `type: 'date-range'` -> DateRangeSearchField (shows date picker in dropdown) * * @template T - The data item type being searched */ -export type SearchField = SyncSearchField | AsyncSearchField; +export type SearchField = SyncSearchField | AsyncSearchField | DateRangeSearchField; /** * Type guard: check if a field is async. @@ -155,11 +166,20 @@ export function isAsyncField(field: SearchField): field is AsyncSearchFiel return field.type === "async"; } +/** + * Type guard: check if a field is a date-range field. + */ +export function isDateRangeField(field: SearchField): field is DateRangeSearchField { + return field.type === "date-range"; +} + /** * Get values from a field, handling both sync and async variants. * For sync fields, passes the data array. For async fields, calls with no args. + * For date-range fields, returns empty array (picker shown in dropdown instead). */ export function getFieldValues(field: SearchField, data: T[]): string[] { + if (isDateRangeField(field)) return []; if (isAsyncField(field)) { return field.getValues(); } @@ -324,6 +344,15 @@ export interface PresetSuggestion { */ export type Suggestion = FieldSuggestion | PresetSuggestion; +/** + * Sentinel suggestion values for date picker custom input navigation. + * When the keyboard cycle highlights one of these, the date picker + * moves DOM focus to the corresponding element (From/To input or Apply button). + */ +export const DATE_CUSTOM_FROM = "__date-custom-from__"; +export const DATE_CUSTOM_TO = "__date-custom-to__"; +export const DATE_CUSTOM_APPLY = "__date-custom-apply__"; + /** * Parsed input result. */ diff --git a/src/ui/src/components/inline-progress.tsx b/src/ui/src/components/inline-progress.tsx index e970f5a13..d49361dea 100644 --- a/src/ui/src/components/inline-progress.tsx +++ b/src/ui/src/components/inline-progress.tsx @@ -27,35 +27,23 @@ export interface InlineProgressProps { total: number; /** Compact mode: hide progress bar, show only text */ compact?: boolean; - /** Width of the progress bar */ - barWidth?: string; /** Additional content to render after the label (e.g., icons) */ children?: React.ReactNode; /** Additional className for the container */ className?: string; } -// ============================================================================= -// Component -// ============================================================================= - /** - * InlineProgress - Horizontal progress display for table cells. - * - * Renders a progress bar with a "{used}/{total}" fraction label. - * Suitable for table cells showing utilization. + * Horizontal progress display for table cells. * * @example - * ```tsx * * - * ``` */ export const InlineProgress = memo(function InlineProgress({ used, total, compact = false, - barWidth = "w-16", children, className, }: InlineProgressProps) { @@ -70,11 +58,9 @@ export const InlineProgress = memo(function InlineProgress({ ); } - const maxBarWidth = barWidth.replace(/^w-/, "max-w-"); - return (
-
+
- {label} + + {label} + {children}
); diff --git a/src/ui/src/components/log-viewer/components/footer.tsx b/src/ui/src/components/log-viewer/components/footer.tsx deleted file mode 100644 index d147e43fd..000000000 --- a/src/ui/src/components/log-viewer/components/footer.tsx +++ /dev/null @@ -1,161 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -"use client"; - -import { memo } from "react"; -import { Download, ExternalLink, Tag, WrapText } from "lucide-react"; -import { cn } from "@/lib/utils"; -import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/shadcn/tooltip"; -import { ScrollPinControl } from "@/components/log-viewer/components/scroll-pin-control"; - -// ============================================================================= -// Types -// ============================================================================= - -export interface FooterProps { - /** Whether line wrapping is enabled */ - wrapLines: boolean; - /** Callback to toggle line wrapping */ - onToggleWrapLines: () => void; - /** Whether task suffix is shown */ - showTask: boolean; - /** Callback to toggle task suffix (hidden when undefined, e.g., task scope) */ - onToggleShowTask?: () => void; - /** URL to open raw logs in new tab (direct to backend) */ - externalLogUrl?: string; - /** Callback to download logs */ - onDownload?: () => void; - /** Additional CSS classes */ - className?: string; - /** Whether streaming is active (shows pin option when true) */ - isStreaming?: boolean; - /** Whether pinned to bottom (auto-scrolls on new entries) */ - isPinnedToBottom?: boolean; - /** Callback to scroll to bottom immediately */ - onScrollToBottom?: () => void; - /** Callback to toggle pin state */ - onTogglePinnedToBottom?: () => void; -} - -// ============================================================================= -// Component -// ============================================================================= - -function FooterInner({ - wrapLines, - onToggleWrapLines, - showTask, - onToggleShowTask, - externalLogUrl, - onDownload, - className, - isStreaming = false, - isPinnedToBottom = false, - onScrollToBottom, - onTogglePinnedToBottom, -}: FooterProps) { - return ( -
-
- {/* Right-aligned controls in order: scroll, task, wrap, download, new tab */} -
- {/* Scroll/Pin controls */} - {onScrollToBottom && onTogglePinnedToBottom && ( - - )} - - {/* Show task toggle (hidden when scoped to a single task) */} - {onToggleShowTask && ( - - - - - {showTask ? "Hide" : "Show"} task - - )} - - {/* Wrap lines toggle */} - - - - - {wrapLines ? "Disable" : "Enable"} line wrap - - - {/* Download button */} - {onDownload && ( - - - - - Download logs - - )} - - {/* External link - opens raw logs in new tab */} - {externalLogUrl && ( - - - - - Open raw logs in new tab - - - Open raw logs in new tab - - )} -
-
-
- ); -} - -export const Footer = memo(FooterInner); diff --git a/src/ui/src/components/log-viewer/components/log-viewer.tsx b/src/ui/src/components/log-viewer/components/log-viewer.tsx index 7548e6c17..b81b6e784 100644 --- a/src/ui/src/components/log-viewer/components/log-viewer.tsx +++ b/src/ui/src/components/log-viewer/components/log-viewer.tsx @@ -18,7 +18,7 @@ import { memo, useMemo, useRef, useCallback, useEffect, useState, startTransition, useDeferredValue } from "react"; import { useShallow } from "zustand/react/shallow"; -import { User, Cpu, ZoomIn, ZoomOut } from "lucide-react"; +import { User, Cpu, ZoomIn, ZoomOut, Download, ExternalLink, Tag, WrapText } from "lucide-react"; import { cn } from "@/lib/utils"; import { useFormattedHotkey, useModKey } from "@/hooks/use-hotkey-label"; import type { LogEntry, HistogramBucket } from "@/lib/api/log-adapter/types"; @@ -36,7 +36,7 @@ import { type TimelineContainerHandle, } from "@/components/log-viewer/components/timeline/components/timeline-container"; import { LogList, type LogListHandle } from "@/components/log-viewer/components/log-list"; -import { Footer } from "@/components/log-viewer/components/footer"; +import { ScrollPinControl } from "@/components/log-viewer/components/scroll-pin-control"; import { LogViewerSkeleton } from "@/components/log-viewer/components/log-viewer-skeleton"; import { useLogViewerStore } from "@/components/log-viewer/store/log-viewer-store"; import { HISTOGRAM_BUCKET_JUMP_WINDOW_MS } from "@/components/log-viewer/lib/constants"; @@ -476,20 +476,117 @@ function LogViewerInner({ data, filter, timeline, className, showTimeline = true /> )} - {/* Section 1: Filter bar — excluded from focus redirect so dropdown items work */} + {/* Section 1: Filter bar + Actions — excluded from focus redirect so dropdown items work */}
- +
+
+ +
+ + {/* Action buttons */} +
+ {/* Scroll/Pin controls */} + + + {/* Show task toggle (hidden when scoped to a single task) */} + {scope !== "task" && ( + + + + + {showTask ? "Hide" : "Show"} task + + )} + + {/* Wrap lines toggle */} + + + + + {wrapLines ? "Disable" : "Enable"} line wrap + + + {/* Download button */} + + + + + Download logs + + + {/* External link - opens raw logs in new tab */} + {externalLogUrl && ( + + + + + Open raw logs in new tab + + )} +
+
{/* Section 2: Timeline Histogram — excluded from focus redirect so draggers work */} @@ -584,22 +681,6 @@ function LogViewerInner({ data, filter, timeline, className, showTimeline = true hideTask={scope === "task"} />
- - {/* Section 4: Footer */} -
-
-
); } diff --git a/src/ui/src/components/log-viewer/components/scroll-pin-control.tsx b/src/ui/src/components/log-viewer/components/scroll-pin-control.tsx index 77fcf21ab..947241125 100644 --- a/src/ui/src/components/log-viewer/components/scroll-pin-control.tsx +++ b/src/ui/src/components/log-viewer/components/scroll-pin-control.tsx @@ -19,6 +19,7 @@ import { memo } from "react"; import { ChevronsDown } from "lucide-react"; import { cn } from "@/lib/utils"; +import { Button } from "@/components/shadcn/button"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/shadcn/tooltip"; // ============================================================================= @@ -38,13 +39,6 @@ export interface ScrollPinControlProps { className?: string; } -// ============================================================================= -// Shared Styles -// ============================================================================= - -const toolbarButtonStyles = - "rounded p-1 transition-colors focus-nvidia disabled:pointer-events-none disabled:opacity-50"; - // ============================================================================= // Component // ============================================================================= @@ -70,20 +64,23 @@ function ScrollPinControlInner({ return ( - + - + {isHighlighted ? "Auto-following (click to disable)" : "Scroll to bottom"} diff --git a/src/ui/src/components/refresh/use-refresh-control-state.ts b/src/ui/src/components/refresh/use-refresh-control-state.ts index f6dfd7d37..25e509fa7 100644 --- a/src/ui/src/components/refresh/use-refresh-control-state.ts +++ b/src/ui/src/components/refresh/use-refresh-control-state.ts @@ -15,6 +15,7 @@ // SPDX-License-Identifier: Apache-2.0 import { useCallback } from "react"; +import { useInterval } from "usehooks-ts"; import { useMounted } from "@/hooks/use-mounted"; import { useRefreshAnimation } from "@/components/refresh/use-refresh-animation"; import { formatInterval } from "@/lib/format-interval"; @@ -40,6 +41,10 @@ export function useRefreshControlState(props: RefreshControlProps) { [setInterval], ); + // Auto-refresh timer: fires onRefresh at the selected interval. + // useInterval handles callback stability internally and accepts null to pause. + useInterval(onRefresh, mounted && isAutoRefreshActive ? interval : null); + return { mounted, clickCount, diff --git a/src/ui/src/components/refresh/vertical-refresh-control.tsx b/src/ui/src/components/refresh/vertical-refresh-control.tsx index fc445d16e..89b74de02 100644 --- a/src/ui/src/components/refresh/vertical-refresh-control.tsx +++ b/src/ui/src/components/refresh/vertical-refresh-control.tsx @@ -134,7 +134,13 @@ export function VerticalRefreshControl(props: RefreshControlProps) { Refresh workflow - {isRefreshing ? "Refreshing..." : "Refresh workflow"} + + {isRefreshing + ? "Refreshing..." + : isAutoRefreshActive + ? `Refresh workflow (auto-refresh: ${intervalLabel})` + : "Refresh workflow"} + @@ -153,7 +159,7 @@ export function VerticalRefreshControl(props: RefreshControlProps) { > {isAutoRefreshActive ? (