diff --git a/deployments/charts/quick-start/templates/mek-configmap.yaml b/deployments/charts/quick-start/templates/mek-configmap.yaml index 75471157e..ba93a551a 100644 --- a/deployments/charts/quick-start/templates/mek-configmap.yaml +++ b/deployments/charts/quick-start/templates/mek-configmap.yaml @@ -53,8 +53,13 @@ spec: exit 1 fi + # Generate a unique kid for this MEK to make key material mismatches + # detectable. A static kid (e.g. "key1") causes get_mek() to silently + # return the wrong key after MEK regeneration. See NVIDIA/OSMO#731. + KID="key-$(dd if=/dev/urandom bs=8 count=1 2>/dev/null | od -An -tx1 | tr -d ' \n')" + # Create JWK JSON structure and encode it - JWK_JSON='{"k":"'$RANDOM_KEY'","kid":"key1","kty":"oct"}' + JWK_JSON='{"k":"'$RANDOM_KEY'","kid":"'$KID'","kty":"oct"}' ENCODED_JWK=$(echo -n "$JWK_JSON" | base64 | tr -d '\n') # Get current timestamp @@ -70,9 +75,9 @@ spec: data: mek.yaml: | # MEK generated $TIMESTAMP - currentMek: key1 + currentMek: $KID meks: - key1: $ENCODED_JWK + $KID: $ENCODED_JWK EOF echo "Generated MEK ConfigMap" diff --git a/deployments/scripts/deploy-k8s.sh b/deployments/scripts/deploy-k8s.sh index 0c52db26b..cbbbb1751 100755 --- a/deployments/scripts/deploy-k8s.sh +++ b/deployments/scripts/deploy-k8s.sh @@ -270,24 +270,34 @@ create_secrets() { $RUN_KUBECTL "delete secret redis-secret --namespace $OSMO_NAMESPACE --ignore-not-found=true" $RUN_KUBECTL "create secret generic redis-secret --from-literal=redis-password=$REDIS_PASSWORD --namespace $OSMO_NAMESPACE" - # Generate and create MEK - log_info "Generating Master Encryption Key (MEK)..." 
- local random_key=$(openssl rand -base64 32 | tr -d '\n') - local jwk_json="{\"k\":\"$random_key\",\"kid\":\"key1\",\"kty\":\"oct\"}" - local encoded_jwk=$(echo -n "$jwk_json" | base64 | tr -d '\n') - - local mek_manifest="apiVersion: v1 + # Generate and create MEK (skip if already exists to avoid key material mismatch) + if $RUN_KUBECTL "get configmap mek-config -n $OSMO_NAMESPACE" >/dev/null 2>&1; then + log_info "MEK ConfigMap already exists, skipping generation" + else + log_info "Generating Master Encryption Key (MEK)..." + local random_key + random_key="$(openssl rand -base64 32 | tr -d '\n')" + # Use a unique kid per generation to make key material mismatches + # detectable. See https://github.com/NVIDIA/OSMO/issues/731 + local kid + kid="key-$(openssl rand -hex 8)" + local jwk_json="{\"k\":\"$random_key\",\"kid\":\"$kid\",\"kty\":\"oct\"}" + local encoded_jwk + encoded_jwk="$(echo -n "$jwk_json" | base64 | tr -d '\n')" + + local mek_manifest="apiVersion: v1 kind: ConfigMap metadata: name: mek-config namespace: $OSMO_NAMESPACE data: mek.yaml: | - currentMek: key1 + currentMek: $kid meks: - key1: $encoded_jwk" + $kid: $encoded_jwk" - $RUN_KUBECTL_APPLY_STDIN "$mek_manifest" + $RUN_KUBECTL_APPLY_STDIN "$mek_manifest" + fi log_success "Secrets created" } diff --git a/run/start_service_kind.py b/run/start_service_kind.py index 91d74dad3..51224f428 100644 --- a/run/start_service_kind.py +++ b/run/start_service_kind.py @@ -132,9 +132,10 @@ def _generate_mek() -> None: random_key = base64.b64encode(secrets.token_bytes(32)).decode('utf-8') + kid = f'key-{secrets.token_hex(8)}' jwk_json = { 'k': random_key, - 'kid': 'key1', + 'kid': kid, 'kty': 'oct' } @@ -149,9 +150,9 @@ def _generate_mek() -> None: data: mek.yaml: | # MEK generated {time.strftime('%Y-%m-%d %H:%M:%S')} - currentMek: key1 + currentMek: {kid} meks: - key1: {encoded_jwk} + {kid}: {encoded_jwk} """ with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as temp_file: diff --git 
a/src/service/core/service.py b/src/service/core/service.py index 208381bd1..da3e4be9b 100644 --- a/src/service/core/service.py +++ b/src/service/core/service.py @@ -228,6 +228,17 @@ async def user_error_handler(request: fastapi.Request, error: osmo_errors.OSMOEr async def top_level_exception_handler(request: fastapi.Request, error: Exception): logging.exception('Got an exception of type %s on url path %s', type(error).__name__, request.url.path) + # WebSocket connections cannot receive HTTP responses — attempting to send + # a JSONResponse on an accepted WebSocket triggers an ASGI protocol error: + # "Expected 'websocket.send' but got 'websocket.http.response.start'" + # Detect WebSocket scope and close gracefully instead. + if request.scope.get('type') == 'websocket': + try: + websocket = fastapi.WebSocket(request.scope, request.receive, request._send) + await websocket.close(code=1011, reason=str(error)[:123]) + except Exception: + pass # Connection may already be closed + return None return fastapi.responses.JSONResponse( status_code=500, content={'message': f'Internal server error: {error}'} diff --git a/src/utils/connectors/postgres.py b/src/utils/connectors/postgres.py index d12d68d6f..0c5d7571f 100644 --- a/src/utils/connectors/postgres.py +++ b/src/utils/connectors/postgres.py @@ -603,6 +603,25 @@ def get_dataset_configs(self) -> 'DatasetConfig': def get_method(self) -> Optional[Literal['dev']]: return self.config.method + @staticmethod + def _is_jwe_compact(value: str) -> bool: + """Check if a string looks like a JWE compact serialization. + + JWE compact format has 5 base64url segments separated by dots: + header.encryptedKey.iv.ciphertext.tag + The header is base64url-encoded JSON starting with '{"alg":' which + encodes to 'eyJ'. + + This distinguishes JWE from JWS/JWT (2 dots) and plain JSON (0 dots). + + NOTE: This is a shape-based heuristic and could match JWE tokens from + external systems. 
A more robust approach would decode the JWE header + and check for an OSMO-specific marker (e.g. "osmo_encrypted": true), + but that requires migrating all existing encrypted data. See + https://github.com/NVIDIA/OSMO/issues/731 for follow-up. + """ + return isinstance(value, str) and value.startswith("eyJ") and value.count('.') == 4 + def decrypt_credential(self, db_row) -> Dict: result = {} payload = PostgresConnector.decode_hstore(db_row.payload) @@ -621,7 +640,15 @@ def decrypt_credential(self, db_row) -> Dict: encrypted, db_row.user_name, self.generate_update_secret_func(cmd, cmd_args)) result[key] = decrypted.value - except (JWException, osmo_errors.OSMONotFoundError): + except (JWException, osmo_errors.OSMONotFoundError) as error: + if self._is_jwe_compact(value): + logging.error( + "Cannot decrypt credential key '%s' for user '%s' " + "with current MEK: key material mismatch. " + "See https://github.com/NVIDIA/OSMO/issues/731", + key, db_row.user_name) + result[key] = '' # Return empty, service stays alive + continue result[key] = value encrypted = self.secret_manager.encrypt(value, db_row.user_name) cmd = ( @@ -2632,6 +2659,7 @@ class Config: def deserialize(cls, config_dict: Dict, postgres: PostgresConnector): """ Decrypts all secrets in `config_dict` """ encrypt_keys = set() + delete_keys = set() # Keys with stale JWE to delete (triggers regeneration) # Define function to pass into secret_manager.decrypt to update secrets def re_encrypt(key: str, new_encrypted: List): @@ -2694,8 +2722,23 @@ def _decrypt(result_data: Any, if new_encrypted_list: new_encrypted = new_encrypted_list[0] return decrypted.value, new_encrypted - except (JWException, osmo_errors.OSMONotFoundError): - # Encrypt the plain text secret + except (JWException, osmo_errors.OSMONotFoundError) as error: + if PostgresConnector._is_jwe_compact(secret): + # Value is already JWE-encrypted but cannot be decrypted + # with the current MEK. 
This happens when the MEK ConfigMap + # is regenerated with new key material. Delete the stale + # config row so _init_configs() regenerates it with a fresh + # default on the next startup. + # See https://github.com/NVIDIA/OSMO/issues/731 + logging.error( + "Cannot decrypt config key '%s' with current MEK: " + "key material mismatch. Deleting stale config so " + "the service regenerates it on next startup. " + "See https://github.com/NVIDIA/OSMO/issues/731", + top_level_key) + delete_keys.add(top_level_key) + return '', None + # Genuinely unencrypted plaintext — encrypt it encrypted = postgres.secret_manager.encrypt(secret, '') encrypt_keys.add(top_level_key) return secret, encrypted.value @@ -2724,6 +2767,15 @@ def _decrypt(result_data: Any, new_value = json.dumps(encrypted_dict[key]) cmd = 'UPDATE configs SET value = %s WHERE key = %s AND value = %s;' postgres.execute_commit_command(cmd, (new_value, key, old_value)) + + # Delete configs with stale encryption — forces regeneration via + # _init_configs() → _set_default_config() → INSERT ON CONFLICT DO NOTHING. + # Must include type in WHERE clause — configs PK is (key, type). + config_type = dynamic_config.get_type().value + for key in delete_keys: + cmd = 'DELETE FROM configs WHERE key = %s AND type = %s;' + postgres.execute_commit_command(cmd, (key, config_type)) + return dynamic_config def serialize_helper(self, config_dict: Dict, postgres: PostgresConnector,