Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions deployments/charts/quick-start/templates/mek-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,13 @@ spec:
exit 1
fi

# Generate a unique kid for this MEK to make key material mismatches
# detectable. A static kid (e.g. "key1") causes get_mek() to silently
# return the wrong key after MEK regeneration. See NVIDIA/OSMO#731.
KID="key-$(dd if=/dev/urandom bs=8 count=1 2>/dev/null | od -An -tx1 | tr -d ' \n')"

# Create JWK JSON structure and encode it
JWK_JSON='{"k":"'$RANDOM_KEY'","kid":"key1","kty":"oct"}'
JWK_JSON='{"k":"'$RANDOM_KEY'","kid":"'$KID'","kty":"oct"}'
ENCODED_JWK=$(echo -n "$JWK_JSON" | base64 | tr -d '\n')

# Get current timestamp
Expand All @@ -70,9 +75,9 @@ spec:
data:
mek.yaml: |
# MEK generated $TIMESTAMP
currentMek: key1
currentMek: $KID
meks:
key1: $ENCODED_JWK
$KID: $ENCODED_JWK
EOF

echo "Generated MEK ConfigMap"
Expand Down
30 changes: 20 additions & 10 deletions deployments/scripts/deploy-k8s.sh
Original file line number Diff line number Diff line change
Expand Up @@ -270,24 +270,34 @@ create_secrets() {
$RUN_KUBECTL "delete secret redis-secret --namespace $OSMO_NAMESPACE --ignore-not-found=true"
$RUN_KUBECTL "create secret generic redis-secret --from-literal=redis-password=$REDIS_PASSWORD --namespace $OSMO_NAMESPACE"

# Generate and create MEK
log_info "Generating Master Encryption Key (MEK)..."
local random_key=$(openssl rand -base64 32 | tr -d '\n')
local jwk_json="{\"k\":\"$random_key\",\"kid\":\"key1\",\"kty\":\"oct\"}"
local encoded_jwk=$(echo -n "$jwk_json" | base64 | tr -d '\n')

local mek_manifest="apiVersion: v1
# Generate and create MEK (skip if already exists to avoid key material mismatch)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Take a look at deploy_service.rst to see whether the commands there need to be updated to match this change

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated both docs/deployment_guide/getting_started/deploy_service.rst and docs/deployment_guide/appendix/deploy_minimal.rst to match the deploy script behavior:

  • Replaced all hardcoded "kid":"key1" with dynamic KID="key-$(openssl rand -hex 8)" generation
  • Updated ConfigMap YAML examples to use $KID variable
  • Added a .. note:: explaining why unique key IDs matter (detecting key material mismatches)
  • Added a .. tip:: warning not to re-create the MEK ConfigMap if it already exists

See commit ebd427b.

if $RUN_KUBECTL "get configmap mek-config -n $OSMO_NAMESPACE" >/dev/null 2>&1; then
log_info "MEK ConfigMap already exists, skipping generation"
else
log_info "Generating Master Encryption Key (MEK)..."
local random_key
random_key="$(openssl rand -base64 32 | tr -d '\n')"
# Use a unique kid per generation to make key material mismatches
# detectable. See https://github.com/NVIDIA/OSMO/issues/731
local kid
kid="key-$(openssl rand -hex 8)"
local jwk_json="{\"k\":\"$random_key\",\"kid\":\"$kid\",\"kty\":\"oct\"}"
local encoded_jwk
encoded_jwk="$(echo -n "$jwk_json" | base64 | tr -d '\n')"

local mek_manifest="apiVersion: v1
kind: ConfigMap
metadata:
name: mek-config
namespace: $OSMO_NAMESPACE
data:
mek.yaml: |
currentMek: key1
currentMek: $kid
meks:
key1: $encoded_jwk"
$kid: $encoded_jwk"

$RUN_KUBECTL_APPLY_STDIN "$mek_manifest"
$RUN_KUBECTL_APPLY_STDIN "$mek_manifest"
fi

log_success "Secrets created"
}
Expand Down
7 changes: 4 additions & 3 deletions run/start_service_kind.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,10 @@ def _generate_mek() -> None:

random_key = base64.b64encode(secrets.token_bytes(32)).decode('utf-8')

kid = f'key-{secrets.token_hex(8)}'
jwk_json = {
'k': random_key,
'kid': 'key1',
'kid': kid,
'kty': 'oct'
}

Expand All @@ -149,9 +150,9 @@ def _generate_mek() -> None:
data:
mek.yaml: |
# MEK generated {time.strftime('%Y-%m-%d %H:%M:%S')}
currentMek: key1
currentMek: {kid}
meks:
key1: {encoded_jwk}
{kid}: {encoded_jwk}
"""

with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as temp_file:
Expand Down
11 changes: 11 additions & 0 deletions src/service/core/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,17 @@ async def user_error_handler(request: fastapi.Request, error: osmo_errors.OSMOEr
async def top_level_exception_handler(request: fastapi.Request, error: Exception):
logging.exception('Got an exception of type %s on url path %s', type(error).__name__,
request.url.path)
# WebSocket connections cannot receive HTTP responses — attempting to send
# a JSONResponse on an accepted WebSocket triggers an ASGI protocol error:
# "Expected 'websocket.send' but got 'websocket.http.response.start'"
# Detect WebSocket scope and close gracefully instead.
if request.scope.get('type') == 'websocket':
try:
websocket = fastapi.WebSocket(request.scope, request.receive, request._send)
await websocket.close(code=1011, reason=str(error)[:123])
except Exception:
pass # Connection may already be closed
return None
return fastapi.responses.JSONResponse(
status_code=500,
content={'message': f'Internal server error: {error}'}
Expand Down
58 changes: 55 additions & 3 deletions src/utils/connectors/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,25 @@ def get_dataset_configs(self) -> 'DatasetConfig':
def get_method(self) -> Optional[Literal['dev']]:
return self.config.method

@staticmethod
def _is_jwe_compact(value: str) -> bool:
    """Check whether a string is a plausible JWE compact serialization.

    JWE compact format (RFC 7516) is five base64url segments separated by
    dots:
        header.encryptedKey.iv.ciphertext.tag
    The header is base64url-encoded JSON; any JSON object serialized without
    leading whitespace starts with '{"', which encodes to 'eyJ'.

    A value is accepted only when all of the following hold:
      * it is a str starting with 'eyJ' and containing exactly 4 dots
        (this distinguishes JWE from JWS/JWT with 2 dots and plain JSON);
      * every segment uses only the unpadded base64url alphabet (segments
        other than the header may be empty, e.g. the encrypted key when
        direct encryption is used);
      * the header decodes to a JSON object carrying both the "alg" and
        "enc" members, which RFC 7516 requires of every JWE header.

    The extra checks matter because callers treat a positive result as
    "stale ciphertext" and discard the value; a plaintext secret that merely
    starts with 'eyJ' and happens to contain four dots must not be discarded.

    NOTE: This is still a format-based heuristic and could match JWE tokens
    from external systems. A more robust approach would check the header for
    an OSMO-specific marker (e.g. "osmo_encrypted": true), but that requires
    migrating all existing encrypted data. See
    https://github.com/NVIDIA/OSMO/issues/731 for follow-up.

    Args:
        value: Candidate string (non-str inputs are tolerated and rejected).

    Returns:
        True if `value` looks like a JWE compact token, False otherwise.
    """
    import base64  # Local import keeps this helper self-contained.

    # Cheap rejections first: wrong type, wrong prefix, wrong segment count.
    if not isinstance(value, str) or not value.startswith('eyJ'):
        return False
    segments = value.split('.')
    if len(segments) != 5:
        return False

    # Compact serialization uses unpadded base64url, so any other character
    # (whitespace, '=', '+', '/', ...) means this is not a JWE token.
    base64url_alphabet = frozenset(
        'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_')
    if not all(char in base64url_alphabet for segment in segments for char in segment):
        return False

    encoded_header = segments[0]
    try:
        # Restore the padding that compact serialization strips.
        padding = '=' * (-len(encoded_header) % 4)
        header = json.loads(base64.urlsafe_b64decode(encoded_header + padding))
    except ValueError:
        # Covers binascii.Error, UnicodeDecodeError and json.JSONDecodeError.
        return False
    return isinstance(header, dict) and 'alg' in header and 'enc' in header

def decrypt_credential(self, db_row) -> Dict:
result = {}
payload = PostgresConnector.decode_hstore(db_row.payload)
Expand All @@ -621,7 +640,15 @@ def decrypt_credential(self, db_row) -> Dict:
encrypted, db_row.user_name,
self.generate_update_secret_func(cmd, cmd_args))
result[key] = decrypted.value
except (JWException, osmo_errors.OSMONotFoundError):
except (JWException, osmo_errors.OSMONotFoundError) as error:
if self._is_jwe_compact(value):
logging.error(
"Cannot decrypt credential key '%s' for user '%s' "
"with current MEK: key material mismatch. "
"See https://github.com/NVIDIA/OSMO/issues/731",
key, db_row.user_name)
result[key] = '' # Return empty, service stays alive
continue
result[key] = value
encrypted = self.secret_manager.encrypt(value, db_row.user_name)
cmd = (
Expand Down Expand Up @@ -2632,6 +2659,7 @@ class Config:
def deserialize(cls, config_dict: Dict, postgres: PostgresConnector):
""" Decrypts all secrets in `config_dict` """
encrypt_keys = set()
delete_keys = set() # Keys with stale JWE to delete (triggers regeneration)

# Define function to pass into secret_manager.decrypt to update secrets
def re_encrypt(key: str, new_encrypted: List):
Expand Down Expand Up @@ -2694,8 +2722,23 @@ def _decrypt(result_data: Any,
if new_encrypted_list:
new_encrypted = new_encrypted_list[0]
return decrypted.value, new_encrypted
except (JWException, osmo_errors.OSMONotFoundError):
# Encrypt the plain text secret
except (JWException, osmo_errors.OSMONotFoundError) as error:
if PostgresConnector._is_jwe_compact(secret):
# Value is already JWE-encrypted but cannot be decrypted
# with the current MEK. This happens when the MEK ConfigMap
# is regenerated with new key material. Delete the stale
# config row so _init_configs() regenerates it with a fresh
# default on the next startup.
# See https://github.com/NVIDIA/OSMO/issues/731
logging.error(
"Cannot decrypt config key '%s' with current MEK: "
"key material mismatch. Deleting stale config so "
"the service regenerates it on next startup. "
"See https://github.com/NVIDIA/OSMO/issues/731",
top_level_key)
delete_keys.add(top_level_key)
return '', None
# Genuinely unencrypted plaintext — encrypt it
encrypted = postgres.secret_manager.encrypt(secret, '')
encrypt_keys.add(top_level_key)
return secret, encrypted.value
Expand Down Expand Up @@ -2724,6 +2767,15 @@ def _decrypt(result_data: Any,
new_value = json.dumps(encrypted_dict[key])
cmd = 'UPDATE configs SET value = %s WHERE key = %s AND value = %s;'
postgres.execute_commit_command(cmd, (new_value, key, old_value))

# Delete configs with stale encryption — forces regeneration via
# _init_configs() → _set_default_config() → INSERT ON CONFLICT DO NOTHING.
# Must include type in WHERE clause — configs PK is (key, type).
config_type = dynamic_config.get_type().value
for key in delete_keys:
cmd = 'DELETE FROM configs WHERE key = %s AND type = %s;'
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider batching the keys into one SQL call, instead of making a call per key

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Batched into a single ANY(%s) call, matching the existing pattern in _batch_fetch_external_roles:

if delete_keys:
    config_type = dynamic_config.get_type().value
    cmd = 'DELETE FROM configs WHERE key = ANY(%s) AND type = %s;'
    postgres.execute_commit_command(cmd, (list(delete_keys), config_type))

The encrypt_keys UPDATE loop above is left as-is since each key has a different old_value for optimistic concurrency, making it unsuitable for simple batching.

See commit ebd427b.

postgres.execute_commit_command(cmd, (key, config_type))

return dynamic_config

def serialize_helper(self, config_dict: Dict, postgres: PostgresConnector,
Expand Down