-
Notifications
You must be signed in to change notification settings - Fork 30
fix: prevent MEK re-encryption from double-wrapping JWE config values #732
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 4 commits
c77a4e0
ad47c7b
222d2c7
46f557e
feabab4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -603,6 +603,25 @@ def get_dataset_configs(self) -> 'DatasetConfig': | |
| def get_method(self) -> Optional[Literal['dev']]: | ||
| return self.config.method | ||
|
|
||
| @staticmethod | ||
| def _is_jwe_compact(value: str) -> bool: | ||
| """Check if a string looks like a JWE compact serialization. | ||
|
|
||
| JWE compact format has 5 base64url segments separated by dots: | ||
| header.encryptedKey.iv.ciphertext.tag | ||
| The header is base64url-encoded JSON starting with '{"alg":' which | ||
| encodes to 'eyJ'. | ||
|
|
||
| This distinguishes JWE from JWS/JWT (3 dots) and plain JSON (0 dots). | ||
|
|
||
| NOTE: This is a shape-based heuristic and could match JWE tokens from | ||
| external systems. A more robust approach would decode the JWE header | ||
| and check for an OSMO-specific marker (e.g. "osmo_encrypted": true), | ||
| but that requires migrating all existing encrypted data. See | ||
| https://github.com/NVIDIA/OSMO/issues/731 for follow-up. | ||
| """ | ||
| return isinstance(value, str) and value.startswith("eyJ") and value.count('.') == 4 | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| def decrypt_credential(self, db_row) -> Dict: | ||
| result = {} | ||
| payload = PostgresConnector.decode_hstore(db_row.payload) | ||
|
|
@@ -621,7 +640,15 @@ def decrypt_credential(self, db_row) -> Dict: | |
| encrypted, db_row.user_name, | ||
| self.generate_update_secret_func(cmd, cmd_args)) | ||
| result[key] = decrypted.value | ||
| except (JWException, osmo_errors.OSMONotFoundError): | ||
| except (JWException, osmo_errors.OSMONotFoundError) as error: | ||
| if self._is_jwe_compact(value): | ||
| logging.error( | ||
| "Cannot decrypt credential key '%s' for user '%s' " | ||
| "with current MEK: key material mismatch. " | ||
| "See https://github.com/NVIDIA/OSMO/issues/731", | ||
| key, db_row.user_name) | ||
| result[key] = '' # Return empty, service stays alive | ||
| continue | ||
| result[key] = value | ||
| encrypted = self.secret_manager.encrypt(value, db_row.user_name) | ||
| cmd = ( | ||
|
|
@@ -2632,6 +2659,7 @@ class Config: | |
| def deserialize(cls, config_dict: Dict, postgres: PostgresConnector): | ||
| """ Decrypts all secrets in `config_dict` """ | ||
| encrypt_keys = set() | ||
| delete_keys = set() # Keys with stale JWE to delete (triggers regeneration) | ||
|
|
||
| # Define function to pass into secret_manager.decrypt to update secrets | ||
| def re_encrypt(key: str, new_encrypted: List): | ||
|
|
@@ -2694,8 +2722,23 @@ def _decrypt(result_data: Any, | |
| if new_encrypted_list: | ||
| new_encrypted = new_encrypted_list[0] | ||
| return decrypted.value, new_encrypted | ||
| except (JWException, osmo_errors.OSMONotFoundError): | ||
| # Encrypt the plain text secret | ||
| except (JWException, osmo_errors.OSMONotFoundError) as error: | ||
| if PostgresConnector._is_jwe_compact(secret): | ||
| # Value is already JWE-encrypted but cannot be decrypted | ||
| # with the current MEK. This happens when the MEK ConfigMap | ||
| # is regenerated with new key material. Delete the stale | ||
| # config row so _init_configs() regenerates it with a fresh | ||
| # default on the next startup. | ||
| # See https://github.com/NVIDIA/OSMO/issues/731 | ||
| logging.error( | ||
| "Cannot decrypt config key '%s' with current MEK: " | ||
| "key material mismatch. Deleting stale config so " | ||
| "the service regenerates it on next startup. " | ||
| "See https://github.com/NVIDIA/OSMO/issues/731", | ||
| top_level_key) | ||
| delete_keys.add(top_level_key) | ||
| return '', None | ||
| # Genuinely unencrypted plaintext — encrypt it | ||
| encrypted = postgres.secret_manager.encrypt(secret, '') | ||
| encrypt_keys.add(top_level_key) | ||
| return secret, encrypted.value | ||
|
|
@@ -2724,6 +2767,15 @@ def _decrypt(result_data: Any, | |
| new_value = json.dumps(encrypted_dict[key]) | ||
| cmd = 'UPDATE configs SET value = %s WHERE key = %s AND value = %s;' | ||
| postgres.execute_commit_command(cmd, (new_value, key, old_value)) | ||
|
|
||
| # Delete configs with stale encryption — forces regeneration via | ||
| # _init_configs() → _set_default_config() → INSERT ON CONFLICT DO NOTHING. | ||
| # Must include type in WHERE clause — configs PK is (key, type). | ||
| config_type = dynamic_config.get_type().value | ||
| for key in delete_keys: | ||
| cmd = 'DELETE FROM configs WHERE key = %s AND type = %s;' | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider batching the keys into one SQL call, instead of making a call per key
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Batched into a single if delete_keys:
config_type = dynamic_config.get_type().value
cmd = 'DELETE FROM configs WHERE key = ANY(%s) AND type = %s;'
postgres.execute_commit_command(cmd, (list(delete_keys), config_type))The See commit ebd427b. |
||
| postgres.execute_commit_command(cmd, (key, config_type)) | ||
|
|
||
| return dynamic_config | ||
|
|
||
| def serialize_helper(self, config_dict: Dict, postgres: PostgresConnector, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Take a look at
deploy_service.rstto see if there are updates to the commands thereThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated both
docs/deployment_guide/getting_started/deploy_service.rstanddocs/deployment_guide/appendix/deploy_minimal.rstto match the deploy script behavior:"kid":"key1"with dynamicKID="key-$(openssl rand -hex 8)"generation$KIDvariable.. note::explaining why unique key IDs matter (detecting key material mismatches).. tip::warning not to re-create the MEK ConfigMap if it already existsSee commit ebd427b.