From c4049bf161cfd631bcc979b5c8e6342d9e511bbb Mon Sep 17 00:00:00 2001 From: Hrushikesh Sahasrabuddhe Date: Wed, 25 Mar 2026 13:39:21 -0700 Subject: [PATCH 1/3] add sanitize_on_write mode for safe JSON serialization of complex objects Adds an optional `sanitize_on_write` flag to JSONStore to enable safe serialization of complex Python/MSONable objects (e.g., numpy types, pymatgen objects, jobflow blobs) to JSON. When enabled: - Data is first processed using `monty.json.jsanitize` with `recursive_msonable=True` - Serialization uses Python's built-in `json.dump` instead of `orjson.dumps` to avoid strict type limitations (e.g., ScalarFloat) Motivation: JSONStore currently fails when storing complex nested objects such as jobflow blob data due to `orjson` serialization constraints. This is a common use case when using JSONStore as a lightweight alternative to MongoDB/GridFS for demos, YouTube tutorials, etc. This change preserves existing behavior by default and only activates the safer (but slower) serialization path when explicitly requested. Additional info: - jobflow phonon workflows (atomate2) - generates large blob documents (~100 MB) --- src/maggma/stores/mongolike.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/maggma/stores/mongolike.py b/src/maggma/stores/mongolike.py index b6073d77a..f470f6e02 100644 --- a/src/maggma/stores/mongolike.py +++ b/src/maggma/stores/mongolike.py @@ -4,6 +4,7 @@ various utilities. 
""" +import json import warnings from collections.abc import Iterator from itertools import chain, groupby @@ -615,6 +616,7 @@ def __init__( serialization_option: Optional[int] = None, serialization_default: Optional[Callable[[Any], Any]] = None, encoding: Optional[str] = None, + sanitize_on_write: bool = False, **kwargs, ): """ @@ -669,6 +671,7 @@ def __init__( self.default_sort = None self.serialization_option = serialization_option self.serialization_default = serialization_default + self.sanitize_on_write = sanitize_on_write super().__init__(**kwargs) @@ -767,12 +770,20 @@ def update_json_file(self): data = list(self.query()) for d in data: d.pop("_id") - bytesdata = orjson.dumps( - data, - option=self.serialization_option, - default=self.serialization_default, - ) - f.write(bytesdata.decode("utf-8")) + if self.sanitize_on_write: + data = jsanitize( + data, + strict=False, + recursive_msonable=True, + ) + json.dump(data, f, indent=2) + else: + bytesdata = orjson.dumps( + data, + option=self.serialization_option, + default=self.serialization_default, + ) + f.write(bytesdata.decode("utf-8")) def __hash__(self): return hash((*self.paths, self.last_updated_field)) From fab9d27ea51dc43384e427f0f8eb6b7d76d2759a Mon Sep 17 00:00:00 2001 From: Hrushikesh Sahasrabuddhe Date: Wed, 25 Mar 2026 13:41:44 -0700 Subject: [PATCH 2/3] added docstring --- src/maggma/stores/mongolike.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/maggma/stores/mongolike.py b/src/maggma/stores/mongolike.py index f470f6e02..b56941826 100644 --- a/src/maggma/stores/mongolike.py +++ b/src/maggma/stores/mongolike.py @@ -642,6 +642,8 @@ def __init__( encoding from the platform. This should work in the great majority of cases. However, if you encounter a UnicodeDecodeError, consider setting the encoding explicitly to 'utf8' or another encoding as appropriate. + sanitize_on_write: Whether to sanitize documents with jsanitize before writing to the + JSON file. 
""" paths = paths if isinstance(paths, (list, tuple)) else [paths] self.paths = paths From d90535b7abeeca445eb6c7c7e6750b47e2f8e521 Mon Sep 17 00:00:00 2001 From: Hrushikesh Sahasrabuddhe Date: Wed, 25 Mar 2026 14:15:08 -0700 Subject: [PATCH 3/3] test: add coverage for JSONStore sanitize_on_write behavior This test verifies that: - Non-JSON-serializable objects (e.g., custom float subclasses) are properly sanitized - Data is successfully written to disk without serialization errors - Stored values are converted into JSON-compatible types --- tests/stores/test_mongolike.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/stores/test_mongolike.py b/tests/stores/test_mongolike.py index 997b7edde..259a59060 100644 --- a/tests/stores/test_mongolike.py +++ b/tests/stores/test_mongolike.py @@ -551,6 +551,31 @@ def test_jsonstore_last_updated(test_dir): assert jsonstore.last_updated > start_time +def test_jsonstore_sanitize_on_write(test_dir): + class SubFloat(float): + pass + + with ScratchDir("."): + jsonstore = JSONStore( + "sanitize.json", + read_only=False, + sanitize_on_write=True, + ) + jsonstore.connect() + + # This would fail on the normal orjson path, but should succeed when + # sanitize_on_write=True. + jsonstore.update({"wrong_field": SubFloat(1.1), "task_id": 3}) + jsonstore.close() + + # Confirm the file was written and can be reloaded. + jsonstore = JSONStore("sanitize.json", read_only=True) + jsonstore.connect() + doc = jsonstore.query_one(criteria={"task_id": 3}) + assert doc is not None + assert doc["wrong_field"] == pytest.approx(1.1) + + def test_eq(mongostore, memorystore, jsonstore): assert mongostore == mongostore assert memorystore == memorystore