Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 30 additions & 2 deletions src/sentry/grouping/parameterization.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,42 @@
import dataclasses
import logging
import re
from collections import OrderedDict, defaultdict
from collections import Counter, OrderedDict, defaultdict
from collections.abc import Sequence
from ipaddress import ip_address, ip_interface, ip_network
from typing import Callable
from typing import Any, Callable

from sentry.utils import metrics

logger = logging.getLogger("sentry.events.grouping")


# Counter for logging a set amount of example data. Not meant to be used directly. (Use the
# `_log_example_data` helper instead.)
LOGGING_COUNTER: Counter[str] = Counter()

# Function parameterization regexes can specify to provide a customized replacement string. Can also
# be used to do conditional replacement, by returning the original value in cases where replacement
# shouldn't happen.
ParameterizationReplacementFunction = Callable[[str], str]


# Log examples, up to the given limit.
def _log_example_data(
key: str, # Key used for tracking log count and in logger `event` string
extra: dict[str, Any], # Extra data to add to the log (should include example data)
limit: int = 100, # Number of logs to be gathered per deployment
) -> None:
# Note: In a multi-threaded environment, it's possible to run into a race condition where
# multiple threads are simultaneously logging what should theoretically be the last example. As
# result, we may end up logging a few more examples than the given limit. To fix this, we'd need
# to wrap everything here in a lock, but given that a few extra logs don't hurt anything, it's
# not worth blocking ingest by doing so.
if LOGGING_COUNTER[key] < limit:
logger.info(f"grouping.parameterization.{key}", extra=extra)
LOGGING_COUNTER[key] += 1


@dataclasses.dataclass
class ParameterizationRegex:
name: str # name of the pattern (also used as group name in combined regex)
Expand Down Expand Up @@ -445,6 +469,10 @@ def _handle_regex_match(match: re.Match[str]) -> str:
metrics.incr(
"grouping.parameterization_false_positive", tags={"key": matched_key}
)
# TODO: Remove this once we have enough sample data
_log_example_data(
"ip_false_positive", extra={"input_str": input_str, "value": orig_value}
)

return replacement_string

Expand Down
31 changes: 31 additions & 0 deletions tests/sentry/grouping/test_parameterization.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from sentry.grouping.parameterization import (
ParameterizationRegex,
Parameterizer,
_log_example_data,
experimental_parameterizer,
is_valid_ip,
parameterizer,
Expand Down Expand Up @@ -717,3 +718,33 @@ def test_replacement_callback_false_positive_triggers_individual_regex_fallback(
)
== 1
)


@patch("sentry.grouping.parameterization.logger")
def test_example_data_logging(mock_logger: MagicMock) -> None:
for i in range(15):
_log_example_data("dog_fact_1", extra={"input_str": "dogs are great", "num": i}, limit=10)

for i in range(105):
_log_example_data("dog_fact_2", extra={"input_str": "all dogs are good dogs", "num": i})

# In the first loop, we specified a limit of 10, so the logger was called 10 times, even though
# we called the helper 15 times
assert (
count_matching_calls(
mock_logger.info,
"grouping.parameterization.dog_fact_1",
extra={"input_str": "dogs are great", "num": ANY},
)
== 10
)
# In the second loop, we didn't specify a limit, so the logger was called 100 times (the
# default limit), even though we called the helper 105 times
assert (
count_matching_calls(
mock_logger.info,
"grouping.parameterization.dog_fact_2",
extra={"input_str": "all dogs are good dogs", "num": ANY},
)
== 100
)
Loading