From 5a0979989c76aefd217b065b3256fcc8668f42f3 Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Fri, 3 Apr 2026 17:29:32 -0700 Subject: [PATCH] API: Use column bounds to evaluate startsWith in StrictMetricsEvaluator StrictMetricsEvaluator.startsWith() previously returned ROWS_MIGHT_NOT_MATCH unconditionally, without using column bounds. This misses the opportunity to determine that all rows in a file must match when both lower and upper bounds start with the given prefix. When both bounds start with the prefix, all values between them must also start with it, so the evaluator can return ROWS_MUST_MATCH. This enables whole-file matching optimizations for prefix-based operations. --- .../expressions/StrictMetricsEvaluator.java | 39 ++++++ .../TestStrictMetricsEvaluator.java | 111 ++++++++++++++++++ 2 files changed, 150 insertions(+) diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index c225f21da8a8..619260cee36d 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -22,6 +22,7 @@ import java.nio.ByteBuffer; import java.util.Collection; +import java.util.Comparator; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -29,6 +30,7 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; +import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.NaNUtil; @@ -462,6 +464,43 @@ public Boolean notIn(BoundReference ref, Set literalSet) { @Override public Boolean startsWith(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + if (isNestedColumn(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (canContainNulls(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + String prefix = (String) lit.value(); + Comparator comparator = Comparators.charSequences(); + + if (lowerBounds != null + && lowerBounds.containsKey(id) + && upperBounds != null + && upperBounds.containsKey(id)) { + CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id)); + + // if lower is shorter than the prefix then lower doesn't start with the prefix + if (lower.length() < prefix.length()) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (comparator.compare(lower.subSequence(0, prefix.length()), prefix) == 0) { + // if upper is shorter than the prefix then upper can't start with the prefix + if (upper.length() < prefix.length()) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (comparator.compare(upper.subSequence(0, prefix.length()), prefix) == 0) { + // both bounds start with the prefix, so all rows must start with the prefix + return ROWS_MUST_MATCH; + } + } + } + return ROWS_MIGHT_NOT_MATCH; } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index f34cd730df77..ef2bae2b89bf 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -33,6 +33,7 @@ import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Conversions.toByteBuffer; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; @@ -684,4 +685,114 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE); assertThat(shouldRead).as("notNull nested column should not match").isFalse(); } + + // String-focused file: required column 3 has no nulls and string bounds ["abc", "abd"] + private static final DataFile STRING_FILE = + new TestDataFile( + "string_file.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abd"))); + + // String file with wider range: required column 3 has no nulls and bounds ["aa", "dC"] + private static final DataFile STRING_FILE_2 = + new TestDataFile( + "string_file_2.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC"))); + + @Test + void testStartsWithBothBoundsMatchPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "ab")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: both bounds start with the prefix").isTrue(); + } + + @Test + void testStartsWithSingleCharPrefixBothBoundsMatch() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(STRING_FILE); + assertThat(shouldRead) + .as("Should match: both bounds start with the single char prefix") + .isTrue(); + } + + @Test + void testStartsWithOnlyLowerBoundMatchesPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "abc")).eval(STRING_FILE); + assertThat(shouldRead) + .as("Should not match: upper bound does not start with the prefix") + .isFalse(); + } + + @Test + void testStartsWithBoundsDoNotMatchPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "zzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: no bounds start with the prefix").isFalse(); + } + + @Test + void testStartsWithWiderRange() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(STRING_FILE_2); + assertThat(shouldRead) + .as("Should not match: upper bound does not start with the prefix") + .isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "e")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: no bounds start with the prefix").isFalse(); + } + + @Test + void testStartsWithNoStats() { + boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(FILE); + assertThat(shouldRead).as("Should not match: no bounds available for column").isFalse(); + } + + @Test + void testStartsWithAllNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("all_nulls", "a")).eval(FILE); + assertThat(shouldRead) + .as("Should not match: all null values do not satisfy startsWith") + .isFalse(); + } + + @Test + void testStartsWithSomeNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("some_nulls", "b")).eval(FILE_2); + assertThat(shouldRead) + .as("Should not match: some nulls means not all rows can satisfy startsWith") + .isFalse(); + } + + @Test + void testStartsWithPrefixLongerThanBounds() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "abcdef")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: prefix is longer than the bounds").isFalse(); + } }