diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index c225f21da8a8..619260cee36d 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -22,6 +22,7 @@ import java.nio.ByteBuffer; import java.util.Collection; +import java.util.Comparator; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -29,6 +30,7 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; +import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.NaNUtil; @@ -462,6 +464,43 @@ public Boolean notIn(BoundReference ref, Set literalSet) { @Override public Boolean startsWith(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + if (isNestedColumn(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (canContainNulls(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + String prefix = (String) lit.value(); + Comparator comparator = Comparators.charSequences(); + + if (lowerBounds != null + && lowerBounds.containsKey(id) + && upperBounds != null + && upperBounds.containsKey(id)) { + CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id)); + + // if lower is shorter than the prefix then lower doesn't start with the prefix + if (lower.length() < prefix.length()) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (comparator.compare(lower.subSequence(0, prefix.length()), prefix) == 0) { + // if upper is shorter than the prefix then upper can't start with the prefix + if (upper.length() < prefix.length()) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (comparator.compare(upper.subSequence(0, prefix.length()), prefix) == 0) { + // both bounds start with the prefix, so all rows must start with the prefix + return ROWS_MUST_MATCH; + } + } + } + return ROWS_MIGHT_NOT_MATCH; } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index f34cd730df77..ef2bae2b89bf 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -33,6 +33,7 @@ import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Conversions.toByteBuffer; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; @@ -684,4 +685,114 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE); assertThat(shouldRead).as("notNull nested column should not match").isFalse(); } + + // String-focused file: required column 3 has no nulls and string bounds ["abc", "abd"] + private static final DataFile STRING_FILE = + new TestDataFile( + "string_file.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abd"))); + + // String file with wider range: required column 3 has no nulls and bounds ["aa", "dC"] + private static final DataFile STRING_FILE_2 = + new TestDataFile( + "string_file_2.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC"))); + + @Test + void testStartsWithBothBoundsMatchPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "ab")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: both bounds start with the prefix").isTrue(); + } + + @Test + void testStartsWithSingleCharPrefixBothBoundsMatch() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(STRING_FILE); + assertThat(shouldRead) + .as("Should match: both bounds start with the single char prefix") + .isTrue(); + } + + @Test + void testStartsWithOnlyLowerBoundMatchesPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "abc")).eval(STRING_FILE); + assertThat(shouldRead) + .as("Should not match: upper bound does not start with the prefix") + .isFalse(); + } + + @Test + void testStartsWithBoundsDoNotMatchPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "zzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: no bounds start with the prefix").isFalse(); + } + + @Test + void testStartsWithWiderRange() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(STRING_FILE_2); + assertThat(shouldRead) + .as("Should not match: upper bound does not start with the prefix") + .isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "e")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: no bounds start with the prefix").isFalse(); + } + + @Test + void testStartsWithNoStats() { + boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(FILE); + assertThat(shouldRead).as("Should not match: no bounds available for column").isFalse(); + } + + @Test + void testStartsWithAllNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("all_nulls", "a")).eval(FILE); + assertThat(shouldRead) + .as("Should not match: all null values do not satisfy startsWith") + .isFalse(); + } + + @Test + void testStartsWithSomeNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("some_nulls", "b")).eval(FILE_2); + assertThat(shouldRead) + .as("Should not match: some nulls means not all rows can satisfy startsWith") + .isFalse(); + } + + @Test + void testStartsWithPrefixLongerThanBounds() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "abcdef")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: prefix is longer than the bounds").isFalse(); + } }