Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@

import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor;
import org.apache.iceberg.types.Comparators;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.util.NaNUtil;
Expand Down Expand Up @@ -462,6 +464,43 @@ public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) {

/**
 * Strict evaluation of a {@code startsWith} predicate against the file's column bounds.
 *
 * <p>Returns {@code ROWS_MUST_MATCH} only when the column is not nested, cannot contain nulls,
 * has both a lower and an upper bound, and both bounds begin with the literal prefix. Every
 * other case returns {@code ROWS_MIGHT_NOT_MATCH}.
 *
 * @param ref bound reference identifying the column being tested
 * @param lit literal holding the prefix; its value is cast to {@code String}
 * @return {@code ROWS_MUST_MATCH} when both bounds start with the prefix, otherwise {@code
 *     ROWS_MIGHT_NOT_MATCH}
 */
@Override
public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {
  int fieldId = ref.fieldId();

  // nested columns and columns that may hold nulls are never reported as a strict match
  if (isNestedColumn(fieldId) || canContainNulls(fieldId)) {
    return ROWS_MIGHT_NOT_MATCH;
  }

  String prefix = (String) lit.value();
  Comparator<CharSequence> charSeqComparator = Comparators.charSequences();

  // both bounds are required; a single bound cannot prove anything about every row
  boolean hasLowerBound = lowerBounds != null && lowerBounds.containsKey(fieldId);
  if (!(hasLowerBound && upperBounds != null && upperBounds.containsKey(fieldId))) {
    return ROWS_MIGHT_NOT_MATCH;
  }

  CharSequence lowerBound = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(fieldId));
  CharSequence upperBound = Conversions.fromByteBuffer(ref.type(), upperBounds.get(fieldId));
  int prefixLength = prefix.length();

  // a lower bound shorter than the prefix cannot start with the prefix
  if (lowerBound.length() < prefixLength) {
    return ROWS_MIGHT_NOT_MATCH;
  }

  if (charSeqComparator.compare(lowerBound.subSequence(0, prefixLength), prefix) != 0) {
    return ROWS_MIGHT_NOT_MATCH;
  }

  // an upper bound shorter than the prefix cannot start with the prefix
  if (upperBound.length() < prefixLength) {
    return ROWS_MIGHT_NOT_MATCH;
  }

  // both bounds start with the prefix, so all rows must start with the prefix
  boolean upperHasPrefix =
      charSeqComparator.compare(upperBound.subSequence(0, prefixLength), prefix) == 0;
  return upperHasPrefix ? ROWS_MUST_MATCH : ROWS_MIGHT_NOT_MATCH;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import static org.apache.iceberg.expressions.Expressions.notNaN;
import static org.apache.iceberg.expressions.Expressions.notNull;
import static org.apache.iceberg.expressions.Expressions.or;
import static org.apache.iceberg.expressions.Expressions.startsWith;
import static org.apache.iceberg.types.Conversions.toByteBuffer;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;
Expand Down Expand Up @@ -684,4 +685,114 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE))
new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE);
assertThat(shouldRead).as("notNull nested column should not match").isFalse();
}

// Data file for the startsWith tests with a narrow string range. Field id 3 (looked up as
// "required" by the tests that use this file) has a value count of 50, no null-count entry,
// and string bounds ["abc", "abd"].
private static final DataFile STRING_FILE =
new TestDataFile(
"string_file.avro",
Row.of(),
// NOTE(review): presumably the record count -- confirm against the TestDataFile constructor
50,
// any value counts, including nulls
ImmutableMap.of(3, 50L),
// null value counts (empty map: no null count is recorded for any field)
ImmutableMap.of(),
// nan value counts (none supplied)
null,
// lower bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")),
// upper bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "abd")));

// Data file for the startsWith tests with a wide string range. Field id 3 (looked up as
// "required" by the tests that use this file) has a value count of 50, no null-count entry,
// and string bounds ["aa", "dC"], so the two bounds share no common prefix.
private static final DataFile STRING_FILE_2 =
new TestDataFile(
"string_file_2.avro",
Row.of(),
// NOTE(review): presumably the record count -- confirm against the TestDataFile constructor
50,
// any value counts, including nulls
ImmutableMap.of(3, 50L),
// null value counts (empty map: no null count is recorded for any field)
ImmutableMap.of(),
// nan value counts (none supplied)
null,
// lower bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")),
// upper bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC")));

@Test
void testStartsWithBothBoundsMatchPrefix() {
  // STRING_FILE bounds are "abc" and "abd"; both begin with "ab"
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, startsWith("required", "ab"));

  boolean allRowsMatch = evaluator.eval(STRING_FILE);

  assertThat(allRowsMatch).as("Should match: both bounds start with the prefix").isTrue();
}

@Test
void testStartsWithSingleCharPrefixBothBoundsMatch() {
  // STRING_FILE bounds are "abc" and "abd"; both begin with the one-character prefix "a"
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a"));

  boolean allRowsMatch = evaluator.eval(STRING_FILE);

  assertThat(allRowsMatch)
      .as("Should match: both bounds start with the single char prefix")
      .isTrue();
}

@Test
void testStartsWithOnlyLowerBoundMatchesPrefix() {
  // the lower bound "abc" equals the prefix, but the upper bound "abd" does not start with it
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, startsWith("required", "abc"));

  boolean allRowsMatch = evaluator.eval(STRING_FILE);

  assertThat(allRowsMatch)
      .as("Should not match: upper bound does not start with the prefix")
      .isFalse();
}

@Test
void testStartsWithBoundsDoNotMatchPrefix() {
  // neither "abc" nor "abd" begins with "zzz"
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, startsWith("required", "zzz"));

  boolean allRowsMatch = evaluator.eval(STRING_FILE);

  assertThat(allRowsMatch).as("Should not match: no bounds start with the prefix").isFalse();
}

@Test
void testStartsWithWiderRange() {
  // STRING_FILE_2 bounds are "aa" and "dC": only the lower bound begins with "a"
  boolean matchesPrefixA =
      new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(STRING_FILE_2);
  assertThat(matchesPrefixA)
      .as("Should not match: upper bound does not start with the prefix")
      .isFalse();

  // neither bound begins with "e"
  boolean matchesPrefixE =
      new StrictMetricsEvaluator(SCHEMA, startsWith("required", "e")).eval(STRING_FILE_2);
  assertThat(matchesPrefixE).as("Should not match: no bounds start with the prefix").isFalse();
}

@Test
void testStartsWithNoStats() {
  // NOTE(review): FILE is declared outside this view; per the assertion message it carries no
  // bounds for the "required" column -- confirm against its definition
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a"));

  boolean allRowsMatch = evaluator.eval(FILE);

  assertThat(allRowsMatch).as("Should not match: no bounds available for column").isFalse();
}

@Test
void testStartsWithAllNulls() {
  // NOTE(review): FILE is declared outside this view; "all_nulls" is presumably a column whose
  // values are all null -- confirm against the fixture
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, startsWith("all_nulls", "a"));

  boolean allRowsMatch = evaluator.eval(FILE);

  assertThat(allRowsMatch)
      .as("Should not match: all null values do not satisfy startsWith")
      .isFalse();
}

@Test
void testStartsWithSomeNulls() {
  // NOTE(review): FILE_2 is declared outside this view; "some_nulls" presumably has a non-zero
  // null count there -- confirm against the fixture
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, startsWith("some_nulls", "b"));

  boolean allRowsMatch = evaluator.eval(FILE_2);

  assertThat(allRowsMatch)
      .as("Should not match: some nulls means not all rows can satisfy startsWith")
      .isFalse();
}

@Test
void testStartsWithPrefixLongerThanBounds() {
  // the six-character prefix "abcdef" is longer than the three-character bounds "abc"/"abd"
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, startsWith("required", "abcdef"));

  boolean allRowsMatch = evaluator.eval(STRING_FILE);

  assertThat(allRowsMatch).as("Should not match: prefix is longer than the bounds").isFalse();
}
}
Loading