velox/functions/sparksql/aggregates/CollectSetAggregate.cpp (45 changes: 13 additions & 32 deletions)

```diff
@@ -161,19 +161,10 @@ class SparkCollectSetAggregate
       SBase::clearNull(group);
       auto tracker = SBase::trackRowSize(group);
       auto decodedIndex = SBase::decoded_.index(i);
-      if (ignoreNulls_) {
-        SBase::value(group)->addNonNullValues(
-            *baseArray,
-            decodedIndex,
-            SBase::decodedElements_,
-            SBase::allocator_);
-      } else {
-        SBase::value(group)->addValues(
-            *baseArray,
-            decodedIndex,
-            SBase::decodedElements_,
-            SBase::allocator_);
-      }
+      // Intermediate results already have null filtering applied by the
+      // partial step. Always preserve all elements (including nulls) here.
```
Collaborator:
> Intermediate results already have null filtering applied by the partial step.

Velox supports flushing during partial aggregation. When this happens, the intermediate results are left unaggregated, with the final aggregation step responsible for processing them. Could this cause any result issues?

Contributor Author:
Good point! This is safe because:

1. When the partial step flushes (toIntermediate is called), null filtering is already applied there:
   - ignoreNulls_=true: null inputs become empty arrays (size=0), so the intermediate output contains no null elements.
   - ignoreNulls_=false: Base::toIntermediate wraps each value (including nulls) into [value] arrays.
2. The final step receives pre-filtered data: since toIntermediate already handles null filtering, addIntermediateResults just needs to merge arrays, and using addValues (preserve everything) is correct in both cases.
3. No behavior change: before this fix, the final/intermediate nodes had ignoreNulls_={false} by default, which also always used addValues in the intermediate path. This change makes that explicit and removes the dead addNonNullValues branch that was never reachable in the final step.

```diff
+      SBase::value(group)->addValues(
+          *baseArray, decodedIndex, SBase::decodedElements_, SBase::allocator_);
     });
   }
```

```diff
@@ -193,29 +184,19 @@ class SparkCollectSetAggregate
       }
       SBase::clearNull(group);
       auto decodedIndex = SBase::decoded_.index(i);
-      if (ignoreNulls_) {
-        accumulator->addNonNullValues(
-            *baseArray,
-            decodedIndex,
-            SBase::decodedElements_,
-            SBase::allocator_);
-      } else {
-        accumulator->addValues(
-            *baseArray,
-            decodedIndex,
-            SBase::decodedElements_,
-            SBase::allocator_);
-      }
+      // Intermediate results already have null filtering applied by the
+      // partial step. Always preserve all elements (including nulls) here.
+      accumulator->addValues(
+          *baseArray, decodedIndex, SBase::decodedElements_, SBase::allocator_);
     });
   }

 private:
-  // Initialized via setConstantInputs() from the constant boolean argument.
-  // Default is false (conservative: keeps nulls). In partial+final mode,
-  // the final node doesn't receive the boolean constant, so it uses this
-  // default, which is safe because the partial node already handles null
-  // filtering based on the actual constant value.
-  bool ignoreNulls_{false};
+  // Default to true (Spark's default: IGNORE NULLS). Updated by
+  // setConstantInputs() when a 2-arg signature provides an explicit value.
+  // Only used in addRawInput (partial/single step); intermediate/final
+  // steps always preserve all elements from the partial output.
+  bool ignoreNulls_{true};
 };

 std::unique_ptr<exec::Aggregate> createSetAgg(
```
```diff
@@ -306,6 +306,33 @@ TEST_F(CollectSetAggregateTest, unknownType) {
   testAggregations({data}, {"c0"}, {"collect_set(c0, true)"}, {}, {expected});
 }

+// Verify the 1-arg collect_set(c0) defaults to ignoring nulls (Spark default).
+TEST_F(CollectSetAggregateTest, defaultIgnoreNulls) {
+  auto data = makeRowVector({
+      makeNullableFlatVector<int32_t>(
+          {1, 2, std::nullopt, 4, 5, std::nullopt, 4, 2}),
+  });
+
+  auto expected = makeRowVector({
+      makeArrayVectorFromJson<int32_t>({"[1, 2, 4, 5]"}),
+  });
+
+  // 1-arg signature: no explicit ignoreNulls boolean.
+  testAggregations(
+      {data}, {}, {"collect_set(c0)"}, {"spark_array_sort(a0)"}, {expected});
+
+  // All null inputs: returns empty array (nulls ignored).
+  data = makeRowVector({
+      makeAllNullFlatVector<int32_t>(5),
+  });
+
+  expected = makeRowVector({
+      makeArrayVectorFromJson<int32_t>({"[]"}),
+  });
+
+  testAggregations({data}, {}, {"collect_set(c0)"}, {}, {expected});
+}
+
 // Verify that collect_set(c0, true) correctly ignores null inputs.
 TEST_F(CollectSetAggregateTest, explicitIgnoreNullsTrue) {
   auto data = makeRowVector({
```