Skip to content

Commit fe241e5

Browse files
authored
Allow early exit for left semi-/anti- joins with empty build/probe tables (#20452)
Closes #20410 Adds early exit condition in filtered join operations when input tables are empty. Includes test coverage for semi- and anti- join operations with empty table scenarios. Authors: - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: #20452
1 parent 3bfeeb8 commit fe241e5

File tree

2 files changed

+77
-0
lines changed

2 files changed

+77
-0
lines changed

cpp/src/join/filtered_join.cu

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <cuco/static_set_ref.cuh>
2929
#include <cuda/std/iterator>
3030
#include <thrust/iterator/counting_iterator.h>
31+
#include <thrust/sequence.h>
3132

3233
namespace cudf {
3334
namespace detail {
@@ -218,6 +219,7 @@ filtered_join::filtered_join(cudf::table_view const& build,
218219
rmm::mr::polymorphic_allocator<char>{},
219220
stream.value()}
220221
{
222+
if (_build.num_rows() == 0) return;
221223
_bucket_storage.initialize(empty_sentinel_key, stream);
222224
}
223225

@@ -228,6 +230,7 @@ distinct_filtered_join::distinct_filtered_join(cudf::table_view const& build,
228230
: filtered_join(build, compare_nulls, load_factor, stream)
229231
{
230232
cudf::scoped_range range{"distinct_filtered_join::distinct_filtered_join"};
233+
if (_build.num_rows() == 0) return;
231234
// Any mismatch in nullate between probe and build row operators results in UB. Ideally, nullate
232235
// should be determined by the logical OR of probe nulls and build nulls. However, since we do not
233236
// know if the probe has nulls apriori, we set nullate::DYNAMIC{true} (in the case of primitive
@@ -334,12 +337,28 @@ std::unique_ptr<rmm::device_uvector<cudf::size_type>> distinct_filtered_join::se
334337
std::unique_ptr<rmm::device_uvector<cudf::size_type>> distinct_filtered_join::semi_join(
335338
cudf::table_view const& probe, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
336339
{
340+
// Early return for empty build or probe table
341+
if (_build.num_rows() == 0 || probe.num_rows() == 0) {
342+
return std::make_unique<rmm::device_uvector<cudf::size_type>>(0, stream, mr);
343+
}
344+
337345
return semi_anti_join(probe, join_kind::LEFT_SEMI_JOIN, stream, mr);
338346
}
339347

340348
std::unique_ptr<rmm::device_uvector<cudf::size_type>> distinct_filtered_join::anti_join(
341349
cudf::table_view const& probe, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
342350
{
351+
// Early return for empty probe table
352+
if (probe.num_rows() == 0) {
353+
return std::make_unique<rmm::device_uvector<cudf::size_type>>(0, stream, mr);
354+
}
355+
if (_build.num_rows() == 0) {
356+
auto result =
357+
std::make_unique<rmm::device_uvector<cudf::size_type>>(probe.num_rows(), stream, mr);
358+
thrust::sequence(rmm::exec_policy_nosync(stream), result->begin(), result->end());
359+
return result;
360+
}
361+
343362
return semi_anti_join(probe, join_kind::LEFT_ANTI_JOIN, stream, mr);
344363
}
345364

cpp/tests/join/semi_anti_join_tests.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,3 +322,61 @@ TEST_F(JoinTest, AntiJoinWithStructsAndNullsOnOneSide)
322322
auto expected = cudf::gather(left, expected_indices_col);
323323
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected, *result);
324324
}
325+
326+
// Left anti-join where the build side, the probe side, or both have no rows.
TEST_F(JoinTest, AntiJoinEmptyTables)
{
  cudf::table no_rows_build{};
  cudf::table no_rows_probe{};
  column_wrapper<int32_t> values{0, 1, 2};
  auto three_rows = cudf::table_view{{values}};

  // Both sides empty: nothing to return.
  {
    auto actual      = left_anti_join(no_rows_probe, no_rows_build, {}, {});
    auto row_indices = column_wrapper<cudf::size_type>{};
    auto reference   = cudf::gather(no_rows_probe, row_indices);
    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*reference, *actual);
  }

  // Only the build side is empty: every probe row is expected back.
  {
    auto actual      = left_anti_join(three_rows, no_rows_build, {0}, {});
    auto row_indices = column_wrapper<cudf::size_type>{0, 1, 2};
    auto reference   = cudf::gather(three_rows, row_indices);
    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*reference, *actual);
  }

  // Only the probe side is empty: nothing to return.
  {
    auto actual      = left_anti_join(no_rows_probe, three_rows, {}, {0});
    auto row_indices = column_wrapper<cudf::size_type>{};
    auto reference   = cudf::gather(no_rows_probe, row_indices);
    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*reference, *actual);
  }
}
354+
355+
// Left semi-join where the build side, the probe side, or both have no rows.
// In every scenario the expected result has zero rows; the expectation is always gathered from
// the same table that is passed as the probe (left) side so that both tables share a schema.
TEST_F(JoinTest, SemiJoinEmptyTables)
{
  cudf::table empty_build_table{};
  cudf::table empty_probe_table{};
  column_wrapper<int32_t> col{0, 1, 2};
  auto nonempty_table = cudf::table_view{{col}};

  // Empty build and probe tables
  {
    auto result               = left_semi_join(empty_probe_table, empty_build_table, {}, {});
    auto expected_indices_col = column_wrapper<cudf::size_type>{};
    auto expected             = cudf::gather(empty_probe_table, expected_indices_col);
    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected, *result);
  }

  // Empty build table
  {
    auto result               = left_semi_join(nonempty_table, empty_build_table, {0}, {});
    auto expected_indices_col = column_wrapper<cudf::size_type>{};
    // Gather from the probe table actually used above (not from the column-less empty table):
    // the expectation is then a zero-row table with the probe table's columns.
    auto expected = cudf::gather(nonempty_table, expected_indices_col);
    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected, *result);
  }

  // Empty probe table
  {
    auto result               = left_semi_join(empty_probe_table, nonempty_table, {}, {0});
    auto expected_indices_col = column_wrapper<cudf::size_type>{};
    auto expected             = cudf::gather(empty_probe_table, expected_indices_col);
    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected, *result);
  }
}

0 commit comments

Comments
 (0)