Skip to content

Commit 5f2b4e0

Browse files
authored
Make row hasher 64-bit hashing compatible (rapidsai#20777)
Closes: rapidsai#20757 This PR modifies the normal row hasher and the primitive row hasher to enable compatibility with 64-bit hashing. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: rapidsai#20777
1 parent db57900 commit 5f2b4e0

File tree

4 files changed

+108
-44
lines changed

4 files changed

+108
-44
lines changed

cpp/include/cudf/detail/row_operator/hashing.cuh

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <cudf/utilities/type_dispatcher.hpp>
2323

2424
#include <cuda/std/limits>
25+
#include <cuda/std/type_traits>
2526
#include <thrust/iterator/transform_iterator.h>
2627

2728
#include <memory>
@@ -38,6 +39,8 @@ namespace detail::row::hash {
3839
template <template <typename> class hash_function, typename Nullate>
3940
class element_hasher {
4041
public:
42+
using result_type = cuda::std::invoke_result_t<hash_function<int32_t>, int32_t>;
43+
4144
/**
4245
* @brief Constructs an element_hasher object.
4346
*
@@ -47,8 +50,8 @@ class element_hasher {
4750
*/
4851
__device__ element_hasher(
4952
Nullate nulls,
50-
uint32_t seed = DEFAULT_HASH_SEED,
51-
hash_value_type null_hash = cuda::std::numeric_limits<hash_value_type>::max()) noexcept
53+
result_type seed = DEFAULT_HASH_SEED,
54+
result_type null_hash = cuda::std::numeric_limits<result_type>::max()) noexcept
5255
: _check_nulls(nulls), _seed(seed), _null_hash(null_hash)
5356
{
5457
}
@@ -62,8 +65,8 @@ class element_hasher {
6265
* @return The hash value of the given element
6366
*/
6467
template <typename T>
65-
__device__ hash_value_type operator()(column_device_view const& col,
66-
size_type row_index) const noexcept
68+
__device__ result_type operator()(column_device_view const& col,
69+
size_type row_index) const noexcept
6770
requires(column_device_view::has_element_accessor<T>())
6871
{
6972
if (_check_nulls && col.is_null(row_index)) { return _null_hash; }
@@ -79,16 +82,17 @@ class element_hasher {
7982
* @return The hash value of the given element
8083
*/
8184
template <typename T>
82-
__device__ hash_value_type operator()(column_device_view const& col,
83-
size_type row_index) const noexcept
85+
__device__ result_type operator()(column_device_view const& col,
86+
size_type row_index) const noexcept
8487
requires(not column_device_view::has_element_accessor<T>())
8588
{
8689
CUDF_UNREACHABLE("Unsupported type in hash.");
8790
}
8891

8992
Nullate _check_nulls;
90-
uint32_t _seed;
91-
hash_value_type _null_hash;
93+
// Assumes seeds are the same as the result type of the hash function
94+
result_type _seed;
95+
result_type _null_hash;
9296
};
9397

9498
/**
@@ -102,21 +106,20 @@ class device_row_hasher {
102106
friend class row_hasher;
103107

104108
public:
109+
using result_type = cuda::std::invoke_result_t<hash_function<int32_t>, int32_t>;
110+
105111
/**
106112
* @brief Return the hash value of a row in the given table.
107113
*
108114
* @param row_index The row index to compute the hash value of
109115
* @return The hash value of the row
110116
*/
111-
__device__ auto operator()(size_type row_index) const noexcept
117+
__device__ result_type operator()(size_type row_index) const noexcept
112118
{
113119
auto it =
114120
thrust::make_transform_iterator(_table.begin(), [row_index, this](auto const& column) {
115121
return cudf::type_dispatcher<dispatch_storage_type>(
116-
column.type(),
117-
element_hasher_adapter<hash_function>{_check_nulls, _seed},
118-
column,
119-
row_index);
122+
column.type(), element_hasher_adapter{_check_nulls, _seed}, column, row_index);
120123
});
121124

122125
return detail::accumulate(it, it + _table.num_columns(), _seed, [](auto hash, auto h) {
@@ -132,31 +135,30 @@ class device_row_hasher {
132135
* When the column is nested, this uses the element_hasher to hash the shape and values of the
133136
* column.
134137
*/
135-
template <template <typename> class hash_fn>
136138
class element_hasher_adapter {
137-
static constexpr hash_value_type NULL_HASH = cuda::std::numeric_limits<hash_value_type>::max();
138-
static constexpr hash_value_type NON_NULL_HASH = 0;
139+
static constexpr result_type NULL_HASH = cuda::std::numeric_limits<result_type>::max();
140+
static constexpr result_type NON_NULL_HASH = 0;
139141

140142
public:
141-
__device__ element_hasher_adapter(Nullate check_nulls, uint32_t seed) noexcept
143+
__device__ element_hasher_adapter(Nullate check_nulls, result_type seed) noexcept
142144
: _element_hasher(check_nulls, seed), _check_nulls(check_nulls)
143145
{
144146
}
145147

146148
template <typename T>
147-
__device__ hash_value_type operator()(column_device_view const& col,
148-
size_type row_index) const noexcept
149+
__device__ result_type operator()(column_device_view const& col,
150+
size_type row_index) const noexcept
149151
requires(not cudf::is_nested<T>())
150152
{
151153
return _element_hasher.template operator()<T>(col, row_index);
152154
}
153155

154156
template <typename T>
155-
__device__ hash_value_type operator()(column_device_view const& col,
156-
size_type row_index) const noexcept
157+
__device__ result_type operator()(column_device_view const& col,
158+
size_type row_index) const noexcept
157159
requires(cudf::is_nested<T>())
158160
{
159-
auto hash = hash_value_type{0};
161+
auto hash = result_type{0};
160162
column_device_view curr_col = col.slice(row_index, 1);
161163
while (curr_col.type().id() == type_id::STRUCT || curr_col.type().id() == type_id::LIST) {
162164
if (_check_nulls) {
@@ -175,7 +177,7 @@ class device_row_hasher {
175177
auto list_sizes = make_list_size_iterator(list_col);
176178
hash = detail::accumulate(
177179
list_sizes, list_sizes + list_col.size(), hash, [](auto hash, auto size) {
178-
return cudf::hashing::detail::hash_combine(hash, hash_fn<size_type>{}(size));
180+
return cudf::hashing::detail::hash_combine(hash, hash_function<size_type>{}(size));
179181
});
180182
curr_col = list_col.get_sliced_child();
181183
}
@@ -188,20 +190,21 @@ class device_row_hasher {
188190
return hash;
189191
}
190192

191-
element_hasher<hash_fn, Nullate> const _element_hasher;
193+
element_hasher<hash_function, Nullate> const _element_hasher;
192194
Nullate const _check_nulls;
193195
};
194196

195197
CUDF_HOST_DEVICE device_row_hasher(Nullate check_nulls,
196198
table_device_view t,
197-
uint32_t seed = DEFAULT_HASH_SEED) noexcept
199+
result_type seed = DEFAULT_HASH_SEED) noexcept
198200
: _check_nulls{check_nulls}, _table{t}, _seed(seed)
199201
{
200202
}
201203

202204
Nullate const _check_nulls;
203205
table_device_view const _table;
204-
uint32_t const _seed;
206+
// Assumes seeds are the same as the result type of the hash function
207+
result_type const _seed;
205208
};
206209

207210
/**
@@ -236,11 +239,12 @@ class row_hasher {
236239
/**
237240
* @brief Get the hash operator to use on the device
238241
*
239-
* Returns a unary callable, `F`, with signature `hash_function::hash_value_type F(size_type)`.
240-
*
241-
* `F(i)` returns the hash of row i.
242+
* Returns a unary callable, `F`, where `F(i)` returns the hash value of row i.
242243
*
244+
* @tparam hash_function Hash functor to use for hashing elements
245+
* @tparam DeviceRowHasher The device row hasher type to use
243246
* @tparam Nullate A cudf::nullate type describing whether to check for nulls
247+
*
244248
* @param nullate Indicates if any input column contains nulls
245249
* @param seed The seed to use for the hash function
246250
* @return A hash operator to use on the device
@@ -249,8 +253,9 @@ class row_hasher {
249253
template <typename> class hash_function = cudf::hashing::detail::default_hash,
250254
template <template <typename> class, typename> class DeviceRowHasher = device_row_hasher,
251255
typename Nullate>
252-
DeviceRowHasher<hash_function, Nullate> device_hasher(Nullate nullate = {},
253-
uint32_t seed = DEFAULT_HASH_SEED) const
256+
DeviceRowHasher<hash_function, Nullate> device_hasher(
257+
Nullate nullate = {},
258+
cuda::std::invoke_result_t<hash_function<int32_t>, int32_t> seed = DEFAULT_HASH_SEED) const
254259
{
255260
return DeviceRowHasher<hash_function, Nullate>(nullate, *d_t, seed);
256261
}

cpp/include/cudf/detail/row_operator/primitive_row_operators.cuh

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,8 @@ class row_equality_comparator {
170170
template <template <typename> class Hash>
171171
class element_hasher {
172172
public:
173+
using result_type = cuda::std::invoke_result_t<Hash<int32_t>, int32_t>;
174+
173175
/**
174176
* @brief Returns the hash value of the given element in the given column.
175177
*
@@ -180,16 +182,16 @@ class element_hasher {
180182
* @return The hash value of the given element
181183
*/
182184
template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
183-
__device__ hash_value_type operator()(hash_value_type seed,
184-
column_device_view const& col,
185-
size_type row_index) const
185+
__device__ result_type operator()(result_type seed,
186+
column_device_view const& col,
187+
size_type row_index) const
186188
{
187189
return Hash<T>{seed}(col.element<T>(row_index));
188190
}
189191

190192
// @cond
191193
template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
192-
__device__ hash_value_type operator()(hash_value_type, column_device_view const&, size_type) const
194+
__device__ result_type operator()(result_type, column_device_view const&, size_type) const
193195
{
194196
CUDF_UNREACHABLE("Unsupported type in hash.");
195197
}
@@ -204,6 +206,8 @@ class element_hasher {
204206
template <template <typename> class Hash = cudf::hashing::detail::default_hash>
205207
class row_hasher {
206208
public:
209+
using result_type = cuda::std::invoke_result_t<Hash<int32_t>, int32_t>;
210+
207211
row_hasher() = delete;
208212

209213
/**
@@ -215,7 +219,7 @@ class row_hasher {
215219
*/
216220
row_hasher(cudf::nullate::DYNAMIC const& has_nulls,
217221
table_device_view t,
218-
hash_value_type seed = DEFAULT_HASH_SEED)
222+
result_type seed = DEFAULT_HASH_SEED)
219223
: _has_nulls{has_nulls}, _table{t}, _seed{seed}
220224
{
221225
}
@@ -229,7 +233,7 @@ class row_hasher {
229233
*/
230234
row_hasher(cudf::nullate::DYNAMIC const& has_nulls,
231235
std::shared_ptr<cudf::detail::row::equality::preprocessed_table> t,
232-
hash_value_type seed = DEFAULT_HASH_SEED)
236+
result_type seed = DEFAULT_HASH_SEED)
233237
: _has_nulls{has_nulls}, _table{*t}, _seed{seed}
234238
{
235239
}
@@ -243,8 +247,7 @@ class row_hasher {
243247
__device__ auto operator()(size_type row_index) const
244248
{
245249
element_hasher<Hash> hasher;
246-
// avoid hash combine call if there is only one column
247-
auto hash = cuda::std::numeric_limits<hash_value_type>::max();
250+
auto hash = cuda::std::numeric_limits<result_type>::max();
248251
if (!_has_nulls || !_table.column(0).is_null(row_index)) {
249252
hash = cudf::type_dispatcher<dispatch_primitive_type>(
250253
_table.column(0).type(), hasher, _seed, _table.column(0), row_index);
@@ -257,8 +260,8 @@ class row_hasher {
257260
cudf::type_dispatcher<dispatch_primitive_type>(
258261
_table.column(i).type(), hasher, _seed, _table.column(i), row_index));
259262
} else {
260-
hash = cudf::hashing::detail::hash_combine(
261-
hash, cuda::std::numeric_limits<hash_value_type>::max());
263+
hash =
264+
cudf::hashing::detail::hash_combine(hash, cuda::std::numeric_limits<result_type>::max());
262265
}
263266
}
264267
return hash;
@@ -267,7 +270,7 @@ class row_hasher {
267270
private:
268271
cudf::nullate::DYNAMIC _has_nulls;
269272
table_device_view _table;
270-
hash_value_type _seed;
273+
result_type _seed;
271274
};
272275

273276
} // namespace row::primitive

cpp/include/cudf/hashing/detail/hashing.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <rmm/cuda_stream_view.hpp>
1212

1313
#include <cstddef>
14+
#include <cstdint>
1415
#include <functional>
1516

1617
namespace CUDF_EXPORT cudf {
@@ -95,7 +96,7 @@ CUDF_HOST_DEVICE constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs)
9596
* http://www.boost.org/LICENSE_1_0.txt)
9697
*/
9798
/**
98-
* @brief Combines two hash values into a single hash value.
99+
* @brief Combines two 64-bit hash values into a single hash value.
99100
*
100101
* Adapted from Boost hash_combine function and modified for 64-bit.
101102
* https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html
@@ -104,7 +105,7 @@ CUDF_HOST_DEVICE constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs)
104105
* @param rhs The second hash value
105106
* @return Combined hash value
106107
*/
107-
constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs)
108+
CUDF_HOST_DEVICE constexpr uint64_t hash_combine(uint64_t lhs, uint64_t rhs)
108109
{
109110
return lhs ^ (rhs + 0x9e37'79b9'7f4a'7c15 + (lhs << 6) + (lhs >> 2));
110111
}

cpp/tests/row_operator/row_operator_tests.cu

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,17 @@
1111
#include <cudf_test/type_lists.hpp>
1212

1313
#include <cudf/detail/row_operator/equality.cuh>
14+
#include <cudf/detail/row_operator/hashing.cuh>
1415
#include <cudf/detail/row_operator/lexicographic.cuh>
16+
#include <cudf/detail/row_operator/primitive_row_operators.cuh>
17+
#include <cudf/hashing/detail/xxhash_64.cuh>
1518
#include <cudf/strings/strings_column_view.hpp>
1619

1720
#include <rmm/cuda_stream_view.hpp>
21+
#include <rmm/exec_policy.hpp>
22+
23+
#include <thrust/iterator/counting_iterator.h>
24+
#include <thrust/transform.h>
1825

1926
template <typename T>
2027
struct TypedTableViewTest : public cudf::test::BaseFixture {};
@@ -332,3 +339,51 @@ TEST_F(RowOperatorTest, TestCheckShapeCompatibility)
332339
cudf::detail::row::equality::two_table_comparator(string_table, numeric_table, stream),
333340
std::invalid_argument);
334341
}
342+
343+
TEST_F(RowOperatorTest, TestRowHasher64BitHash)
344+
{
345+
auto const col = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 42, 123456789}};
346+
auto const input = cudf::table_view{{col}};
347+
348+
auto const stream = cudf::get_default_stream();
349+
auto const preprocessed = cudf::detail::row::hash::preprocessed_table::create(input, stream);
350+
auto const row_hasher = cudf::detail::row::hash::row_hasher{preprocessed};
351+
auto const hasher =
352+
row_hasher.device_hasher<cudf::hashing::detail::XXHash_64>(cudf::nullate::DYNAMIC{false});
353+
354+
auto results = cudf::test::fixed_width_column_wrapper<std::uint64_t>{{0, 0, 0}};
355+
thrust::transform(rmm::exec_policy_nosync(stream),
356+
thrust::counting_iterator<cudf::size_type>{0},
357+
thrust::counting_iterator<cudf::size_type>{3},
358+
cudf::mutable_column_view{results}.begin<std::uint64_t>(),
359+
hasher);
360+
361+
// Expected values are hash_combine(seed=0, xxhash_64(element))
362+
auto const expected = cudf::test::fixed_width_column_wrapper<std::uint64_t>{
363+
{15647511400073222857ul, 8470797489250732038ul, 2416304890555758815ul}};
364+
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results, expected);
365+
}
366+
367+
TEST_F(RowOperatorTest, TestPrimitiveRowHasher64BitHash)
368+
{
369+
auto const col = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 42, 123456789}};
370+
auto const input = cudf::table_view{{col}};
371+
372+
auto const stream = cudf::get_default_stream();
373+
auto const d_input = cudf::table_device_view::create(input, stream);
374+
375+
auto const hasher = cudf::detail::row::primitive::row_hasher<cudf::hashing::detail::XXHash_64>(
376+
cudf::nullate::DYNAMIC{false}, *d_input, static_cast<std::uint64_t>(cudf::DEFAULT_HASH_SEED));
377+
378+
auto results = cudf::test::fixed_width_column_wrapper<std::uint64_t>{{0, 0, 0}};
379+
380+
thrust::transform(rmm::exec_policy_nosync(stream),
381+
thrust::counting_iterator<cudf::size_type>{0},
382+
thrust::counting_iterator<cudf::size_type>{3},
383+
cudf::mutable_column_view{results}.begin<std::uint64_t>(),
384+
hasher);
385+
386+
auto const expected = cudf::test::fixed_width_column_wrapper<std::uint64_t>{
387+
{4246796580750024372ul, 15516826743637085169ul, 9462334144942111946ul}};
388+
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results, expected);
389+
}

0 commit comments

Comments
 (0)