Skip to content

Commit 82bd980

Browse files
authored
[Improvement](hash) opt for pack_fixeds (#59410)
<img width="298" height="2142" alt="图片" src="https://github.com/user-attachments/assets/3be70146-c4bd-4ff7-ac96-2645f89fed14" />

This pull request refactors and optimizes the handling of null maps and key packing in hash join and hash table code, with a focus on improving SIMD (Single Instruction, Multiple Data) usage and simplifying null bitmap logic. The changes replace older byte-searching utilities with new, more efficient SIMD-based functions, update how null bitmaps are packed and processed, and streamline column null data replacement. Additionally, the logic for determining hash key types and handling fixed key serialization is improved for better correctness and performance.

Key improvements and changes:

### SIMD utilities and null map handling

* Introduced new SIMD-based functions `contain_one` and `contain_zero` in `simd/bits.h`, replacing the older `memchr`-based `contain_byte` and related logic for checking the presence of ones or zeros in null maps, resulting in more efficient null detection. Note that `contain_one` reports any non-zero byte (not only the value `1`), and `contain_zero` reports any zero byte.
* Updated all usages of null map checks throughout the codebase to use the new `contain_one` and `contain_zero` functions, simplifying and unifying the logic for detecting nulls in columns and filters. [[1]](diffhunk://#diff-0732e01c1a3f38997ada381c43aff98286e86ca7519db5469a6e4dcdec5bce44L195-L200) [[2]](diffhunk://#diff-3110bab7d558f46b88ae1958b09ac369a92cac4bff98b280b2cf83d2d7aecbf4L117-R118) [[3]](diffhunk://#diff-3110bab7d558f46b88ae1958b09ac369a92cac4bff98b280b2cf83d2d7aecbf4L369-R371) [[4]](diffhunk://#diff-8981dd2e1f08aaa46a97aeef27bd906c64d1bb08deedc0fe1d94c1c49dc064ceL100-R100) [[5]](diffhunk://#diff-9fd61a223bcb3b7a9cb93c2d26c9364d8cce2131673fe286f22a80b09c6fd2c6L283-R283) [[6]](diffhunk://#diff-9fd61a223bcb3b7a9cb93c2d26c9364d8cce2131673fe286f22a80b09c6fd2c6L601-R605)

### Hash key and null bitmap packing

* Refactored the logic for packing null maps into hash keys in `MethodKeysFixed`, introducing new templates and helper functions for interleaved null map packing, and replacing the old bitmap size calculation with a simplified approach. This improves both performance and maintainability. [[1]](diffhunk://#diff-b8623712a5a1728bb77cc67b6ee1bbf16ef2b842044f6f6bab64c3fc5c4575f3R478-R540) [[2]](diffhunk://#diff-b8623712a5a1728bb77cc67b6ee1bbf16ef2b842044f6f6bab64c3fc5c4575f3L500-R611)
* Updated the logic for initializing and inserting keys, ensuring correct handling of nulls and simplifying offset calculations for key data. [[1]](diffhunk://#diff-b8623712a5a1728bb77cc67b6ee1bbf16ef2b842044f6f6bab64c3fc5c4575f3R653) [[2]](diffhunk://#diff-b8623712a5a1728bb77cc67b6ee1bbf16ef2b842044f6f6bab64c3fc5c4575f3L619-R692) [[3]](diffhunk://#diff-b8623712a5a1728bb77cc67b6ee1bbf16ef2b842044f6f6bab64c3fc5c4575f3L645-R712)

### Column null data replacement

* Simplified the `replace_column_null_data` methods for vector and decimal columns by removing the internal null-count early exit (the hash-join build and probe call sites now check `contain_one` before calling) and, for vector columns, hoisting the default value out of the replacement loop. [[1]](diffhunk://#diff-3fa47f544ff08bb2c8232af99312c0bbf2c58cac9da7a2b06473282b99ad5aa4L528-R530) [[2]](diffhunk://#diff-5fdf450def955da3201cc889aa870d94def054d1168f1ef3def32e8f009dc65aL526-L529)

### Hash key type logic

* Improved the logic for determining the hash key type in `hash_key_type.h`: when the number of key data types is greater than or equal to the bit size (`data_types.size() >= BITSIZE`), the key type now defaults to serialized keys; otherwise a single extra byte is reserved for the null bitmap when any key is nullable. [[1]](diffhunk://#diff-4f1fb8a89cd0e13a719c3427b1ae7581b42cb7325755a3ceac4c44bdc64bd144R83-R86) [[2]](diffhunk://#diff-4f1fb8a89cd0e13a719c3427b1ae7581b42cb7325755a3ceac4c44bdc64bd144L97-R101)

### Code cleanup and dependency updates

* Removed unused functions and updated includes to ensure all SIMD utilities are properly imported where needed. [[1]](diffhunk://#diff-b8623712a5a1728bb77cc67b6ee1bbf16ef2b842044f6f6bab64c3fc5c4575f3R20-R26) [[2]](diffhunk://#diff-b8623712a5a1728bb77cc67b6ee1bbf16ef2b842044f6f6bab64c3fc5c4575f3R36) [[3]](diffhunk://#diff-b8623712a5a1728bb77cc67b6ee1bbf16ef2b842044f6f6bab64c3fc5c4575f3L292-L295)

These changes collectively improve performance, maintainability, and correctness in hash join operations, especially in handling nullable columns and SIMD optimizations.
1 parent 4a0e4af commit 82bd980

File tree

14 files changed

+325
-90
lines changed

14 files changed

+325
-90
lines changed

be/src/pipeline/exec/hashjoin_build_sink.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -192,11 +192,8 @@ struct ProcessHashTableBuild {
192192
bool* has_null_key) {
193193
if (null_map) {
194194
// first row is mocked and is null
195-
// TODO: Need to test the for loop. break may better
196-
for (uint32_t i = 1; i < _rows; i++) {
197-
if ((*null_map)[i]) {
198-
*has_null_key = true;
199-
}
195+
if (simd::contain_one(null_map->data() + 1, _rows - 1)) {
196+
*has_null_key = true;
200197
}
201198
if (short_circuit_for_null && *has_null_key) {
202199
return Status::OK();
@@ -208,7 +205,7 @@ struct ProcessHashTableBuild {
208205
_rows, _batch_size, *has_null_key, hash_table_ctx.direct_mapping_range());
209206

210207
// In order to make the null keys equal when using single null eq, all null keys need to be set to default value.
211-
if (_build_raw_ptrs.size() == 1 && null_map) {
208+
if (_build_raw_ptrs.size() == 1 && null_map && *has_null_key) {
212209
_build_raw_ptrs[0]->assume_mutable()->replace_column_null_data(null_map->data());
213210
}
214211

be/src/pipeline/exec/join/process_hash_table_probe_impl.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ void ProcessHashTableProbe<JoinOpType>::build_side_output_column(vectorized::Mut
114114
_build_column_has_null[i] = false;
115115
if (_right_output_slot_flags[i] && column.is_nullable()) {
116116
const auto& nullable = assert_cast<const vectorized::ColumnNullable&>(column);
117-
_build_column_has_null[i] = !simd::contain_byte(
118-
nullable.get_null_map_data().data() + 1, nullable.size() - 1, 1);
117+
_build_column_has_null[i] = !simd::contain_one(
118+
nullable.get_null_map_data().data() + 1, nullable.size() - 1);
119119
}
120120
}
121121
}
@@ -192,7 +192,9 @@ typename HashTableType::State ProcessHashTableProbe<JoinOpType>::_init_probe_sid
192192
hash_table_ctx.arena.clear();
193193
// In order to make the null keys equal when using single null eq, all null keys need to be set to default value.
194194
if (_parent->_probe_columns.size() == 1 && null_map) {
195-
_parent->_probe_columns[0]->assume_mutable()->replace_column_null_data(null_map);
195+
if (simd::contain_one(null_map, probe_rows)) {
196+
_parent->_probe_columns[0]->assume_mutable()->replace_column_null_data(null_map);
197+
}
196198
}
197199

198200
hash_table_ctx.init_serialized_keys(_parent->_probe_columns, probe_rows, null_map, true,
@@ -366,8 +368,7 @@ Status ProcessHashTableProbe<JoinOpType>::finalize_block_with_filter(
366368
}
367369
const auto& column_filter =
368370
assert_cast<const vectorized::ColumnUInt8*>(filter_ptr.get())->get_data();
369-
bool need_filter =
370-
simd::count_zero_num((int8_t*)column_filter.data(), column_filter.size()) != 0;
371+
bool need_filter = simd::contain_zero(column_filter.data(), column_filter.size());
371372
if (need_filter) {
372373
row_indexs.filter(column_filter);
373374
}

be/src/pipeline/exec/nested_loop_join_probe_operator.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,7 @@ class NestedLoopJoinProbeLocalState final
9797
}
9898
if (!_cur_probe_row_visited_flags[i]) {
9999
_cur_probe_row_visited_flags[i] =
100-
simd::contain_byte<uint8_t>(filter.data() + offset, end - offset, 1)
101-
? 1
102-
: 0;
100+
simd::contain_one(filter.data() + offset, end - offset);
103101
}
104102
end = offset;
105103
}

be/src/util/simd/bits.h

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -261,14 +261,6 @@ static size_t find_byte(const T* data, size_t start, size_t end, T byte) {
261261
return (T*)p - data;
262262
}
263263

264-
template <typename T>
265-
bool contain_byte(const T* __restrict data, const size_t length, const signed char byte) {
266-
if (length == 0) {
267-
return false;
268-
}
269-
return nullptr != std::memchr(reinterpret_cast<const void*>(data), byte, length);
270-
}
271-
272264
inline size_t find_one(const std::vector<uint8_t>& vec, size_t start) {
273265
return find_byte<uint8_t>(vec, start, 1);
274266
}
@@ -281,5 +273,58 @@ inline size_t find_zero(const std::vector<uint8_t>& vec, size_t start) {
281273
return find_byte<uint8_t>(vec, start, 0);
282274
}
283275

276+
// Returns true if at least one of the `size` bytes starting at `data` is
// non-zero, and false otherwise (including when `size` is 0, because no loop
// body executes). Despite the name, any non-zero byte matches, not only the
// value 1.
inline bool contain_one(const uint8_t* __restrict data, size_t size) {
277+
size_t i = 0;
278+
#if defined(__AVX2__)
279+
// AVX2 build: examine 32 bytes per iteration using an unaligned load.
for (; i + 32 <= size; i += 32) {
280+
__m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data + i));
281+
// testz(chunk, chunk) is 1 only when every bit of chunk is zero, so a 0
// result means some byte in this chunk is non-zero.
if (!_mm256_testz_si256(chunk, chunk)) {
282+
return true;
283+
}
284+
}
285+
#elif defined(__SSE2__)
286+
const __m128i zero = _mm_setzero_si128();
287+
// SSE2 build: examine 16 bytes per iteration using an unaligned load.
for (; i + 16 <= size; i += 16) {
288+
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + i));
289+
// The mask has one bit per byte that equals zero; anything other than
// 0xFFFF means at least one of the 16 bytes is non-zero.
if (_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, zero)) != 0xFFFF) {
290+
return true;
291+
}
292+
}
293+
#endif
294+
// Scalar tail: bytes left over after the vector loop, or the whole input
// when neither __AVX2__ nor __SSE2__ is defined.
for (; i < size; ++i) {
295+
if (data[i]) {
296+
return true;
297+
}
298+
}
299+
return false;
300+
}
301+
302+
// Returns true if at least one of the `size` bytes starting at `data` equals
// zero, and false otherwise (including when `size` is 0, because no loop body
// executes).
inline bool contain_zero(const uint8_t* __restrict data, size_t size) {
303+
size_t i = 0;
304+
#if defined(__AVX2__)
305+
const __m256i zero = _mm256_setzero_si256();
306+
// AVX2 build: examine 32 bytes per iteration using an unaligned load.
for (; i + 32 <= size; i += 32) {
307+
__m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data + i));
308+
// The mask has one bit per byte that equals zero; a non-zero mask means
// this chunk contains a zero byte.
if (_mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, zero)) != 0) {
309+
return true;
310+
}
311+
}
312+
#elif defined(__SSE2__)
313+
const __m128i zero = _mm_setzero_si128();
314+
// SSE2 build: examine 16 bytes per iteration using an unaligned load.
for (; i + 16 <= size; i += 16) {
315+
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + i));
316+
// Same per-byte equality mask as the AVX2 path, 16 bits wide here.
if (_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, zero)) != 0) {
317+
return true;
318+
}
319+
}
320+
#endif
321+
// Scalar tail: bytes left over after the vector loop, or the whole input
// when neither __AVX2__ nor __SSE2__ is defined.
for (; i < size; ++i) {
322+
if (!data[i]) {
323+
return true;
324+
}
325+
}
326+
return false;
327+
}
328+
284329
} // namespace doris::simd
285330
#include "common/compile_check_end.h"

be/src/vec/columns/column_decimal.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -521,10 +521,6 @@ void ColumnDecimal<T>::compare_internal(size_t rhs_row_id, const IColumn& rhs,
521521
template <PrimitiveType T>
522522
void ColumnDecimal<T>::replace_column_null_data(const uint8_t* __restrict null_map) {
523523
auto s = size();
524-
size_t null_count = s - simd::count_zero_num((const int8_t*)null_map, s);
525-
if (0 == null_count) {
526-
return;
527-
}
528524
for (size_t i = 0; i < s; ++i) {
529525
data[i] = null_map[i] ? value_type() : data[i];
530526
}

be/src/vec/columns/column_nullable.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ size_t ColumnNullable::serialize_impl(char* pos, const size_t row) const {
280280
}
281281

282282
void ColumnNullable::serialize(StringRef* keys, size_t num_rows) const {
283-
const bool has_null = simd::contain_byte(get_null_map_data().data(), num_rows, 1);
283+
const bool has_null = simd::contain_one(get_null_map_data().data(), num_rows);
284284
const auto* __restrict null_map =
285285
assert_cast<const ColumnUInt8&>(get_null_map_column()).get_data().data();
286286
_nested_column->serialize_with_nullable(keys, num_rows, has_null, null_map);
@@ -598,11 +598,11 @@ void ColumnNullable::sort_column(const ColumnSorter* sorter, EqualFlags& flags,
598598
}
599599

600600
bool ColumnNullable::only_null() const {
601-
return !simd::contain_byte(get_null_map_data().data(), size(), 0);
601+
return !simd::contain_zero(get_null_map_data().data(), size());
602602
}
603603

604604
bool ColumnNullable::has_null(size_t begin, size_t end) const {
605-
return simd::contain_byte(get_null_map_data().data() + begin, end - begin, 1);
605+
return simd::contain_one(get_null_map_data().data() + begin, end - begin);
606606
}
607607

608608
bool ColumnNullable::has_null() const {

be/src/vec/columns/column_vector.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -525,12 +525,9 @@ MutableColumnPtr ColumnVector<T>::permute(const IColumn::Permutation& perm, size
525525
template <PrimitiveType T>
526526
void ColumnVector<T>::replace_column_null_data(const uint8_t* __restrict null_map) {
527527
auto s = size();
528-
size_t null_count = s - simd::count_zero_num((const int8_t*)null_map, s);
529-
if (0 == null_count) {
530-
return;
531-
}
528+
auto value = default_value();
532529
for (size_t i = 0; i < s; ++i) {
533-
data[i] = null_map[i] ? default_value() : data[i];
530+
data[i] = null_map[i] ? value : data[i];
534531
}
535532
}
536533

be/src/vec/common/hash_table/hash_key_type.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ inline HashKeyType get_hash_key_type_with_fixed(size_t size) {
8080
}
8181

8282
inline HashKeyType get_hash_key_type_fixed(const std::vector<vectorized::DataTypePtr>& data_types) {
83+
if (data_types.size() >= vectorized::BITSIZE) {
84+
return HashKeyType::serialized;
85+
}
86+
8387
bool has_null = false;
8488
size_t key_byte_size = 0;
8589

@@ -94,8 +98,7 @@ inline HashKeyType get_hash_key_type_fixed(const std::vector<vectorized::DataTyp
9498
}
9599
}
96100

97-
size_t bitmap_size = has_null ? vectorized::get_bitmap_size(data_types.size()) : 0;
98-
return get_hash_key_type_with_fixed(bitmap_size + key_byte_size);
101+
return get_hash_key_type_with_fixed(has_null + key_byte_size);
99102
}
100103

101104
inline HashKeyType get_hash_key_type(const std::vector<vectorized::DataTypePtr>& data_types) {

0 commit comments

Comments
 (0)