
Commit 2c8c1ee

Upgrade datasketches lib from 3.3.0 to 4.1.0 (#684)
* Upgrade datasketches lib from 3.3.0 to 4.1.0
  - quantiles are inclusive by default now (the pace workaround is no longer required)
  - fix implicit float conversions and some compiler warnings
* Update CMakeLists.txt

Co-authored-by: Leonardo Parente <[email protected]>
1 parent 22bd64d · commit 2c8c1ee

33 files changed (+1263 -1180 lines)

3rd/datasketches/common/CMakeLists.txt

Lines changed: 6 additions & 1 deletion
@@ -37,4 +37,9 @@ target_sources(common
     ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_forward.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/ceiling_power_of_2.hpp
-)
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov_impl.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/quantiles_sorted_view.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/quantiles_sorted_view_impl.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/version.hpp.in
+)

3rd/datasketches/common/include/MurmurHash3.h

Lines changed: 25 additions & 27 deletions
@@ -29,37 +29,30 @@ typedef unsigned char uint8_t;
 typedef unsigned int uint32_t;
 typedef unsigned __int64 uint64_t;

-#define FORCE_INLINE __forceinline
+#define MURMUR3_FORCE_INLINE __forceinline

 #include <stdlib.h>

-#define ROTL32(x,y) _rotl(x,y)
-#define ROTL64(x,y) _rotl64(x,y)
+#define MURMUR3_ROTL64(x,y) _rotl64(x,y)

-#define BIG_CONSTANT(x) (x)
+#define MURMUR3_BIG_CONSTANT(x) (x)

 // Other compilers

 #else // defined(_MSC_VER)

 #include <stdint.h>

-#define FORCE_INLINE inline __attribute__((always_inline))
-
-inline uint32_t rotl32 ( uint32_t x, int8_t r )
-{
-  return (x << r) | (x >> (32 - r));
-}
+#define MURMUR3_FORCE_INLINE inline __attribute__((always_inline))

 inline uint64_t rotl64 ( uint64_t x, int8_t r )
 {
   return (x << r) | (x >> (64 - r));
 }

-#define ROTL32(x,y) rotl32(x,y)
-#define ROTL64(x,y) rotl64(x,y)
+#define MURMUR3_ROTL64(x,y) rotl64(x,y)

-#define BIG_CONSTANT(x) (x##LLU)
+#define MURMUR3_BIG_CONSTANT(x) (x##LLU)

 #endif // !defined(_MSC_VER)

@@ -78,7 +71,7 @@ typedef struct {
 // Block read - if your platform needs to do endian-swapping or can only
 // handle aligned reads, do the conversion here

-FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, size_t i )
+MURMUR3_FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, size_t i )
 {
   uint64_t res;
   memcpy(&res, p + i, sizeof(res));
@@ -88,20 +81,21 @@ FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, size_t i )
 //-----------------------------------------------------------------------------
 // Finalization mix - force all bits of a hash block to avalanche

-FORCE_INLINE uint64_t fmix64 ( uint64_t k )
+MURMUR3_FORCE_INLINE uint64_t fmix64 ( uint64_t k )
 {
   k ^= k >> 33;
-  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+  k *= MURMUR3_BIG_CONSTANT(0xff51afd7ed558ccd);
   k ^= k >> 33;
-  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+  k *= MURMUR3_BIG_CONSTANT(0xc4ceb9fe1a85ec53);
   k ^= k >> 33;

   return k;
 }

-FORCE_INLINE void MurmurHash3_x64_128(const void* key, size_t lenBytes, uint64_t seed, HashState& out) {
-  static const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
-  static const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+MURMUR3_FORCE_INLINE void MurmurHash3_x64_128(const void* key, size_t lenBytes,
+    uint64_t seed, HashState& out) {
+  static const uint64_t c1 = MURMUR3_BIG_CONSTANT(0x87c37b91114253d5);
+  static const uint64_t c2 = MURMUR3_BIG_CONSTANT(0x4cf5ad432745937f);

   const uint8_t* data = (const uint8_t*)key;

@@ -118,13 +112,13 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, size_t lenBytes, uint64_t
     uint64_t k1 = getblock64(blocks, i * 2 + 0);
     uint64_t k2 = getblock64(blocks, i * 2 + 1);

-    k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; out.h1 ^= k1;
-    out.h1 = ROTL64(out.h1,27);
+    k1 *= c1; k1 = MURMUR3_ROTL64(k1,31); k1 *= c2; out.h1 ^= k1;
+    out.h1 = MURMUR3_ROTL64(out.h1,27);
     out.h1 += out.h2;
     out.h1 = out.h1*5+0x52dce729;

-    k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; out.h2 ^= k2;
-    out.h2 = ROTL64(out.h2,31);
+    k2 *= c2; k2 = MURMUR3_ROTL64(k2,33); k2 *= c1; out.h2 ^= k2;
+    out.h2 = MURMUR3_ROTL64(out.h2,31);
     out.h2 += out.h1;
     out.h2 = out.h2*5+0x38495ab5;
   }
@@ -144,7 +138,7 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, size_t lenBytes, uint64_t
     case 11: k2 ^= ((uint64_t)tail[10]) << 16; // falls through
     case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;  // falls through
     case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
-             k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; out.h2 ^= k2;
+             k2 *= c2; k2 = MURMUR3_ROTL64(k2,33); k2 *= c1; out.h2 ^= k2;
              // falls through
     case  8: k1 ^= ((uint64_t)tail[ 7]) << 56; // falls through
     case  7: k1 ^= ((uint64_t)tail[ 6]) << 48; // falls through
@@ -154,7 +148,7 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, size_t lenBytes, uint64_t
     case  3: k1 ^= ((uint64_t)tail[ 2]) << 16; // falls through
     case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;  // falls through
     case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
-             k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; out.h1 ^= k1;
+             k1 *= c1; k1 = MURMUR3_ROTL64(k1,31); k1 *= c2; out.h1 ^= k1;
   };

   //----------
@@ -175,10 +169,14 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, size_t lenBytes, uint64_t

 //-----------------------------------------------------------------------------

-FORCE_INLINE uint16_t compute_seed_hash(uint64_t seed) {
+MURMUR3_FORCE_INLINE uint16_t compute_seed_hash(uint64_t seed) {
   HashState hashes;
   MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
   return static_cast<uint16_t>(hashes.h1 & 0xffff);
 }

+#undef MURMUR3_FORCE_INLINE
+#undef MURMUR3_ROTL64
+#undef MURMUR3_BIG_CONSTANT
+
 #endif // _MURMURHASH3_H_
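The rename plus the trailing #undef block is the usual macro-hygiene pattern for vendored header-only code: prefix each macro with the library name so it cannot collide with another header's FORCE_INLINE or ROTL64, then remove it once the header no longer needs it. A minimal self-contained sketch of the pattern, with illustrative names not taken from the library:

// macro_hygiene.cpp -- illustrative sketch of the prefix-and-#undef pattern (hypothetical names)
#include <cstdint>

// Prefixed macro: cannot collide with another header's ROTL64 or FORCE_INLINE.
#define MYLIB_ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))

inline uint64_t mylib_mix(uint64_t v) {
  return MYLIB_ROTL64(v, 31) * 0x87c37b91114253d5ULL;
}

// Undefine once the header's own code no longer needs it, so nothing leaks to includers.
#undef MYLIB_ROTL64

int main() {
  return mylib_mix(42) != 0 ? 0 : 1;
}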

3rd/datasketches/common/include/binomial_bounds.hpp

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@

 #include <algorithm>
 #include <cmath>
+#include <stdexcept>

 /*
  * This class enables the estimation of error bounds given a sample set size, the sampling

3rd/datasketches/common/include/common_defs.hpp

Lines changed: 27 additions & 1 deletion
@@ -24,6 +24,9 @@
 #include <string>
 #include <memory>
 #include <iostream>
+#include <random>
+#include <chrono>
+#include <thread>

 namespace datasketches {

@@ -34,6 +37,19 @@ enum resize_factor { X1 = 0, X2, X4, X8 };
 template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
 template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;

+// thread-safe random bit
+static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
+  random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
+    + std::hash<std::thread::id>{}(std::this_thread::get_id())));
+
+// common random declarations
+namespace random_utils {
+  static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
+  static thread_local std::mt19937_64 rand(rd());
+  static thread_local std::uniform_real_distribution<> next_double(0.0, 1.0);
+}
+
+
 // utility function to hide unused compiler warning
 // usually has no additional cost
 template<typename T> void unused(T&&...) {}
@@ -63,7 +79,7 @@ static inline void read(std::istream& is, T* ptr, size_t size_bytes) {
 }

 template<typename T>
-static inline void write(std::ostream& os, T& value) {
+static inline void write(std::ostream& os, T value) {
   os.write(reinterpret_cast<const char*>(&value), sizeof(T));
 }

@@ -72,6 +88,16 @@ static inline void write(std::ostream& os, const T* ptr, size_t size_bytes) {
   os.write(reinterpret_cast<const char*>(ptr), size_bytes);
 }

+// wrapper for iterators to implement operator-> returning temporary value
+template<typename T>
+class return_value_holder {
+public:
+  return_value_holder(T value): value_(value) {}
+  const T* operator->() const { return std::addressof(value_); }
+private:
+  T value_;
+};
+
 } // namespace

 #endif // _COMMON_DEFS_HPP_
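Besides the seeded random_bit engine and the random_utils generators (presumably drawn as random_utils::next_double(random_utils::rand) elsewhere in the library), the notable addition here is the return_value_holder proxy. It exists because an iterator whose operator* builds its value on the fly cannot return a raw pointer from operator-> without dangling. A minimal sketch with a made-up iterator, not the library's real one:

#include <utility>
#include <memory>

// Simplified copy of the proxy from common_defs.hpp.
template<typename T>
class return_value_holder {
public:
  return_value_holder(T value): value_(value) {}
  const T* operator->() const { return std::addressof(value_); }
private:
  T value_;
};

// Hypothetical iterator whose operator* builds a temporary pair. operator->
// cannot return the address of that temporary, so it returns the holder by value.
struct toy_iterator {
  int i = 0;
  std::pair<int, int> operator*() const { return {i, i * i}; }
  return_value_holder<std::pair<int, int>> operator->() const { return **this; }
};

int main() {
  toy_iterator it{3};
  return it->second == 9 ? 0 : 1; // the holder keeps the temporary alive for the access
}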

3rd/datasketches/common/include/count_zeros.hpp

Lines changed: 11 additions & 0 deletions
@@ -91,6 +91,17 @@ static inline uint8_t count_leading_zeros_in_u64(uint64_t input) {
   return 56 + byte_leading_zeros_table[(input      ) & FCLZ_MASK_08];
 }

+static inline uint8_t count_leading_zeros_in_u32(uint32_t input) {
+  if (input > FCLZ_MASK_24)
+    return      byte_leading_zeros_table[(input >> 24) & FCLZ_MASK_08];
+  if (input > FCLZ_MASK_16)
+    return  8 + byte_leading_zeros_table[(input >> 16) & FCLZ_MASK_08];
+  if (input > FCLZ_MASK_08)
+    return 16 + byte_leading_zeros_table[(input >>  8) & FCLZ_MASK_08];
+  if (true)
+    return 24 + byte_leading_zeros_table[(input      ) & FCLZ_MASK_08];
+}
+
 static inline uint8_t count_trailing_zeros_in_u32(uint32_t input) {
   for (int i = 0; i < 4; i++) {
     const int byte = input & 0xff;
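The new 32-bit variant mirrors the existing 64-bit routine: find the highest byte containing a set bit, then add a per-byte leading-zero count from the 256-entry lookup table (FCLZ_MASK_24/16/08 are the 0xFFFFFF/0xFFFF/0xFF masks; the trailing if (true) just keeps the cascade shape of the other branches). A self-contained sketch that checks the table approach against a naive reference, with the table built inline rather than taken from the header:

#include <cstdint>
#include <cassert>

// Reference: naive leading-zero count for 32-bit values.
static uint8_t clz32_naive(uint32_t x) {
  uint8_t n = 0;
  for (uint32_t m = 0x80000000u; m != 0 && !(x & m); m >>= 1) ++n;
  return n;
}

int main() {
  // Per-byte table the header relies on: leading zeros within one byte.
  uint8_t table[256];
  for (int b = 0; b < 256; ++b)
    table[b] = static_cast<uint8_t>(clz32_naive(static_cast<uint32_t>(b)) - 24);

  // Table-based count, mirroring count_leading_zeros_in_u32.
  auto clz32_table = [&](uint32_t x) -> uint8_t {
    if (x > 0xFFFFFFu) return      table[(x >> 24) & 0xFF];
    if (x > 0xFFFFu)   return  8 + table[(x >> 16) & 0xFF];
    if (x > 0xFFu)     return 16 + table[(x >>  8) & 0xFF];
    return 24 + table[x & 0xFF];
  };

  // Sparse sweep over the 32-bit range; both counts must agree everywhere.
  for (uint64_t x = 0; x <= 0xFFFFFFFFull; x += 0x10001ull)
    assert(clz32_table(static_cast<uint32_t>(x)) == clz32_naive(static_cast<uint32_t>(x)));
  return 0;
}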

3rd/datasketches/kll/include/kolmogorov_smirnov.hpp renamed to 3rd/datasketches/common/include/kolmogorov_smirnov.hpp

Lines changed: 5 additions & 3 deletions
@@ -25,7 +25,8 @@ namespace datasketches {
 class kolmogorov_smirnov {
 public:
   /**
-   * Computes the raw delta area between two KLL quantile sketches for the Kolmogorov-Smirnov Test.
+   * Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
+   * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
    * @param sketch1 KLL sketch 1
    * @param sketch2 KLL sketch 2
    * @return the raw delta between two KLL quantile sketches
@@ -37,6 +38,7 @@ class kolmogorov_smirnov {
    * Computes the adjusted delta area threshold for the Kolmogorov-Smirnov Test.
    * Adjusts the computed threshold by the error epsilons of the two given sketches.
    * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
+   * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
    * @param sketch1 KLL sketch 1
    * @param sketch2 KLL sketch 2
    * @param p Target p-value. Typically .001 to .1, e.g., .05.
@@ -46,7 +48,8 @@ class kolmogorov_smirnov {
   static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);

   /**
-   * Performs the Kolmogorov-Smirnov Test between two KLL quantiles sketches.
+   * Performs the Kolmogorov-Smirnov Test between two quantile sketches.
+   * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
    * Note: if the given sketches have insufficient data or if the sketch sizes are too small,
    * this will return false.
    * @param sketch1 KLL sketch 1
@@ -57,7 +60,6 @@ class kolmogorov_smirnov {
    */
   template<typename Sketch>
   static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);
-
 };

 } /* namespace datasketches */

3rd/datasketches/kll/include/kolmogorov_smirnov_impl.hpp renamed to 3rd/datasketches/common/include/kolmogorov_smirnov_impl.hpp

Lines changed: 15 additions & 18 deletions
@@ -20,39 +20,36 @@
 #ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
 #define KOLMOGOROV_SMIRNOV_IMPL_HPP_

-namespace datasketches {
+#include <cmath>
+#include <algorithm>

-// type resolver
-template<typename T, typename C, typename S, typename A>
-kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
-  return kll_quantile_calculator<T, C, A>(sketch);
-}
+namespace datasketches {

 template<typename Sketch>
 double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
-  using Comparator = typename Sketch::comparator;
-  auto calc1 = make_quantile_calculator(sketch1);
-  auto calc2 = make_quantile_calculator(sketch2);
-  auto it1 = calc1.begin();
-  auto it2 = calc2.begin();
+  auto comparator = sketch1.get_comparator(); // assuming the same comparator in sketch2
+  auto view1 = sketch1.get_sorted_view();
+  auto view2 = sketch2.get_sorted_view();
+  auto it1 = view1.begin();
+  auto it2 = view2.begin();
   const auto n1 = sketch1.get_n();
   const auto n2 = sketch2.get_n();
   double delta = 0;
-  while (it1 != calc1.end() && it2 != calc2.end()) {
-    const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
-    const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
+  while (it1 != view1.end() && it2 != view2.end()) {
+    const double norm_cum_wt1 = static_cast<double>(it1.get_cumulative_weight(false)) / n1;
+    const double norm_cum_wt2 = static_cast<double>(it2.get_cumulative_weight(false)) / n2;
     delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
-    if (Comparator()((*it1).first, (*it2).first)) {
+    if (comparator((*it1).first, (*it2).first)) {
       ++it1;
-    } else if (Comparator()((*it2).first, (*it1).first)) {
+    } else if (comparator((*it2).first, (*it1).first)) {
       ++it2;
     } else {
       ++it1;
       ++it2;
     }
   }
-  const double norm_cum_wt1 = it1 == calc1.end() ? 1 : static_cast<double>((*it1).second) / n1;
-  const double norm_cum_wt2 = it2 == calc2.end() ? 1 : static_cast<double>((*it2).second) / n2;
+  const double norm_cum_wt1 = it1 == view1.end() ? 1 : static_cast<double>(it1.get_cumulative_weight(false)) / n1;
+  const double norm_cum_wt2 = it2 == view2.end() ? 1 : static_cast<double>(it2.get_cumulative_weight(false)) / n2;
   delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
   return delta;
 }
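With sorted views in place of the removed kll_quantile_calculator, delta() above is the classic two-sample KS statistic: the maximum gap between the two empirical CDFs, advancing through both sorted views in merge order. A usage sketch against the 4.1.0 API shown in this commit (the include paths are assumptions based on this vendored layout):

#include <random>
#include <iostream>
#include "kll_sketch.hpp"
#include "kolmogorov_smirnov.hpp"

int main() {
  datasketches::kll_sketch<double> s1, s2;
  std::mt19937_64 gen(1);
  std::normal_distribution<double> a(0.0, 1.0), b(0.5, 1.0);
  for (int i = 0; i < 10000; ++i) { s1.update(a(gen)); s2.update(b(gen)); }

  const double d = datasketches::kolmogorov_smirnov::delta(s1, s2);
  // true if the delta exceeds the adjusted threshold at p = 0.05,
  // i.e. "same distribution" can be rejected at that significance
  const bool reject = datasketches::kolmogorov_smirnov::test(s1, s2, 0.05);
  std::cout << "delta=" << d << " reject=" << std::boolalpha << reject << '\n';
}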

3rd/datasketches/common/include/memory_operations.hpp

Lines changed: 6 additions & 4 deletions
@@ -23,6 +23,8 @@
 #include <memory>
 #include <exception>
 #include <iostream>
+#include <string>
+#include <cstring>

 namespace datasketches {

@@ -53,14 +55,14 @@ static inline size_t copy_to_mem(const void* src, void* dst, size_t size) {
 }

 template<typename T>
-static inline size_t copy_to_mem(const T& item, void* dst) {
-  memcpy(dst, &item, sizeof(T));
+static inline size_t copy_from_mem(const void* src, T& item) {
+  memcpy(&item, src, sizeof(T));
   return sizeof(T);
 }

 template<typename T>
-static inline size_t copy_from_mem(const void* src, T& item) {
-  memcpy(&item, src, sizeof(T));
+static inline size_t copy_to_mem(T item, void* dst) {
+  memcpy(dst, &item, sizeof(T));
   return sizeof(T);
 }
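Note the signature change on copy_to_mem from const T& to by-value T, mirroring write() in common_defs.hpp, which went from T& to T. Passing by value accepts temporaries such as casts of a wider type at the call site, which a non-const reference rejects outright; this appears to be part of the implicit-conversion cleanup mentioned in the commit message. A minimal sketch of the difference:

#include <cstdint>
#include <cstring>
#include <cstddef>

// Old style: a non-const reference cannot bind to a temporary.
template<typename T> size_t write_ref(T& value, void* dst) {
  std::memcpy(dst, &value, sizeof(T)); return sizeof(T);
}

// New style: by value accepts temporaries and literals.
template<typename T> size_t write_val(T value, void* dst) {
  std::memcpy(dst, &value, sizeof(T)); return sizeof(T);
}

int main() {
  unsigned char buf[8];
  uint64_t n = 42;
  // write_ref(static_cast<uint32_t>(n), buf);  // would not compile: T& cannot bind a temporary
  return write_val(static_cast<uint32_t>(n), buf) == 4 ? 0 : 1;
}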
