|
1 |
| -#include <iostream> |
2 |
| -#include <chrono> |
3 |
| - |
4 |
| -#define TICK(x) auto bench_##x = std::chrono::steady_clock::now(); |
5 |
| -#define TOCK(x) std::cerr<<#x ": "<<std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now()-bench_##x).count();std::cerr<<"秒\n"; |
6 |
| - |
7 |
| -#include <tbb/parallel_for.h> |
8 |
| -#include <tbb/concurrent_vector.h> |
9 |
| -#include <tbb/enumerable_thread_specific.h> |
10 |
| -#include <immintrin.h> |
| 1 | +#include "ticktock.h" |
11 | 2 | #include <memory_resource>
|
12 |
| -#include <random> |
13 |
| - |
14 |
| -template <class T> |
15 |
| -struct NoInit { |
16 |
| - T value; |
17 |
| - NoInit() {} // 不是 = default,也不写 : value(),这样一来只要 T 是 POD 类型,value 就不会0初始化 |
18 |
| - NoInit(T value_) : value(value_) {} // 强制初始化的版本(T隐式转换为NoInit<T>) |
19 |
| - operator T const &() const { return value; } // NoInit<T>隐式转换为T |
20 |
| - operator T &() { return value; } // NoInit<T>隐式转换为T |
21 |
| -}; |
22 |
| - |
23 |
| -__m256i masklut[512]; |
24 |
| - |
25 |
| -static int _init_lut = [] { |
26 |
| - for (int i = 0; i < 256; i++) { |
27 |
| - int per[8] = {0, 0, 0, 0, 0, 0, 0, 0}; |
28 |
| - for (int j = 0, k = 0; k < 8; k++) { |
29 |
| - if (i & (1 << k)) { |
30 |
| - per[j++] = k; |
31 |
| - } |
32 |
| - } |
33 |
| - __m256i perm = _mm256_loadu_si256((const __m256i *)per); |
34 |
| - _mm256_store_si256(masklut + i * 2, perm); |
35 |
| - int c = _mm_popcnt_u32(i); |
36 |
| - __m256i mask = _mm256_setr_epi32( |
37 |
| - c > 0 ? -1 : 0, |
38 |
| - c > 1 ? -1 : 0, |
39 |
| - c > 2 ? -1 : 0, |
40 |
| - c > 3 ? -1 : 0, |
41 |
| - c > 4 ? -1 : 0, |
42 |
| - c > 5 ? -1 : 0, |
43 |
| - c > 6 ? -1 : 0, |
44 |
| - c > 7 ? -1 : 0); |
45 |
| - _mm256_store_si256(masklut + i * 2 + 1, mask); |
46 |
| - } |
47 |
| - return 0; |
48 |
| -} (); |
49 |
| - |
50 |
| -// BEGIN CODE |
51 |
| -template <int cmp> |
52 |
| -size_t filterp(float const *x, size_t n, float y, float *z) { |
53 |
| - __m256 pred = _mm256_set1_ps(y); |
54 |
| - auto zbeg = z; |
55 |
| - auto xend = x + n; |
56 |
| - while (x + 16 <= xend) { |
57 |
| - __m256 xi = _mm256_loadu_ps(x); |
58 |
| - __m256 mask = _mm256_cmp_ps(xi, pred, cmp); |
59 |
| - __m256 xi2 = _mm256_loadu_ps(x + 8); |
60 |
| - x += 16; |
61 |
| - __m256 mask2 = _mm256_cmp_ps(xi2, pred, cmp); |
62 |
| - size_t m = (size_t)_mm256_movemask_ps(mask) << 6; |
63 |
| - size_t m2 = (size_t)_mm256_movemask_ps(mask2) << 6; |
64 |
| - const __m256i *mp = masklut + (m >> 5); |
65 |
| - __m256i wa = _mm256_load_si256(mp); |
66 |
| - __m256i wb = _mm256_load_si256(mp + 1); |
67 |
| - xi = _mm256_permutevar8x32_ps(xi, wa); |
68 |
| - _mm256_maskstore_ps(z, wb, xi); |
69 |
| - z += _mm_popcnt_u32((unsigned)m); |
70 |
| - mp = masklut + (m2 >> 5); |
71 |
| - wa = _mm256_load_si256(mp); |
72 |
| - wb = _mm256_load_si256(mp + 1); |
73 |
| - xi2 = _mm256_permutevar8x32_ps(xi2, wa); |
74 |
| - _mm256_maskstore_ps(z, wb, xi2); |
75 |
| - z += _mm_popcnt_u32((unsigned)m2); |
76 |
| - } |
77 |
| - for (; x < xend; x++) { |
78 |
| - __m128 xi = _mm_load_ss(x); |
79 |
| - __m128 mask = _mm_cmpgt_ss(xi, _mm_set_ss(y)); |
80 |
| - int m = _mm_extract_ps(mask, 0); |
81 |
| - if (m) { |
82 |
| - _mm_store_ss(z++, xi); |
83 |
| - } |
84 |
| - } |
85 |
| - return z - zbeg; |
86 |
| -} |
| 3 | +#include "memory_resource_inspector.h" |
| 4 | +#include <vector> |
| 5 | +#include <list> |
| 6 | +#include <deque> |
87 | 7 |
|
88 | 8 | // newdelete < sync < unsync < monot
|
89 | 9 |
|
90 | 10 | int main() {
|
91 |
| - std::vector<float> scores(65536 * 4096); |
92 |
| - // 随机填充学生成绩数据(0~100) |
93 |
| - tbb::parallel_for( |
94 |
| - tbb::blocked_range<size_t>(0, scores.size()), |
95 |
| - [&] (tbb::blocked_range<size_t> r) { |
96 |
| - std::mt19937 rng(r.begin()); |
97 |
| - std::uniform_real_distribution<float> uni(0, 100); |
98 |
| - for (size_t i = r.begin(); i != r.end(); ++i) { |
99 |
| - scores[i] = uni(rng); |
100 |
| - } |
101 |
| - }); |
102 |
| - // 开始过滤出 60 分以下的学生,进行批评教育 |
103 |
| - TICK(filter); |
104 |
| - tbb::concurrent_vector<float> bad_scores; |
105 |
| - tbb::enumerable_thread_specific<std::pmr::unsynchronized_pool_resource> pool_ets; |
106 |
| - tbb::parallel_for( |
107 |
| - tbb::blocked_range<size_t>(0, scores.size(), 65536 * 32), |
108 |
| - [&] (tbb::blocked_range<size_t> r) { |
109 |
| - #if 1 |
110 |
| - auto &pool = pool_ets.local(); |
111 |
| - std::pmr::vector<NoInit<float>> local_bad_scores{&pool}; |
112 |
| - #else |
113 |
| - std::vector<NoInit<float>> local_bad_scores; |
114 |
| - #endif |
115 |
| - local_bad_scores.resize(r.size()); |
116 |
| - #if 1 |
117 |
| - size_t n = filterp<_CMP_LT_OQ>((float *)scores.data() + r.begin(), r.size(), 60, (float *)local_bad_scores.data()); |
118 |
| - #else |
119 |
| - size_t n = 0; |
120 |
| - for (size_t i = r.begin(); i != r.end(); ++i) { |
121 |
| - float score = scores[i]; |
122 |
| - if (score < 60) { |
123 |
| - local_bad_scores[n++] = score; |
124 |
| - } |
125 |
| - } |
126 |
| - #endif |
127 |
| - local_bad_scores.resize(n); |
128 |
| - std::copy(local_bad_scores.begin(), local_bad_scores.end(), |
129 |
| - bad_scores.grow_by(local_bad_scores.size())); |
130 |
| - }); |
131 |
| - TOCK(filter); |
| 11 | + memory_resource_inspector mem{std::pmr::new_delete_resource()}; |
| 12 | + |
| 13 | + std::pmr::vector<int> s{&mem}; |
| 14 | + for (int i = 0; i < 4096; i++) { |
| 15 | + s.push_back(i); |
| 16 | + } |
132 | 17 | }
|
0 commit comments