Skip to content

Commit 52911d2

Browse files
committed
pushpmrcode
1 parent a69f53f commit 52911d2

File tree

2 files changed

+143
-126
lines changed

2 files changed

+143
-126
lines changed

foundation/cpp17pmrtest/filter.cpp

+132
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
#include <iostream>
2+
#include <chrono>
3+
4+
// TICK(x): record the current steady_clock time in a local named bench_<x>.
// The expansion already ends in ';', so both `TICK(x)` and `TICK(x);` work.
#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
// TOCK(x): print "<x>: <seconds since TICK(x)>\n" to std::cerr.
// The whole report is a single statement, so the newline is not emitted
// unconditionally when the macro is used under an unbraced `if`.
#define TOCK(x) \
    std::cerr << #x ": " \
              << std::chrono::duration_cast<std::chrono::duration<double>>( \
                     std::chrono::steady_clock::now() - bench_##x).count() \
              << "\n";
6+
7+
#include <tbb/parallel_for.h>
8+
#include <tbb/concurrent_vector.h>
9+
#include <tbb/enumerable_thread_specific.h>
10+
#include <immintrin.h>
11+
#include <memory_resource>
12+
#include <random>
13+
14+
// Thin wrapper around a T that skips zero-initialization on default
// construction, so that e.g. vector<NoInit<float>>::resize() does not have to
// clear the new elements. Converts implicitly to and from T.
template <typename T>
struct NoInit {
    T value;

    // Intentionally an empty user-provided body: neither `= default` nor
    // `: value()`. For a POD T this leaves `value` uninitialized.
    NoInit() {}

    // Initializing constructor; also provides the implicit T -> NoInit<T> conversion.
    NoInit(T init) : value(init) {}

    // Implicit NoInit<T> -> T conversions (mutable and read-only access).
    operator T &() { return value; }
    operator T const &() const { return value; }
};
22+
23+
// Lookup table with two entries per 8-bit lane mask i (filled by _init_lut):
//   masklut[2*i]     - permutation indices that move the lanes whose bit is set
//                      in i to the front, in ascending lane order;
//   masklut[2*i + 1] - store mask whose first popcount(i) lanes are -1, rest 0.
__m256i masklut[512];
24+
25+
static int _init_lut = [] {
26+
for (int i = 0; i < 256; i++) {
27+
int per[8] = {0, 0, 0, 0, 0, 0, 0, 0};
28+
for (int j = 0, k = 0; k < 8; k++) {
29+
if (i & (1 << k)) {
30+
per[j++] = k;
31+
}
32+
}
33+
__m256i perm = _mm256_loadu_si256((const __m256i *)per);
34+
_mm256_store_si256(masklut + i * 2, perm);
35+
int c = _mm_popcnt_u32(i);
36+
__m256i mask = _mm256_setr_epi32(
37+
c > 0 ? -1 : 0,
38+
c > 1 ? -1 : 0,
39+
c > 2 ? -1 : 0,
40+
c > 3 ? -1 : 0,
41+
c > 4 ? -1 : 0,
42+
c > 5 ? -1 : 0,
43+
c > 6 ? -1 : 0,
44+
c > 7 ? -1 : 0);
45+
_mm256_store_si256(masklut + i * 2 + 1, mask);
46+
}
47+
return 0;
48+
} ();
49+
50+
// BEGIN CODE
51+
template <int cmp>
52+
size_t filterp(float const *x, size_t n, float y, float *z) {
53+
__m256 pred = _mm256_set1_ps(y);
54+
auto zbeg = z;
55+
auto xend = x + n;
56+
while (x + 16 <= xend) {
57+
__m256 xi = _mm256_loadu_ps(x);
58+
__m256 mask = _mm256_cmp_ps(xi, pred, cmp);
59+
__m256 xi2 = _mm256_loadu_ps(x + 8);
60+
x += 16;
61+
__m256 mask2 = _mm256_cmp_ps(xi2, pred, cmp);
62+
size_t m = (size_t)_mm256_movemask_ps(mask) << 6;
63+
size_t m2 = (size_t)_mm256_movemask_ps(mask2) << 6;
64+
const __m256i *mp = masklut + (m >> 5);
65+
__m256i wa = _mm256_load_si256(mp);
66+
__m256i wb = _mm256_load_si256(mp + 1);
67+
xi = _mm256_permutevar8x32_ps(xi, wa);
68+
_mm256_maskstore_ps(z, wb, xi);
69+
z += _mm_popcnt_u32((unsigned)m);
70+
mp = masklut + (m2 >> 5);
71+
wa = _mm256_load_si256(mp);
72+
wb = _mm256_load_si256(mp + 1);
73+
xi2 = _mm256_permutevar8x32_ps(xi2, wa);
74+
_mm256_maskstore_ps(z, wb, xi2);
75+
z += _mm_popcnt_u32((unsigned)m2);
76+
}
77+
for (; x < xend; x++) {
78+
__m128 xi = _mm_load_ss(x);
79+
__m128 mask = _mm_cmpgt_ss(xi, _mm_set_ss(y));
80+
int m = _mm_extract_ps(mask, 0);
81+
if (m) {
82+
_mm_store_ss(z++, xi);
83+
}
84+
}
85+
return z - zbeg;
86+
}
87+
88+
// newdelete < sync < unsync < monot
89+
90+
int main() {
91+
std::vector<float> scores(65536 * 4096);
92+
// 随机填充学生成绩数据(0~100)
93+
tbb::parallel_for(
94+
tbb::blocked_range<size_t>(0, scores.size()),
95+
[&] (tbb::blocked_range<size_t> r) {
96+
std::mt19937 rng(r.begin());
97+
std::uniform_real_distribution<float> uni(0, 100);
98+
for (size_t i = r.begin(); i != r.end(); ++i) {
99+
scores[i] = uni(rng);
100+
}
101+
});
102+
// 开始过滤出 60 分以下的学生,进行批评教育
103+
TICK(filter);
104+
tbb::concurrent_vector<float> bad_scores;
105+
tbb::enumerable_thread_specific<std::pmr::unsynchronized_pool_resource> pool_ets;
106+
tbb::parallel_for(
107+
tbb::blocked_range<size_t>(0, scores.size(), 65536 * 32),
108+
[&] (tbb::blocked_range<size_t> r) {
109+
#if 1
110+
auto &pool = pool_ets.local();
111+
std::pmr::vector<NoInit<float>> local_bad_scores{&pool};
112+
#else
113+
std::vector<NoInit<float>> local_bad_scores;
114+
#endif
115+
local_bad_scores.resize(r.size());
116+
#if 1
117+
size_t n = filterp<_CMP_LT_OQ>((float *)scores.data() + r.begin(), r.size(), 60, (float *)local_bad_scores.data());
118+
#else
119+
size_t n = 0;
120+
for (size_t i = r.begin(); i != r.end(); ++i) {
121+
float score = scores[i];
122+
if (score < 60) {
123+
local_bad_scores[n++] = score;
124+
}
125+
}
126+
#endif
127+
local_bad_scores.resize(n);
128+
std::copy(local_bad_scores.begin(), local_bad_scores.end(),
129+
bad_scores.grow_by(local_bad_scores.size()));
130+
});
131+
TOCK(filter);
132+
}

foundation/cpp17pmrtest/main.cpp

+11-126
Original file line numberDiff line numberDiff line change
@@ -1,132 +1,17 @@
1-
#include <iostream>
2-
#include <chrono>
3-
4-
#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
5-
#define TOCK(x) std::cerr<<#x ": "<<std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now()-bench_##x).count();std::cerr<<"\n";
6-
7-
#include <tbb/parallel_for.h>
8-
#include <tbb/concurrent_vector.h>
9-
#include <tbb/enumerable_thread_specific.h>
10-
#include <immintrin.h>
1+
#include "ticktock.h"
112
#include <memory_resource>
12-
#include <random>
13-
14-
template <class T>
15-
struct NoInit {
16-
T value;
17-
NoInit() {} // 不是 = default,也不写 : value(),这样一来只要 T 是 POD 类型,value 就不会0初始化
18-
NoInit(T value_) : value(value_) {} // 强制初始化的版本(T隐式转换为NoInit<T>)
19-
operator T const &() const { return value; } // NoInit<T>隐式转换为T
20-
operator T &() { return value; } // NoInit<T>隐式转换为T
21-
};
22-
23-
__m256i masklut[512];
24-
25-
static int _init_lut = [] {
26-
for (int i = 0; i < 256; i++) {
27-
int per[8] = {0, 0, 0, 0, 0, 0, 0, 0};
28-
for (int j = 0, k = 0; k < 8; k++) {
29-
if (i & (1 << k)) {
30-
per[j++] = k;
31-
}
32-
}
33-
__m256i perm = _mm256_loadu_si256((const __m256i *)per);
34-
_mm256_store_si256(masklut + i * 2, perm);
35-
int c = _mm_popcnt_u32(i);
36-
__m256i mask = _mm256_setr_epi32(
37-
c > 0 ? -1 : 0,
38-
c > 1 ? -1 : 0,
39-
c > 2 ? -1 : 0,
40-
c > 3 ? -1 : 0,
41-
c > 4 ? -1 : 0,
42-
c > 5 ? -1 : 0,
43-
c > 6 ? -1 : 0,
44-
c > 7 ? -1 : 0);
45-
_mm256_store_si256(masklut + i * 2 + 1, mask);
46-
}
47-
return 0;
48-
} ();
49-
50-
// BEGIN CODE
51-
template <int cmp>
52-
size_t filterp(float const *x, size_t n, float y, float *z) {
53-
__m256 pred = _mm256_set1_ps(y);
54-
auto zbeg = z;
55-
auto xend = x + n;
56-
while (x + 16 <= xend) {
57-
__m256 xi = _mm256_loadu_ps(x);
58-
__m256 mask = _mm256_cmp_ps(xi, pred, cmp);
59-
__m256 xi2 = _mm256_loadu_ps(x + 8);
60-
x += 16;
61-
__m256 mask2 = _mm256_cmp_ps(xi2, pred, cmp);
62-
size_t m = (size_t)_mm256_movemask_ps(mask) << 6;
63-
size_t m2 = (size_t)_mm256_movemask_ps(mask2) << 6;
64-
const __m256i *mp = masklut + (m >> 5);
65-
__m256i wa = _mm256_load_si256(mp);
66-
__m256i wb = _mm256_load_si256(mp + 1);
67-
xi = _mm256_permutevar8x32_ps(xi, wa);
68-
_mm256_maskstore_ps(z, wb, xi);
69-
z += _mm_popcnt_u32((unsigned)m);
70-
mp = masklut + (m2 >> 5);
71-
wa = _mm256_load_si256(mp);
72-
wb = _mm256_load_si256(mp + 1);
73-
xi2 = _mm256_permutevar8x32_ps(xi2, wa);
74-
_mm256_maskstore_ps(z, wb, xi2);
75-
z += _mm_popcnt_u32((unsigned)m2);
76-
}
77-
for (; x < xend; x++) {
78-
__m128 xi = _mm_load_ss(x);
79-
__m128 mask = _mm_cmpgt_ss(xi, _mm_set_ss(y));
80-
int m = _mm_extract_ps(mask, 0);
81-
if (m) {
82-
_mm_store_ss(z++, xi);
83-
}
84-
}
85-
return z - zbeg;
86-
}
3+
#include "memory_resource_inspector.h"
4+
#include <vector>
5+
#include <list>
6+
#include <deque>
877

888
// newdelete < sync < unsync < monot
899

9010
int main() {
91-
std::vector<float> scores(65536 * 4096);
92-
// 随机填充学生成绩数据(0~100)
93-
tbb::parallel_for(
94-
tbb::blocked_range<size_t>(0, scores.size()),
95-
[&] (tbb::blocked_range<size_t> r) {
96-
std::mt19937 rng(r.begin());
97-
std::uniform_real_distribution<float> uni(0, 100);
98-
for (size_t i = r.begin(); i != r.end(); ++i) {
99-
scores[i] = uni(rng);
100-
}
101-
});
102-
// 开始过滤出 60 分以下的学生,进行批评教育
103-
TICK(filter);
104-
tbb::concurrent_vector<float> bad_scores;
105-
tbb::enumerable_thread_specific<std::pmr::unsynchronized_pool_resource> pool_ets;
106-
tbb::parallel_for(
107-
tbb::blocked_range<size_t>(0, scores.size(), 65536 * 32),
108-
[&] (tbb::blocked_range<size_t> r) {
109-
#if 1
110-
auto &pool = pool_ets.local();
111-
std::pmr::vector<NoInit<float>> local_bad_scores{&pool};
112-
#else
113-
std::vector<NoInit<float>> local_bad_scores;
114-
#endif
115-
local_bad_scores.resize(r.size());
116-
#if 1
117-
size_t n = filterp<_CMP_LT_OQ>((float *)scores.data() + r.begin(), r.size(), 60, (float *)local_bad_scores.data());
118-
#else
119-
size_t n = 0;
120-
for (size_t i = r.begin(); i != r.end(); ++i) {
121-
float score = scores[i];
122-
if (score < 60) {
123-
local_bad_scores[n++] = score;
124-
}
125-
}
126-
#endif
127-
local_bad_scores.resize(n);
128-
std::copy(local_bad_scores.begin(), local_bad_scores.end(),
129-
bad_scores.grow_by(local_bad_scores.size()));
130-
});
131-
TOCK(filter);
11+
memory_resource_inspector mem{std::pmr::new_delete_resource()};
12+
13+
std::pmr::vector<int> s{&mem};
14+
for (int i = 0; i < 4096; i++) {
15+
s.push_back(i);
16+
}
13217
}

0 commit comments

Comments
 (0)