Skip to content

Commit db56f32

Browse files
committed
enable prediction-quantization with compaction
- No external gather is needed any more; compaction inside prediction kernel is utilized. - Also prediction kernel throughput is significantly improved due to less DRAM write.
1 parent ef5e2a5 commit db56f32

File tree

13 files changed

+94
-68
lines changed

13 files changed

+94
-68
lines changed

include/kernel/l23r.hh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ void psz_adhoc_scttr(
2424
T* val, uint32_t* idx, int const n, T* out, float* milliseconds,
2525
cudaStream_t stream);
2626

27-
template <typename T, bool UsePnEnc = false, typename Eq = uint32_t>
27+
template <typename T, typename Eq = uint32_t, bool UsePnEnc = false>
2828
cusz_error_status psz_comp_l23r(
2929
T* const data, dim3 const len3, double const eb, int const radius,
3030
Eq* const eq, void* _outlier, float* time_elapsed, cudaStream_t stream);
Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* @file compact_cuda.inl
2+
* @file compact_cu.hh
33
* @author Jiannan Tian
44
* @brief
55
* @version 0.4
@@ -32,9 +32,9 @@ struct CompactCudaDram {
3232
using type = T;
3333

3434
// `h_` for host-accessible
35-
T *val, *h_val;
36-
uint32_t *idx, *h_idx;
37-
uint32_t *num, h_num{0};
35+
T *d_val, *h_val;
36+
uint32_t *d_idx, *h_idx;
37+
uint32_t *d_num, h_num{0};
3838
size_t reserved_len;
3939

4040
// CompactCudaDram() {}
@@ -48,10 +48,10 @@ struct CompactCudaDram {
4848

4949
CompactCudaDram& malloc()
5050
{
51-
cudaMalloc(&val, sizeof(T) * reserved_len);
52-
cudaMalloc(&idx, sizeof(uint32_t) * reserved_len);
53-
cudaMalloc(&num, sizeof(uint32_t) * 1);
54-
cudaMemset(num, 0x0, sizeof(T) * 1); // init val
51+
cudaMalloc(&d_val, sizeof(T) * reserved_len);
52+
cudaMalloc(&d_idx, sizeof(uint32_t) * reserved_len);
53+
cudaMalloc(&d_num, sizeof(uint32_t) * 1);
54+
cudaMemset(d_num, 0x0, sizeof(T) * 1); // init d_val
5555

5656
return *this;
5757
}
@@ -66,7 +66,7 @@ struct CompactCudaDram {
6666

6767
CompactCudaDram& free()
6868
{
69-
cudaFree(idx), cudaFree(val), cudaFree(num);
69+
cudaFree(d_idx), cudaFree(d_val), cudaFree(d_num);
7070
return *this;
7171
}
7272

@@ -79,10 +79,10 @@ struct CompactCudaDram {
7979
// memcpy
8080
CompactCudaDram& make_host_accessible(cudaStream_t stream = 0)
8181
{
82-
cudaMemcpyAsync(&h_num, num, 1 * sizeof(uint32_t), d2h, stream);
82+
cudaMemcpyAsync(&h_num, d_num, 1 * sizeof(uint32_t), d2h, stream);
8383
cudaStreamSynchronize(stream);
84-
cudaMemcpyAsync(h_val, val, sizeof(T) * (h_num), d2h, stream);
85-
cudaMemcpyAsync(h_idx, idx, sizeof(uint32_t) * (h_num), d2h, stream);
84+
cudaMemcpyAsync(h_val, d_val, sizeof(T) * (h_num), d2h, stream);
85+
cudaMemcpyAsync(h_idx, d_idx, sizeof(uint32_t) * (h_num), d2h, stream);
8686
cudaStreamSynchronize(stream);
8787

8888
return *this;
@@ -110,6 +110,9 @@ struct CompactCudaDram {
110110

111111
// accessor
112112
uint32_t num_outliers() { return h_num; }
113+
T* val() { return d_val; }
114+
uint32_t* idx() { return d_idx; }
115+
uint32_t* num() { return d_num; }
113116
};
114117

115118
#endif /* F712F74C_7488_4445_83EE_EE7F88A64BBA */
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* @file compact_serial.inl
2+
* @file compact_ser.hh
33
* @author Jiannan Tian
44
* @brief
55
* @version 0.4

include/mem/layout_cxx.hh

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212
#ifndef DC62DA60_8211_4C93_9541_950ADEFC2820
1313
#define DC62DA60_8211_4C93_9541_950ADEFC2820
1414

15+
#include "compact_cu.hh"
1516
#include "layout.h"
16-
#include "mem/memseg.h"
17+
#include "memseg.h"
1718
#include "memseg_cxx.hh"
1819

1920
template <typename T, typename E, typename H>
@@ -34,6 +35,8 @@ class pszmempool_cxx {
3435
pszmem_cxx<T> *sv; // sp-val
3536
pszmem_cxx<M> *si; // sp-idx
3637

38+
CompactCudaDram<T> *compact;
39+
3740
pszmem_cxx<B> *_compressed; // compressed
3841

3942
size_t len;
@@ -52,8 +55,11 @@ class pszmempool_cxx {
5255
F *hist() { return ht->dptr(); }
5356
E *ectrl_lrz() { return el->dptr(); }
5457
E *ectrl_spl() { return es->dptr(); }
55-
B *compressed() { return _compressed->dptr(); };
56-
B *compressed_h() { return _compressed->hptr(); };
58+
B *compressed() { return _compressed->dptr(); }
59+
B *compressed_h() { return _compressed->hptr(); }
60+
T *compact_val() { return compact->val(); }
61+
M *compact_idx() { return compact->idx(); }
62+
M compact_num_outliers() { return compact->num_outliers(); }
5763
};
5864

5965
#define TPL template <typename T, typename E, typename H>
@@ -90,6 +96,8 @@ TPL POOL::pszmempool_cxx(_u4 x, int _radius, _u4 y, _u4 z)
9096
sv = new pszmem_cxx<T>(x, y, z, "sp-val");
9197
si = new pszmem_cxx<M>(x, y, z, "sp-idx");
9298

99+
compact = new CompactCudaDram<T>;
100+
93101
_compressed->control({Malloc, MallocHost});
94102
oc->control({Malloc, MallocHost});
95103
ac->control({Malloc, MallocHost});
@@ -98,13 +106,16 @@ TPL POOL::pszmempool_cxx(_u4 x, int _radius, _u4 y, _u4 z)
98106
sv->control({Malloc, MallocHost});
99107
si->control({Malloc, MallocHost});
100108

109+
compact->reserve_space(len / 5).control({Malloc, MallocHost});
110+
101111
el->asaviewof(e);
102112
es->asaviewof(e);
103113
}
104114

105115
TPL POOL::~pszmempool_cxx()
106116
{
107117
delete ac, delete e, delete oc, delete ht, delete sv, delete si;
118+
compact->control({Free, FreeHost});
108119
}
109120

110121
TPL POOL *POOL::clear_buffer()

src/kernel/detail/l23r.inl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
#include <type_traits>
1818

1919
#include "cusz/suint.hh"
20-
#include "pipeline/compact_cuda.inl"
20+
#include "mem/compact_cu.hh"
2121

2222
namespace psz {
2323
namespace rolling {

src/kernel/detail/lproto.inl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#include <stdexcept>
1919

2020
#include "../../utils/it_cuda.hh"
21-
#include "pipeline/compact_cuda.inl"
21+
#include "mem/compact_cu.hh"
2222
#include "utils/cuda_err.cuh"
2323
#include "utils/timer.h"
2424

@@ -48,9 +48,9 @@ __global__ void c_lorenzo_1d1l(T* in_data, dim3 len3, dim3 stride3, int radius,
4848
if (check_boundary1()) { // postquant
4949
eq[id] = quantizable * static_cast<Eq>(candidate);
5050
if (not quantizable) {
51-
auto dram_idx = atomicAdd(compact.num, 1);
52-
compact.val[dram_idx] = candidate;
53-
compact.idx[dram_idx] = id;
51+
auto dram_idx = atomicAdd(compact.d_num, 1);
52+
compact.d_val[dram_idx] = candidate;
53+
compact.d_idx[dram_idx] = id;
5454
}
5555
}
5656
}
@@ -79,9 +79,9 @@ __global__ void c_lorenzo_2d1l(T* in_data, dim3 len3, dim3 stride3, int radius,
7979
if (check_boundary2()) {
8080
eq[id] = quantizable * static_cast<Eq>(candidate);
8181
if (not quantizable) {
82-
auto dram_idx = atomicAdd(compact.num, 1);
83-
compact.val[dram_idx] = candidate;
84-
compact.idx[dram_idx] = id;
82+
auto dram_idx = atomicAdd(compact.d_num, 1);
83+
compact.d_val[dram_idx] = candidate;
84+
compact.d_idx[dram_idx] = id;
8585
}
8686
}
8787
}
@@ -112,9 +112,9 @@ __global__ void c_lorenzo_3d1l(T* in_data, dim3 len3, dim3 stride3, int radius,
112112
if (check_boundary3()) {
113113
eq[id] = quantizable * static_cast<Eq>(candidate);
114114
if (not quantizable) {
115-
auto dram_idx = atomicAdd(compact.num, 1);
116-
compact.val[dram_idx] = candidate;
117-
compact.idx[dram_idx] = id;
115+
auto dram_idx = atomicAdd(compact.d_num, 1);
116+
compact.d_val[dram_idx] = candidate;
117+
compact.d_idx[dram_idx] = id;
118118
}
119119
}
120120
}

src/kernel/detail/subroutine.inl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#include <stdint.h>
1313
#include <type_traits>
1414
#include "cusz/suint.hh"
15-
#include "pipeline/compact_cuda.inl"
15+
#include "mem/compact_cu.hh"
1616
#include "subsub.inl"
1717

1818
namespace psz {

src/kernel/l23r_cu.cu

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@
1414
#include "cusz/type.h"
1515
#include "detail/l23r.inl"
1616
#include "kernel/l23r.hh"
17-
#include "pipeline/compact_cuda.inl"
17+
#include "mem/compact_cu.hh"
1818
#include "utils/cuda_err.cuh"
1919
#include "utils/timer.h"
2020

21-
template <typename T, bool UsePnEnc, typename Eq>
21+
template <typename T, typename Eq, bool UsePnEnc>
2222
cusz_error_status psz_comp_l23r(
2323
T* const data, dim3 const len3, double const eb, int const radius,
2424
Eq* const eq, void* _outlier, float* time_elapsed, cudaStream_t stream)
@@ -72,17 +72,20 @@ cusz_error_status psz_comp_l23r(
7272
if (d == 1) {
7373
psz::rolling::c_lorenzo_1d1l<T, false, Eq, T, Tile1D, Seq1D>
7474
<<<Grid1D, Block1D, 0, stream>>>(
75-
data, len3, leap3, radius, ebx2_r, eq, ot->val, ot->idx, ot->num);
75+
data, len3, leap3, radius, ebx2_r, eq, ot->val(), ot->idx(),
76+
ot->num());
7677
}
7778
else if (d == 2) {
7879
psz::rolling::c_lorenzo_2d1l<T, false, Eq, T>
7980
<<<Grid2D, Block2D, 0, stream>>>(
80-
data, len3, leap3, radius, ebx2_r, eq, ot->val, ot->idx, ot->num);
81+
data, len3, leap3, radius, ebx2_r, eq, ot->val(), ot->idx(),
82+
ot->num());
8183
}
8284
else if (d == 3) {
8385
psz::rolling::c_lorenzo_3d1l<T, false, Eq, T>
8486
<<<Grid3D, Block3D, 0, stream>>>(
85-
data, len3, leap3, radius, ebx2_r, eq, ot->val, ot->idx, ot->num);
87+
data, len3, leap3, radius, ebx2_r, eq, ot->val(), ot->idx(),
88+
ot->num());
8689
}
8790

8891
STOP_CUDAEVENT_RECORDING(stream);
@@ -93,22 +96,22 @@ cusz_error_status psz_comp_l23r(
9396
return CUSZ_SUCCESS;
9497
}
9598

96-
template cusz_error_status psz_comp_l23r<float, false>(
99+
template cusz_error_status psz_comp_l23r<float, uint32_t, false>(
97100
float* const data, dim3 const len3, double const eb, int const radius,
98101
uint32_t* const eq, void* _outlier, float* time_elapsed,
99102
cudaStream_t stream);
100103

101-
template cusz_error_status psz_comp_l23r<float, true>(
104+
template cusz_error_status psz_comp_l23r<float, uint32_t, true>(
102105
float* const data, dim3 const len3, double const eb, int const radius,
103106
uint32_t* const eq, void* _outlier, float* time_elapsed,
104107
cudaStream_t stream);
105108

106-
template cusz_error_status psz_comp_l23r<double, false>(
109+
template cusz_error_status psz_comp_l23r<double, uint32_t, false>(
107110
double* const data, dim3 const len3, double const eb, int const radius,
108111
uint32_t* const eq, void* _outlier, float* time_elapsed,
109112
cudaStream_t stream);
110113

111-
template cusz_error_status psz_comp_l23r<double, true>(
114+
template cusz_error_status psz_comp_l23r<double, uint32_t, true>(
112115
double* const data, dim3 const len3, double const eb, int const radius,
113116
uint32_t* const eq, void* _outlier, float* time_elapsed,
114117
cudaStream_t stream);

src/kernel/lproto_cu.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
#include "cusz/type.h"
1313
#include "kernel/lproto.hh"
14-
#include "pipeline/compact_cuda.inl"
14+
#include "mem/compact_cu.hh"
1515
#include "utils/cuda_err.cuh"
1616
#include "utils/timer.h"
1717

src/pipeline/compressor.inl

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,11 @@
2525
#include "kernel/hist.hh"
2626
#include "kernel/histsp.hh"
2727
#include "kernel/l23.hh"
28+
#include "kernel/l23r.hh"
2829
#include "kernel/spv_gpu.hh"
2930
#include "mem/layout.h"
3031
#include "mem/layout_cxx.hh"
3132
#include "mem/memseg_cxx.hh"
32-
#include "kernel/hist.hh"
3333
#include "utils/config.hh"
3434
#include "utils/cuda_err.cuh"
3535

@@ -79,12 +79,10 @@ Compressor<C>* Compressor<C>::init(CONFIG* config, bool debug)
7979

8080
codec->init(len, booklen, pardeg, debug);
8181
mem = new pszmempool_cxx<T, E, H>(x, radius, y, z);
82-
82+
8383
return this;
8484
}
8585

86-
87-
8886
template <class C>
8987
Compressor<C>* Compressor<C>::compress(
9088
cusz_context* config, T* in, BYTE*& out, size_t& outlen,
@@ -113,39 +111,50 @@ Compressor<C>* Compressor<C>::compress(
113111
auto sublen = div(data_len, pardeg);
114112

115113
auto update_header = [&]() {
116-
header.x = len3.x;
117-
header.y = len3.y;
118-
header.z = len3.z;
114+
header.x = len3.x, header.y = len3.y, header.z = len3.z,
119115
header.w = 1; // placeholder
120-
header.radius = radius;
116+
header.radius = radius, header.eb = eb;
121117
header.vle_pardeg = pardeg;
122-
header.eb = eb;
123118
header.splen = splen;
124119
// header.byte_vle = use_fallback_codec ? 8 : 4;
125120
};
126121

127122
/******************************************************************************/
128123

129-
psz_comp_l23<T, E, FP>(
130-
in, len3, eb, radius, mem->ectrl_lrz(), mem->outlier_space(), &time_pred,
124+
// Below is substituted with prediction+compaction
125+
/*
126+
psz_comp_l23<T, E>(
127+
in, len3, eb, radius, mem->ectrl_lrz(), mem->outlier_space(),
128+
&time_pred, stream);
129+
*/
130+
psz_comp_l23r<T, E>(
131+
in, len3, eb, radius, mem->ectrl_lrz(), (void*)mem->compact, &time_pred,
131132
stream);
132-
psz::histogram<psz_policy::CUDA, E>(
133+
psz::histogram<CUDA, E>(
134+
mem->ectrl_lrz(), len, mem->hist(), booklen, &time_hist, stream);
135+
psz::histsp<CUDA, E>(
133136
mem->ectrl_lrz(), len, mem->hist(), booklen, &time_hist, stream);
134137
codec->build_codebook(mem->hist(), booklen, stream);
135138
codec->encode(mem->ectrl_lrz(), len, &d_codec_out, &codec_outlen, stream);
136-
psz::spv_gather<T, M>(
137-
mem->outlier_space(), len, mem->outlier_val(), mem->outlier_idx(),
138-
&splen, &time_sp, stream);
139+
/*
140+
psz::spv_gather<T, M>(
141+
mem->outlier_space(), len, mem->outlier_val(), mem->outlier_idx(),
142+
&splen, &time_sp, stream);
143+
*/
139144

140-
/* debug */ CHECK_CUDA(cudaStreamSynchronize(stream));
145+
mem->compact->make_host_accessible(stream);
146+
147+
splen = mem->compact->num_outliers();
148+
149+
// /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream));
141150

142151
/******************************************************************************/
143152

144153
update_header();
145154

146155
merge_subfiles(
147-
d_codec_out, codec_outlen, mem->outlier_val(), mem->outlier_idx(), splen,
148-
stream);
156+
d_codec_out, codec_outlen, mem->compact_val(), mem->compact_idx(),
157+
mem->compact->num_outliers(), stream);
149158

150159
// output
151160
outlen = psz_utils::filesize(&header);

0 commit comments

Comments
 (0)