Skip to content

Commit eba3c39

Browse files
committed
将所有算子函数输入参数修改为const类型, 实现gather算子的cuda版本
1 parent d98e6a9 commit eba3c39

File tree

31 files changed

+362
-189
lines changed

31 files changed

+362
-189
lines changed

env.sh

Lines changed: 0 additions & 1 deletion
This file was deleted.

include/ops/clip/clip.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ __C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t hand
1515
infiniopTensorDescriptor_t y
1616
);
1717

18-
__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void *x, float *min, float *max, void *y, void *stream);
18+
__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, float *min, float *max, void *y, void *stream);
1919

2020
__C __export infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
2121

include/ops/gather/gather.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ __C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t ha
1717
int64_t axis
1818
);
1919

20-
__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void *x, void *indices, void *y, void *stream);
20+
__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void const *x, void const *indices, void *y, void *stream);
2121

2222
__C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc);
2323

include/ops/reducemax/reducemax.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemaxDescriptor(infiniopHandle_t
1919
bool noop_with_empty_axes
2020
);
2121

22-
__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
22+
__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, const void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
2323

2424
__C __export infiniopStatus_t infiniopDestroyReducemaxDescriptor(infiniopReducemaxDescriptor_t desc);
2525
#endif

include/ops/reducemean/reducemean.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemeanDescriptor(infiniopHandle_
1919
bool noop_with_empty_axes
2020
);
2121

22-
__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
22+
__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, const void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
2323

2424
__C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc);
2525
#endif

include/ops/reducemin/reducemin.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReduceminDescriptor(infiniopHandle_t
1919
bool noop_with_empty_axes
2020
);
2121

22-
__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
22+
__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, const void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
2323

2424
__C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc);
2525
#endif

include/ops/where/where.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ __C __export infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t han
1717
infiniopTensorDescriptor_t condition
1818
);
1919

20-
__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, void *src1, void *src2, void *condition, void *stream);
20+
__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, void const *src1, void const *src2, void const *condition, void *stream);
2121

2222
__C __export infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc);
2323

operatorspy/tests/gather.py

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,10 @@ class GatherDescriptor(Structure):
3030

3131
infiniopGatherDescriptor_t = POINTER(GatherDescriptor)
3232

33-
def gather(input, indices, axis):
34-
np_input = input.numpy()
35-
np_indices = indices.numpy()
36-
np_output = np.take(np_input, np_indices, axis=axis)
37-
return torch.from_numpy(np_output)
33+
def gather(x, indices, axis = 0):
34+
idx = [slice(None)] * x.ndim
35+
idx[axis] = indices
36+
return x[tuple(idx)]
3837

3938
def tuple_to_void_p(py_tuple: Tuple):
4039
array = ctypes.c_int64 * len(py_tuple)
@@ -55,16 +54,19 @@ def test(
5554
tensor_dtype=torch.float16
5655
):
5756
print(
58-
f"Testing clip on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}"
57+
f"Testing gather on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}"
5958
)
6059
x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device)
61-
if len(x.shape) == 2:
62-
indices = torch.tensor(2, dtype=torch.int64, device=torch_device)
63-
elif len(x.shape) == 3:
64-
indices = torch.tensor([[0, 1], [1, 2]], dtype=torch.int64, device=torch_device)
60+
if isinstance(indices_shape, int):
61+
indices_shape_tuple = (indices_shape,)
62+
else:
63+
indices_shape_tuple = tuple(indices_shape)
64+
indices = torch.randint(0, x.shape[axis], indices_shape_tuple,
65+
device=torch_device).type(torch.int64)
6566
dst = torch.randn(inferShape(x_shape, indices.shape, axis), dtype=tensor_dtype, device=torch_device)
67+
6668
ans = gather(x, indices, axis)
67-
axis = axis
69+
6870
x_tensor = to_tensor(x, lib)
6971
indices_tensor = to_tensor(indices, lib)
7072
dst_tensor = to_tensor(dst, lib)
@@ -106,25 +108,35 @@ def test(
106108
)
107109
elapsed = (time.time() - start_time) / NUM_ITERATIONS
108110
print(f"lib time: {elapsed :10f}")
109-
print(f"pytorch ans: {ans}")
110-
print(f"lib ans: {dst}")
111+
ans = ans.to(torch_device)
111112
assert torch.allclose(dst, ans, atol=0, rtol=0)
112113
check_error(lib.infiniopDestroyGatherDescriptor(descriptor))
113114

114115
def test_cpu(lib, test_cases):
115116
device = DeviceEnum.DEVICE_CPU
116117
handle = create_handle(lib, device)
117-
for x_shape, indices_shape, axis in test_cases:
118-
test(lib, handle, "cpu", x_shape, indices_shape, axis, tensor_dtype=torch.float16)
119-
print("\n")
120-
#test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
118+
for x_shape, indices_shape, axis, tensor_dtype in test_cases:
119+
test(lib, handle, "cpu", x_shape, indices_shape, axis, tensor_dtype=tensor_dtype)
120+
destroy_handle(lib, handle)
121+
122+
def test_cuda(lib, test_cases):
123+
device = DeviceEnum.DEVICE_CUDA
124+
handle = create_handle(lib, device)
125+
for x_shape, indices_shape, axis, tensor_dtype in test_cases:
126+
test(lib, handle, "cuda", x_shape, indices_shape, axis, tensor_dtype=tensor_dtype)
121127
destroy_handle(lib, handle)
122128

123129

124130
if __name__ == "__main__":
125131
test_cases = [
126-
((3, 4), (2), 0),
127-
((2, 3, 4), (2, 2), 1),
132+
((3, 4), (2), 0, torch.float32),
133+
((64, 64), (64, 64), 0, torch.float32),
134+
((64, 64), (64, 64), 1, torch.float32),
135+
((2, 3, 4), (2, 2), 1, torch.float32),
136+
((64, 64), (64, 64), 0, torch.float16),
137+
((64, 64), (64, 64), 1, torch.float16),
138+
((8, 8, 8, 8, 8), (8, 8), 0, torch.float16),
139+
((8, 8, 8, 8, 8), (8, 8), 2, torch.float16),
128140
]
129141
args = get_args()
130142
lib = open_lib()
@@ -144,5 +156,8 @@ def test_cpu(lib, test_cases):
144156
]
145157
lib.infiniopDestroyGatherDescriptor.restype = c_int32
146158
lib.infiniopDestroyGatherDescriptor.argtypes = [infiniopGatherDescriptor_t]
147-
test_cpu(lib, test_cases)
159+
if args.cuda:
160+
test_cuda(lib, test_cases)
161+
if args.cpu:
162+
test_cpu(lib, test_cases)
148163
print("All tests passed!")

src/ops/clip/cuda/clip_cuda.cu

Lines changed: 11 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,16 @@
77
#define LDST128BITS(value) (reinterpret_cast<float4*>(&(value))[0])
88
#define FLOAT4(value) (reinterpret_cast<float4*>(&(value))[0])
99

10-
__global__ void clip_f32x4_kernel(float *a, float *b, float max_value, float min_value, int N){
10+
#define LDST128BITS_CONST(value) (reinterpret_cast<float4 const *>(&(value))[0])
11+
#define FLOAT4_CONST(value) (reinterpret_cast<float4 const *>(&(value))[0])
12+
13+
__global__ void clip_f32x4_kernel(const float *a, float *b, float max_value, float min_value, int N){
1114
int idx = 4 * (blockDim.x * blockIdx.x + threadIdx.x);
1215
if (idx < N) {
1316
int remaining = N - idx;
1417
float4 reg_a, reg_b;
1518
if (remaining >= 4) {
16-
reg_a = FLOAT4(a[idx]);
19+
reg_a = FLOAT4_CONST(a[idx]);
1720
} else {
1821
reg_a.x = a[idx];
1922
reg_a.y = (remaining >= 2) ? a[idx + 1] : 0;
@@ -35,14 +38,14 @@ __global__ void clip_f32x4_kernel(float *a, float *b, float max_value, float min
3538
}
3639

3740

38-
__global__ void clip_f16x8_pack_kernel(half *a, half *b, float max_value, float min_value, int N){
41+
__global__ void clip_f16x8_pack_kernel(const half *a, half *b, float max_value, float min_value, int N){
3942
int idx = 8 * (blockDim.x * blockIdx.x + threadIdx.x);
4043
if (idx >= N) return;
4144
const half min_half = __float2half(min_value);
4245
const half max_half = __float2half(max_value);
4346
half pack_a[8], pack_b[8];
4447
if (idx + 7 < N) {
45-
LDST128BITS(pack_a[0]) = LDST128BITS(a[idx]);
48+
LDST128BITS(pack_a[0]) = LDST128BITS_CONST(a[idx]);
4649
} else {
4750
for (int i = 0; i < 8 && (idx + i) < N; i++) {
4851
pack_a[i] = a[idx + i];
@@ -65,7 +68,7 @@ __global__ void clip_f16x8_pack_kernel(half *a, half *b, float max_value, float
6568
template<typename Tdata>
6669
infiniopStatus_t clip_nv_gpu(
6770
ClipCudaDescriptor_t desc,
68-
void *x,
71+
void const *x,
6972
void *y,
7073
float min_value,
7174
float max_value,
@@ -75,45 +78,15 @@ infiniopStatus_t clip_nv_gpu(
7578
dim3 block(256 / per_thread_element);
7679
dim3 grid((N + 256 - 1) / 256);
7780
if constexpr(std::is_same<Tdata, float>::value){
78-
clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<float *>(x), reinterpret_cast<float *>(y), max_value, min_value, N);
79-
}else{
80-
clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<half *>(x), reinterpret_cast<half *>(y), max_value, min_value, N);
81-
}
82-
/*
83-
if (desc->ndim != 2){
84-
dim3 block(256 / per_thread_element);
85-
dim3 grid((N + 256 - 1) / 256);
86-
if constexpr(std::is_same<Tdata, float>::value){
87-
clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<float *>(x), reinterpret_cast<float *>(y), max_value, min_value, N);
88-
}else{
89-
clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<half *>(x), reinterpret_cast<half *>(y), max_value, min_value, N);
90-
}
81+
clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<const float *>(x), reinterpret_cast<float *>(y), max_value, min_value, N);
9182
}else{
92-
if ((desc->K / per_thread_element) <= 1024){
93-
dim3 block(desc->K / (per_thread_element));
94-
dim3 grid(desc->S);
95-
if constexpr(std::is_same<Tdata, float>::value){
96-
clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<float *>(x), reinterpret_cast<float *>(y), max_value, min_value, N);
97-
}else{
98-
clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<half *>(x), reinterpret_cast<half *>(y), max_value, min_value, N);
99-
}
100-
}
101-
else{
102-
dim3 block(256 / per_thread_element);
103-
dim3 grid((N + 256 - 1) / 256);
104-
if constexpr(std::is_same<Tdata, float>::value){
105-
clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<float *>(x), reinterpret_cast<float *>(y), min_value, max_value, N);
106-
}else{
107-
clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<half *>(x), reinterpret_cast<half *>(y), min_value, max_value, N);
108-
}
109-
}
83+
clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<const half *>(x), reinterpret_cast<half *>(y), max_value, min_value, N);
11084
}
111-
*/
11285
return STATUS_SUCCESS;
11386
}
11487

11588
infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc,
116-
void *x,
89+
void const *x,
11790
void *y,
11891
float *min,
11992
float *max,

src/ops/clip/cuda/clip_cuda.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle,
2222

2323

2424
infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc,
25-
void *x,
25+
void const *x,
2626
void *y,
2727
float *min,
2828
float *max,

0 commit comments

Comments
 (0)