InfiniTensor
diff --git a/‎env.sh‎
Lines changed: 0 additions & 1 deletion b/‎env.sh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎include/ops/clip/clip.h‎
Lines changed: 1 addition & 1 deletion b/‎include/ops/clip/clip.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/ops/gather/gather.h‎
Lines changed: 1 addition & 1 deletion b/‎include/ops/gather/gather.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/ops/reducemax/reducemax.h‎
Lines changed: 1 addition & 1 deletion b/‎include/ops/reducemax/reducemax.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/ops/reducemean/reducemean.h‎
Lines changed: 1 addition & 1 deletion b/‎include/ops/reducemean/reducemean.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/ops/reducemin/reducemin.h‎
Lines changed: 1 addition & 1 deletion b/‎include/ops/reducemin/reducemin.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/ops/where/where.h‎
Lines changed: 1 addition & 1 deletion b/‎include/ops/where/where.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎operatorspy/tests/gather.py‎
Lines changed: 35 additions & 20 deletions b/‎operatorspy/tests/gather.py‎
Lines changed: 35 additions & 20 deletions
diff --git a/‎src/ops/clip/cuda/clip_cuda.cu‎
Lines changed: 11 additions & 38 deletions b/‎src/ops/clip/cuda/clip_cuda.cu‎
Lines changed: 11 additions & 38 deletions
diff --git a/‎src/ops/clip/cuda/clip_cuda.h‎
Lines changed: 1 addition & 1 deletion b/‎src/ops/clip/cuda/clip_cuda.h‎
Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ __C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t hand
                                                                 infiniopTensorDescriptor_t y
                                                                 );
 
-__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void *x, float *min, float *max, void *y, void *stream);
+__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, float *min, float *max, void *y, void *stream); // x is now pointer-to-const; NOTE(review): min/max remain non-const float pointers - confirm whether they are inputs only
 
 __C __export infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
 
 
@@ -17,7 +17,7 @@ __C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t ha
                                                                 int64_t axis
                                                                 );
 
-__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void *x, void *indices, void *y, void *stream);
+__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void const *x, void const *indices, void *y, void *stream); // x and indices are pointer-to-const; y is the only non-const data pointer
 
 __C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc);
 
 
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemaxDescriptor(infiniopHandle_t
                                                                 bool noop_with_empty_axes
                                                                 );
 
-__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
+__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, const void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); // x is now pointer-to-const; NOTE(review): dynamic_axes is still non-const, and this header spells the qualifier `const void *` while clip/gather/where use `void const *` - confirm intended
 
 __C __export infiniopStatus_t infiniopDestroyReducemaxDescriptor(infiniopReducemaxDescriptor_t desc);
 #endif
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemeanDescriptor(infiniopHandle_
                                                                 bool noop_with_empty_axes
                                                                 );
 
-__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
+__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, const void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); // src is now pointer-to-const; NOTE(review): dynamic_axes is still non-const, and this header spells the qualifier `const void *` while clip/gather/where use `void const *` - confirm intended
 
 __C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc);
 #endif
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReduceminDescriptor(infiniopHandle_t
                                                                 bool noop_with_empty_axes
                                                                 );
 
-__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
+__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, const void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); // src is now pointer-to-const; NOTE(review): dynamic_axes is still non-const, and this header spells the qualifier `const void *` while clip/gather/where use `void const *` - confirm intended
 
 __C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc);
 #endif
@@ -17,7 +17,7 @@ __C __export infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t han
                                                             infiniopTensorDescriptor_t condition
                                                             );
 
-__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, void *src1, void *src2, void *condition, void *stream);
+__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, void const *src1, void const *src2, void const *condition, void *stream); // src1, src2 and condition are pointer-to-const; dst is the only non-const data pointer
 
 __C __export infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc);
 
 
@@ -30,11 +30,10 @@ class GatherDescriptor(Structure):
 
 infiniopGatherDescriptor_t = POINTER(GatherDescriptor)
 
-def gather(input, indices, axis):
-    np_input = input.numpy()
-    np_indices = indices.numpy()
-    np_output = np.take(np_input, np_indices, axis=axis)
-    return torch.from_numpy(np_output)
+def gather(x, indices, axis = 0):  # reference result: pick entries of x along `axis` at the positions given by `indices`
+    idx = [slice(None)] * x.ndim  # one full slice per dimension of x
+    idx[axis] = indices  # swap the slice on `axis` for `indices`
+    return x[tuple(idx)]  # index x with the mixed slices-and-indices tuple
 
 def tuple_to_void_p(py_tuple: Tuple):
     array = ctypes.c_int64 * len(py_tuple)
@@ -55,16 +54,19 @@ def test(
     tensor_dtype=torch.float16
 ):
     print(
-        f"Testing clip on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}"
+        f"Testing gather on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}"
     )
     x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device)
-    if len(x.shape) == 2:
-        indices = torch.tensor(2, dtype=torch.int64, device=torch_device)
-    elif len(x.shape) == 3:
-        indices = torch.tensor([[0, 1], [1, 2]], dtype=torch.int64, device=torch_device)
+    if isinstance(indices_shape, int):
+        indices_shape_tuple = (indices_shape,)
+    else:
+        indices_shape_tuple = tuple(indices_shape)
+    indices = torch.randint(0, x.shape[axis], indices_shape_tuple, 
+                       device=torch_device).type(torch.int64)
     dst = torch.randn(inferShape(x_shape, indices.shape, axis), dtype=tensor_dtype, device=torch_device)
+
     ans = gather(x, indices, axis)
-    axis = axis
+
     x_tensor = to_tensor(x, lib)
     indices_tensor = to_tensor(indices, lib)
     dst_tensor = to_tensor(dst, lib)
@@ -106,25 +108,35 @@ def test(
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"lib time: {elapsed :10f}")
-    print(f"pytorch ans: {ans}")
-    print(f"lib ans: {dst}")
+    ans = ans.to(torch_device)
     assert torch.allclose(dst, ans, atol=0, rtol=0)
     check_error(lib.infiniopDestroyGatherDescriptor(descriptor))
 
 def test_cpu(lib, test_cases):  # run every gather test case against a CPU handle
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)  # one handle shared by all cases
-    for x_shape, indices_shape, axis in test_cases:
-        test(lib, handle, "cpu", x_shape, indices_shape, axis, tensor_dtype=torch.float16)
-        print("\n")
-        #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
+    for x_shape, indices_shape, axis, tensor_dtype in test_cases:  # each case is a 4-tuple; the dtype now comes from the case
+        test(lib, handle, "cpu", x_shape, indices_shape, axis, tensor_dtype=tensor_dtype)  # "cpu" is passed as the torch device string
+    destroy_handle(lib, handle)  # release the handle after all cases have run
+
+def test_cuda(lib, test_cases):  # run every gather test case against a CUDA handle
+    device = DeviceEnum.DEVICE_CUDA
+    handle = create_handle(lib, device)  # one handle shared by all cases
+    for x_shape, indices_shape, axis, tensor_dtype in test_cases:  # each case is a 4-tuple: shapes, axis and dtype
+        test(lib, handle, "cuda", x_shape, indices_shape, axis, tensor_dtype=tensor_dtype)  # "cuda" is passed as the torch device string
     destroy_handle(lib, handle)  # release the handle after all cases have run
 
 
 if __name__ == "__main__":
     test_cases = [
-        ((3, 4), (2), 0),
-        ((2, 3, 4), (2, 2), 1),
+        ((3, 4), (2), 0, torch.float32),
+        ((64, 64), (64, 64), 0, torch.float32),
+        ((64, 64), (64, 64), 1, torch.float32),
+        ((2, 3, 4), (2, 2), 1, torch.float32),
+        ((64, 64), (64, 64), 0, torch.float16),
+        ((64, 64), (64, 64), 1, torch.float16),
+        ((8, 8, 8, 8, 8), (8, 8), 0, torch.float16),
+        ((8, 8, 8, 8, 8), (8, 8), 2, torch.float16),
     ]
     args = get_args()
     lib = open_lib()
@@ -144,5 +156,8 @@ def test_cpu(lib, test_cases):
     ]
     lib.infiniopDestroyGatherDescriptor.restype = c_int32
     lib.infiniopDestroyGatherDescriptor.argtypes = [infiniopGatherDescriptor_t]
-    test_cpu(lib, test_cases)
+    if args.cuda:
+        test_cuda(lib, test_cases)
+    if args.cpu:
+        test_cpu(lib, test_cases)
     print("All tests passed!")
@@ -7,13 +7,16 @@
 #define LDST128BITS(value) (reinterpret_cast<float4*>(&(value))[0])
 #define FLOAT4(value) (reinterpret_cast<float4*>(&(value))[0])
 
-__global__ void clip_f32x4_kernel(float *a, float *b, float max_value, float min_value, int N){
+#define LDST128BITS_CONST(value) (reinterpret_cast<float4 const *>(&(value))[0]) // read-only counterpart of LDST128BITS: views the address of `value` as a const float4 and yields its first element
+#define FLOAT4_CONST(value) (reinterpret_cast<float4 const *>(&(value))[0]) // read-only counterpart of FLOAT4; expands to the same expression as LDST128BITS_CONST
+
+__global__ void clip_f32x4_kernel(const float *a, float *b, float max_value, float min_value, int N){
     int idx = 4 * (blockDim.x * blockIdx.x + threadIdx.x);
     if (idx < N) {
         int remaining = N - idx;
         float4 reg_a, reg_b;
         if (remaining >= 4) {
-        reg_a = FLOAT4(a[idx]);
+        reg_a = FLOAT4_CONST(a[idx]);
         } else {
             reg_a.x = a[idx];
             reg_a.y = (remaining >= 2) ? a[idx + 1] : 0;
@@ -35,14 +38,14 @@ __global__ void clip_f32x4_kernel(float *a, float *b, float max_value, float min
 }
 
 
-__global__ void clip_f16x8_pack_kernel(half *a, half *b, float max_value, float min_value, int N){
+__global__ void clip_f16x8_pack_kernel(const half *a, half *b, float max_value, float min_value, int N){
     int idx = 8 * (blockDim.x * blockIdx.x + threadIdx.x);
     if (idx >= N) return;
     const half min_half = __float2half(min_value);
     const half max_half = __float2half(max_value);
     half pack_a[8], pack_b[8];
     if (idx + 7 < N) {
-        LDST128BITS(pack_a[0]) = LDST128BITS(a[idx]);
+        LDST128BITS(pack_a[0]) = LDST128BITS_CONST(a[idx]);
     } else {
         for (int i = 0; i < 8 && (idx + i) < N; i++) {
             pack_a[i] = a[idx + i];
@@ -65,7 +68,7 @@ __global__ void clip_f16x8_pack_kernel(half *a, half *b, float max_value, float
 template<typename Tdata>
 infiniopStatus_t clip_nv_gpu(
     ClipCudaDescriptor_t desc,
-    void *x,
+    void const *x,
     void *y,
     float min_value,
     float max_value,
@@ -75,45 +78,15 @@ infiniopStatus_t clip_nv_gpu(
     dim3 block(256 / per_thread_element);
     dim3 grid((N + 256 - 1) / 256);
     if constexpr(std::is_same<Tdata, float>::value){
-        clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<float *>(x), reinterpret_cast<float *>(y), max_value, min_value, N);
-    }else{
-        clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<half *>(x), reinterpret_cast<half *>(y), max_value, min_value, N);
-    }
-    /*
-    if (desc->ndim != 2){
-        dim3 block(256 / per_thread_element);
-        dim3 grid((N + 256 - 1) / 256);
-        if constexpr(std::is_same<Tdata, float>::value){
-            clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<float *>(x), reinterpret_cast<float *>(y), max_value, min_value, N);
-        }else{
-            clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<half *>(x), reinterpret_cast<half *>(y), max_value, min_value, N);
-        }
+        clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<const float *>(x), reinterpret_cast<float *>(y), max_value, min_value, N);
     }else{
-        if ((desc->K / per_thread_element) <= 1024){
-            dim3 block(desc->K / (per_thread_element));                                   
-            dim3 grid(desc->S);
-            if constexpr(std::is_same<Tdata, float>::value){
-                clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<float *>(x), reinterpret_cast<float *>(y), max_value, min_value, N);
-            }else{
-                clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<half *>(x), reinterpret_cast<half *>(y), max_value, min_value, N);
-            }
-        }
-        else{
-            dim3 block(256 / per_thread_element);
-            dim3 grid((N + 256 - 1) / 256);
-            if constexpr(std::is_same<Tdata, float>::value){
-                clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<float *>(x), reinterpret_cast<float *>(y), min_value, max_value, N);
-            }else{
-                clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<half *>(x), reinterpret_cast<half *>(y), min_value, max_value, N);
-            }
-        }
+        clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<const half *>(x), reinterpret_cast<half *>(y), max_value, min_value, N);
     }
-        */
     return STATUS_SUCCESS;
 }
 
 infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc,
-    void *x,
+    void const *x,
     void *y,
     float *min,
     float *max,
 
@@ -22,7 +22,7 @@ infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle,
 
 
 infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc,
-                            void *x,
+                            void const *x,
                             void *y,
                             float *min,
                             float *max,