1 change: 1 addition & 0 deletions .github/workflows/main.yaml
@@ -23,6 +23,7 @@ jobs:

- name: Install Python dependencies
run: |
pip install numpy
pip install torch

- name: Install xmake
1 change: 1 addition & 0 deletions include/device.h
@@ -6,6 +6,7 @@ enum DeviceEnum {
DevNvGpu,
DevCambriconMlu,
DevAscendNpu,
DevTecoSDAA,
};

typedef enum DeviceEnum Device;
78 changes: 78 additions & 0 deletions include/infinirt.h
@@ -0,0 +1,78 @@
#ifndef INFINI_RUNTIME_H
#define INFINI_RUNTIME_H

#if defined(_WIN32)
#define __export __declspec(dllexport)
#elif defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
#define __export __attribute__((visibility("default")))
#else
#define __export
#endif

#ifdef __cplusplus
#define __C extern "C"
#else
#define __C
#endif
#include <stddef.h>
#include <stdint.h>

typedef enum
{
DEVICE_CPU,
DEVICE_NVIDIA,
DEVICE_CAMBRICON,
DEVICE_ASCEND,
DEVICE_TECO,
} DeviceType;

typedef enum
{
INFINIRT_STATUS_SUCCESS = 0,
INFINIRT_STATUS_EXECUTION_FAILED = 1,
INFINIRT_STATUS_BAD_DEVICE = 2,
INFINIRT_STATUS_DEVICE_NOT_SUPPORTED = 3,
INFINIRT_STATUS_DEVICE_MISMATCH = 4,
INFINIRT_STATUS_INVALID_ARGUMENT = 5,
INFINIRT_STATUS_ILLEGAL_MEMORY_ACCESS = 6,
INFINIRT_STATUS_NOT_READY = 7,
} infinirtStatus_t;

__C __export infinirtStatus_t infinirtInit(DeviceType device);

// Device
__C __export infinirtStatus_t infinirtDeviceSynchronize(DeviceType device, uint32_t deviceId);

// Stream
struct infinirtStream;
typedef struct infinirtStream *infinirtStream_t;
#define INFINIRT_NULL_STREAM nullptr
__C __export infinirtStatus_t infinirtStreamCreate(infinirtStream_t *pStream, DeviceType device, uint32_t deviceId);
__C __export infinirtStatus_t infinirtStreamDestroy(infinirtStream_t stream);
__C __export infinirtStatus_t infinirtStreamSynchronize(infinirtStream_t stream);
__C __export infinirtStatus_t infinirtGetRawStream(void** ptr, infinirtStream_t stream);
__C __export infinirtStatus_t infinirtGetStreamDeviceInfo(DeviceType* deviceType, uint32_t *deviceId, infinirtStream_t stream);

// Event
struct infinirtEvent;
typedef struct infinirtEvent *infinirtEvent_t;
__C __export infinirtStatus_t infinirtEventCreate(infinirtEvent_t *pEvent, DeviceType device, uint32_t deviceId);
__C __export infinirtStatus_t infinirtEventRecord(infinirtEvent_t event, infinirtStream_t stream);
__C __export infinirtStatus_t infinirtEventQuery(infinirtEvent_t event);
__C __export infinirtStatus_t infinirtEventSynchronize(infinirtEvent_t event);
__C __export infinirtStatus_t infinirtEventDestroy(infinirtEvent_t event);
__C __export infinirtStatus_t infinirtStreamWaitEvent(infinirtEvent_t event, infinirtStream_t stream);

// Memory
__C __export infinirtStatus_t infinirtMalloc(void **pMemory, DeviceType device, uint32_t deviceId, size_t size);
__C __export infinirtStatus_t infinirtMallocAsync(void **pMemory, DeviceType device, uint32_t deviceId, size_t size, infinirtStream_t stream);
__C __export infinirtStatus_t infinirtMallocHost(void **pMemory, DeviceType device, uint32_t deviceId, size_t size);
__C __export infinirtStatus_t infinirtFree(void *ptr, DeviceType device, uint32_t deviceId);
__C __export infinirtStatus_t infinirtFreeAsync(void *ptr, DeviceType device, uint32_t deviceId, infinirtStream_t stream);
__C __export infinirtStatus_t infinirtFreeHost(void *ptr, DeviceType device, uint32_t deviceId);
__C __export infinirtStatus_t infinirtMemcpyH2D(void *dst, DeviceType device, uint32_t deviceId, const void *src, size_t size);
__C __export infinirtStatus_t infinirtMemcpyH2DAsync(void *dst, DeviceType device, uint32_t deviceId, const void *src, size_t size, infinirtStream_t stream);
__C __export infinirtStatus_t infinirtMemcpyD2H(void *dst, const void* src, DeviceType device, uint32_t deviceId, size_t size);
__C __export infinirtStatus_t infinirtMemcpy(void *dst, const void* src, DeviceType device, uint32_t deviceId, size_t size);
__C __export infinirtStatus_t infinirtMemcpyAsync(void *dst, const void* src, DeviceType device, uint32_t deviceId, size_t size, infinirtStream_t stream);
#endif
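
For context, a minimal usage sketch of the runtime API declared above (not part of this PR). The NVIDIA backend, device id 0, and the buffer size are illustrative assumptions; error handling is shown only for the init call.

#include <stdio.h>
#include <string.h>
#include "infinirt.h"

int main(void) {
    // Initialize the runtime for one backend (assumption: NVIDIA, device 0).
    if (infinirtInit(DEVICE_NVIDIA) != INFINIRT_STATUS_SUCCESS) {
        fprintf(stderr, "infinirtInit failed\n");
        return 1;
    }

    // Create a stream on device 0.
    infinirtStream_t stream;
    infinirtStreamCreate(&stream, DEVICE_NVIDIA, 0);

    // Allocate device memory and copy a host buffer to it asynchronously.
    float host[1024];
    memset(host, 0, sizeof(host));
    void *dev = NULL;
    infinirtMallocAsync(&dev, DEVICE_NVIDIA, 0, sizeof(host), stream);
    infinirtMemcpyH2DAsync(dev, DEVICE_NVIDIA, 0, host, sizeof(host), stream);

    // Record an event after the copy and wait for it to complete.
    infinirtEvent_t event;
    infinirtEventCreate(&event, DEVICE_NVIDIA, 0);
    infinirtEventRecord(event, stream);
    infinirtEventSynchronize(event);

    // Release resources once the stream has drained.
    infinirtFreeAsync(dev, DEVICE_NVIDIA, 0, stream);
    infinirtStreamSynchronize(stream);
    infinirtEventDestroy(event);
    infinirtStreamDestroy(stream);
    return 0;
}
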
1 change: 1 addition & 0 deletions operatorspy/devices.py
@@ -3,3 +3,4 @@ class DeviceEnum:
DEVICE_CUDA = 1
DEVICE_BANG = 2
DEVICE_ASCEND = 3
DEVICE_TECO = 4
1 change: 1 addition & 0 deletions operatorspy/liboperators.py
@@ -43,6 +43,7 @@ def find_library_in_ld_path(library_name):
paths = ld_library_path.split(os.pathsep)
for path in paths:
full_path = os.path.join(path, library_name)
print(full_path)
if os.path.isfile(full_path):
return full_path
return None
48 changes: 43 additions & 5 deletions operatorspy/tests/matmul.py
@@ -77,13 +77,15 @@ def test(

ans = matmul(c, beta, a, b, alpha)


if a_stride is not None:
a = rearrange_tensor(a, a_stride)
if b_stride is not None:
b = rearrange_tensor(b, b_stride)
if c_stride is not None:
c = rearrange_tensor(c, c_stride)

ans = matmul(c, beta, a, b, alpha)

a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib)
c_tensor = to_tensor(c, lib)
@@ -99,7 +101,7 @@ def test(
beta
)
)

print(a.stride(),b.stride(),c.stride())
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
@@ -117,8 +119,7 @@ def test(
None,
)
)

assert torch.allclose(c, ans, atol=0, rtol=1e-2)
assert torch.allclose(c, ans, atol=0, rtol=1e-3)

if PROFILE:
for i in range(NUM_PRERUN):
@@ -157,6 +158,7 @@ def test(
print(f" lib time: {elapsed :6f}")

check_error(lib.infiniopDestroyMatmulDescriptor(descriptor))
print("Test passed!")


def test_cpu(lib, test_cases):
@@ -292,6 +294,40 @@ def test_ascend(lib, test_cases):

destroy_handle(lib, handle)

def test_sdaa(lib, test_cases):
import torch_sdaa

device = DeviceEnum.DEVICE_TECO
handle = create_handle(lib, device)

for (
alpha,
beta,
a_shape,
b_shape,
c_shape,
a_stride,
b_stride,
c_stride,
dtype,
) in test_cases:
test(
lib,
handle,
"sdaa",
alpha,
beta,
a_shape,
b_shape,
c_shape,
a_stride,
b_stride,
c_stride,
dtype,
)

destroy_handle(lib, handle)

if __name__ == "__main__":
test_cases = [
# alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride, dtype
@@ -352,4 +388,6 @@ def test_ascend(lib, test_cases):
test_ascend(lib, test_cases)
if not (args.cpu or args.cuda or args.bang or args.ascend):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
if args.teco:
test_sdaa(lib,test_cases)
print("Test passed!")
33 changes: 33 additions & 0 deletions operatorspy/tests/mlp.py
@@ -240,6 +240,37 @@ def test_bang(lib, test_cases):

destroy_handle(lib, handle)

def test_sdaa(lib, test_cases):
import torch_sdaa

device = DeviceEnum.DEVICE_TECO
handle = create_handle(lib, device)

for (
num_tokens,
hidden_size,
intermediate_size,
alpha,
residual,
dtype,
x_stride,
y_stride,
) in test_cases:
test(
lib,
handle,
"sdaa",
num_tokens,
hidden_size,
intermediate_size,
alpha,
residual,
dtype,
x_stride,
y_stride,
)

destroy_handle(lib, handle)

if __name__ == "__main__":
test_cases = [
@@ -307,4 +338,6 @@ def test_bang(lib, test_cases):
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
if args.teco:
test_sdaa(lib,test_cases)
print("Test passed!")
36 changes: 17 additions & 19 deletions operatorspy/tests/random_sample.py
@@ -63,8 +63,6 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
else:
end = topk



sum_s = 0
for i in range(end):
sum_s += dataNp[i]
@@ -78,12 +76,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):

def random_sample_0(data):
return torch.argmax(data)

def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16):
print(
f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}"
)

data = torch.rand((voc), dtype=x_dtype).to(torch_device)
data = torch.arange(voc).float() * 0.0001
_perm = torch.randperm(voc)
data = data[_perm].to(x_dtype).to(torch_device)
if(topp > 0 and topk > 1):
ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu")
else:
@@ -130,12 +130,9 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
if torch_device == "npu":
torch.npu.synchronize()

assert indices[0].type(ans.dtype) == ans or abs(data[indices[0]] - data[ans]) == 0.0, "compute error"



assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]]
check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor))

print("Test passed!")

def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
@@ -176,15 +173,16 @@ def test_ascend(lib, test_cases):
if __name__ == "__main__":
test_cases = [
# voc, random_val, topp, topk, temperature
(512, 0.92, 0.8, 3, 0.5),
(4096, 0.95, 0.9, 5, 1.0),
(16384, 0.85, 0.85, 10, 2.0),
(512, 0.92, 0, 3, 0.5),
(4096, 0.95, 0.9, 1, 1.0),
(16384, 0.85, 0, 1, 2.0),
(16384, 0.85, 0, 1, 2.0),
(32000, 0.8, 0.8, 50, 1.0),
(32000, 0.8, 1.0, 25, 1.0),
(512, 0.8, 0.8, 3, 0.5),
(4096, 0.05, 0.9, 5, 1.0),
(16384, 0.15, 0.85, 10, 2.0),
(512, 0.08, 0, 3, 0.5),
(4096, 0.5, 0.9, 1, 1.0),
(16384, 0.15, 0, 1, 2.0),
(16384, 0.15, 0, 1, 2.0),
(32000, 0.08, 0.8, 50, 1.0),
(32000, 0.08, 1.0, 25, 1.0),
# (119696, 0.01, 1.0, 100, 1.0),
]

args = get_args()
@@ -228,4 +226,4 @@ def test_ascend(lib, test_cases):
test_ascend(lib, test_cases)
if not (args.cpu or args.cuda or args.bang or args.ascend):
test_cpu(lib, test_cases)
print("Test passed!")
print("\033[92mTest passed!\033[0m")
13 changes: 13 additions & 0 deletions operatorspy/tests/rearrange.py
@@ -104,6 +104,17 @@ def test_ascend(lib, test_cases):
test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride)
destroy_handle(lib, handle)

def test_teco(lib, test_cases):
import torch_sdaa

device = DeviceEnum.DEVICE_TECO
handle = create_handle(lib, device)
for test_case in test_cases:
x_shape, x_stride = test_case[0]
y_shape, y_stride = test_case[1]
test(lib, handle, "sdaa", x_shape, x_stride, y_shape, y_stride)
destroy_handle(lib, handle)

if __name__ == "__main__":
args = get_args()
test_cases = [
@@ -140,3 +151,5 @@ def test_ascend(lib, test_cases):
test_bang(lib, test_cases)
if args.ascend:
test_ascend(lib, test_cases)
if args.teco:
test_teco(lib, test_cases)
11 changes: 10 additions & 1 deletion operatorspy/tests/rms_norm.py
@@ -77,7 +77,6 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float
None,
)
)

assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3)
check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor))
print("Test passed!")
@@ -104,6 +103,14 @@ def test_bang(lib, test_cases):
test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype)
destroy_handle(lib, handle)

def test_sdaa(lib, test_cases):
import torch_sdaa
device = DeviceEnum.DEVICE_TECO
handle = create_handle(lib, device)
for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
test(lib, handle, "sdaa", y_shape, x_shape, w_shape, dtype, w_dtype)
destroy_handle(lib, handle)

def test_ascend(lib, test_cases):
import torch_npu
device = DeviceEnum.DEVICE_ASCEND
@@ -158,6 +165,8 @@ def test_ascend(lib, test_cases):
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if args.teco:
test_sdaa(lib,test_cases)
if args.ascend:
test_ascend(lib, test_cases)
if not (args.cpu or args.cuda or args.bang or args.ascend):