diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 65731dd1..84108c51 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -23,6 +23,7 @@ jobs:
       - name: Install Python dependencies
         run: |
+          pip install numpy
           pip install torch
 
       - name: Install xmake
diff --git a/include/device.h b/include/device.h
index 701b6632..4f922fc4 100644
--- a/include/device.h
+++ b/include/device.h
@@ -6,6 +6,7 @@ enum DeviceEnum {
     DevNvGpu,
     DevCambriconMlu,
     DevAscendNpu,
+    DevTecoSDAA,
 };
 
 typedef enum DeviceEnum Device;
diff --git a/include/infinirt.h b/include/infinirt.h
new file mode 100644
index 00000000..ee6d4d69
--- /dev/null
+++ b/include/infinirt.h
@@ -0,0 +1,78 @@
+#ifndef INFINI_RUNTIME_H
+#define INFINI_RUNTIME_H
+
+#if defined(_WIN32)
+#define __export __declspec(dllexport)
+#elif defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#define __export __attribute__((visibility("default")))
+#else
+#define __export
+#endif
+
+#ifdef __cplusplus
+#define __C extern "C"
+#else
+#define __C
+#endif
+#include <stddef.h>
+#include <stdint.h>
+
+typedef enum
+{
+    DEVICE_CPU,
+    DEVICE_NVIDIA,
+    DEVICE_CAMBRICON,
+    DEVICE_ASCEND,
+    DEVICE_TECO,
+} DeviceType;
+
+typedef enum
+{
+    INFINIRT_STATUS_SUCCESS = 0,
+    INFINIRT_STATUS_EXECUTION_FAILED = 1,
+    INFINIRT_STATUS_BAD_DEVICE = 2,
+    INFINIRT_STATUS_DEVICE_NOT_SUPPORTED = 3,
+    INFINIRT_STATUS_DEVICE_MISMATCH = 4,
+    INFINIRT_STATUS_INVALID_ARGUMENT = 5,
+    INFINIRT_STATUS_ILLEGAL_MEMORY_ACCESS = 6,
+    INFINIRT_STATUS_NOT_READY = 7,
+} infinirtStatus_t;
+
+__C __export infinirtStatus_t infinirtInit(DeviceType device);
+
+// Device
+__C __export infinirtStatus_t infinirtDeviceSynchronize(DeviceType device, uint32_t deviceId);
+
+// Stream
+struct infinirtStream;
+typedef struct infinirtStream *infinirtStream_t;
+#define INFINIRT_NULL_STREAM nullptr
+__C __export infinirtStatus_t infinirtStreamCreate(infinirtStream_t *pStream, DeviceType device, uint32_t deviceId);
+__C __export infinirtStatus_t infinirtStreamDestroy(infinirtStream_t stream);
+__C __export infinirtStatus_t infinirtStreamSynchronize(infinirtStream_t stream);
+__C __export infinirtStatus_t infinirtGetRawStream(void **ptr, infinirtStream_t stream);
+__C __export infinirtStatus_t infinirtGetStreamDeviceInfo(DeviceType *deviceType, uint32_t *deviceId, infinirtStream_t stream);
+
+// Event
+struct infinirtEvent;
+typedef struct infinirtEvent *infinirtEvent_t;
+__C __export infinirtStatus_t infinirtEventCreate(infinirtEvent_t *pEvent, DeviceType device, uint32_t deviceId);
+__C __export infinirtStatus_t infinirtEventRecord(infinirtEvent_t event, infinirtStream_t stream);
+__C __export infinirtStatus_t infinirtEventQuery(infinirtEvent_t event);
+__C __export infinirtStatus_t infinirtEventSynchronize(infinirtEvent_t event);
+__C __export infinirtStatus_t infinirtEventDestroy(infinirtEvent_t event);
+__C __export infinirtStatus_t infinirtStreamWaitEvent(infinirtEvent_t event, infinirtStream_t stream);
+
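Note on the new runtime header: the stream/event half of the API above mirrors the usual CUDA-style lifecycle. A minimal usage sketch follows (not part of this diff; it assumes an implementation of infinirt.h is linked, and error handling is collapsed into one macro). One caveat worth flagging: `INFINIRT_NULL_STREAM` expands to `nullptr`, so despite the `__C` guards the header as written compiles only as C++.

```cpp
// Hypothetical caller of the new runtime API; a sketch, not part of the patch.
#include "infinirt.h"
#include <stdio.h>
#include <stdlib.h>

#define CHECK_INFINIRT(call)                                   \
    do {                                                       \
        infinirtStatus_t s = (call);                           \
        if (s != INFINIRT_STATUS_SUCCESS) {                    \
            fprintf(stderr, "infinirt error %d\n", (int) s);   \
            exit(EXIT_FAILURE);                                \
        }                                                      \
    } while (0)

int main() {
    CHECK_INFINIRT(infinirtInit(DEVICE_TECO));

    infinirtStream_t stream;
    infinirtEvent_t event;
    CHECK_INFINIRT(infinirtStreamCreate(&stream, DEVICE_TECO, 0));
    CHECK_INFINIRT(infinirtEventCreate(&event, DEVICE_TECO, 0));

    // ... enqueue asynchronous work on `stream` here ...

    CHECK_INFINIRT(infinirtEventRecord(event, stream));  // mark a point in the stream
    CHECK_INFINIRT(infinirtEventSynchronize(event));     // block until that point completes

    CHECK_INFINIRT(infinirtEventDestroy(event));
    CHECK_INFINIRT(infinirtStreamDestroy(stream));
    return 0;
}
```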
+// Memory
+__C __export infinirtStatus_t infinirtMalloc(void **pMemory, DeviceType device, uint32_t deviceId, size_t size);
+__C __export infinirtStatus_t infinirtMallocAsync(void **pMemory, DeviceType device, uint32_t deviceId, size_t size, infinirtStream_t stream);
+__C __export infinirtStatus_t infinirtMallocHost(void **pMemory, DeviceType device, uint32_t deviceId, size_t size);
+__C __export infinirtStatus_t infinirtFree(void *ptr, DeviceType device, uint32_t deviceId);
+__C __export infinirtStatus_t infinirtFreeAsync(void *ptr, DeviceType device, uint32_t deviceId, infinirtStream_t stream);
+__C __export infinirtStatus_t infinirtFreeHost(void *ptr, DeviceType device, uint32_t deviceId);
+__C __export infinirtStatus_t infinirtMemcpyH2D(void *dst, DeviceType device, uint32_t deviceId, const void *src, size_t size);
+__C __export infinirtStatus_t infinirtMemcpyH2DAsync(void *dst, DeviceType device, uint32_t deviceId, const void *src, size_t size, infinirtStream_t stream);
+__C __export infinirtStatus_t infinirtMemcpyD2H(void *dst, const void *src, DeviceType device, uint32_t deviceId, size_t size);
+__C __export infinirtStatus_t infinirtMemcpy(void *dst, const void *src, DeviceType device, uint32_t deviceId, size_t size);
+__C __export infinirtStatus_t infinirtMemcpyAsync(void *dst, const void *src, DeviceType device, uint32_t deviceId, size_t size, infinirtStream_t stream);
+#endif
diff --git a/operatorspy/devices.py b/operatorspy/devices.py
index 4984502a..25c3e96a 100644
--- a/operatorspy/devices.py
+++ b/operatorspy/devices.py
@@ -3,3 +3,4 @@ class DeviceEnum:
     DEVICE_CUDA = 1
     DEVICE_BANG = 2
     DEVICE_ASCEND = 3
+    DEVICE_TECO = 4
diff --git a/operatorspy/liboperators.py b/operatorspy/liboperators.py
index 868cc88d..838bec17 100644
--- a/operatorspy/liboperators.py
+++ b/operatorspy/liboperators.py
@@ -43,6 +43,7 @@ def find_library_in_ld_path(library_name):
     paths = ld_library_path.split(os.pathsep)
     for path in paths:
         full_path = os.path.join(path, library_name)
+        print(full_path)
        if os.path.isfile(full_path):
             return full_path
     return None
diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py
index a919b47d..064eead2 100644
--- a/operatorspy/tests/matmul.py
+++ b/operatorspy/tests/matmul.py
@@ -77,13 +77,15 @@ def test(
 
     ans = matmul(c, beta, a, b, alpha)
 
+
     if a_stride is not None:
         a = rearrange_tensor(a, a_stride)
     if b_stride is not None:
         b = rearrange_tensor(b, b_stride)
     if c_stride is not None:
         c = rearrange_tensor(c, c_stride)
-
+    ans = matmul(c, beta, a, b, alpha)
+
     a_tensor = to_tensor(a, lib)
     b_tensor = to_tensor(b, lib)
     c_tensor = to_tensor(c, lib)
@@ -99,7 +101,7 @@ def test(
             beta
         )
     )
-
+    print(a.stride(), b.stride(), c.stride())
     workspace_size = c_uint64(0)
     check_error(
         lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
@@ -117,8 +119,7 @@ def test(
             None,
         )
     )
-
-    assert torch.allclose(c, ans, atol=0, rtol=1e-2)
+    assert torch.allclose(c, ans, atol=0, rtol=1e-3)
 
     if PROFILE:
         for i in range(NUM_PRERUN):
@@ -157,6 +158,7 @@ def test(
         print(f" lib time: {elapsed :6f}")
 
     check_error(lib.infiniopDestroyMatmulDescriptor(descriptor))
+    print("Test passed!")
 
 
 def test_cpu(lib, test_cases):
@@ -292,6 +294,40 @@ def test_ascend(lib, test_cases):
     destroy_handle(lib, handle)
 
 
+def test_sdaa(lib, test_cases):
+    import torch_sdaa
+
+    device = DeviceEnum.DEVICE_TECO
+    handle = create_handle(lib, device)
+
+    for (
+        alpha,
+        beta,
+        a_shape,
+        b_shape,
+        c_shape,
+        a_stride,
+        b_stride,
+        c_stride,
+        dtype,
+    ) in test_cases:
+        test(
+            lib,
+            handle,
+            "sdaa",
+            alpha,
+            beta,
+            a_shape,
+            b_shape,
+            c_shape,
+            a_stride,
+            b_stride,
+            c_stride,
+            dtype,
+        )
+
+    destroy_handle(lib, handle)
+
 if __name__ == "__main__":
     test_cases = [
         # alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride, dtype
@@ -352,4 +388,6 @@ def test_ascend(lib, test_cases):
         test_ascend(lib, test_cases)
     if not (args.cpu or args.cuda or args.bang or args.ascend):
         test_cpu(lib, test_cases)
-    print("\033[92mTest passed!\033[0m")
+    if args.teco:
+        test_sdaa(lib, test_cases)
+    print("Test passed!")
diff --git a/operatorspy/tests/mlp.py b/operatorspy/tests/mlp.py
index 73b90a9d..129eb491 100644
--- a/operatorspy/tests/mlp.py
+++ b/operatorspy/tests/mlp.py
@@ -240,6 +240,37 @@ def test_bang(lib, test_cases):
     destroy_handle(lib, handle)
 
 
+def test_sdaa(lib, test_cases):
+    import torch_sdaa
+
+    device = DeviceEnum.DEVICE_TECO
+    handle = create_handle(lib, device)
+
+    for (
+        num_tokens,
+        hidden_size,
+        intermediate_size,
+        alpha,
+        residual,
+        dtype,
+        x_stride,
+        y_stride,
+    ) in test_cases:
+        test(
+            lib,
+            handle,
+            "sdaa",
+            num_tokens,
+            hidden_size,
+            intermediate_size,
+            alpha,
+            residual,
+            dtype,
+            x_stride,
+            y_stride,
+        )
+
+    destroy_handle(lib, handle)
 
 if __name__ == "__main__":
     test_cases = [
@@ -307,4 +338,6 @@ def test_bang(lib, test_cases):
         test_bang(lib, test_cases)
     if not (args.cpu or args.cuda or args.bang):
         test_cpu(lib, test_cases)
+    if args.teco:
+        test_sdaa(lib, test_cases)
     print("Test passed!")
diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py
index 795c2c1a..ea680c57 100644
--- a/operatorspy/tests/random_sample.py
+++ b/operatorspy/tests/random_sample.py
@@ -63,8 +63,6 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
     else:
         end = topk
 
-
-
     sum_s = 0
     for i in range(end):
         sum_s += dataNp[i]
@@ -78,12 +76,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
 def random_sample_0(data):
     return torch.argmax(data)
 
+
 def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16):
     print(
         f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}"
     )
-
-    data = torch.rand((voc), dtype=x_dtype).to(torch_device)
+    data = torch.arange(voc).float() * 0.0001
+    _perm = torch.randperm(voc)
+    data = data[_perm].to(x_dtype).to(torch_device)
     if(topp > 0 and topk > 1):
         ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu")
     else:
@@ -130,12 +130,9 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
     if torch_device == "npu":
         torch.npu.synchronize()
 
-    assert indices[0].type(ans.dtype) == ans or abs(data[indices[0]] - data[ans]) == 0.0, "compute error"
-
-
-
+    assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]]
     check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor))
-
+    print("Test passed!")
 
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
@@ -176,15 +173,16 @@ def test_ascend(lib, test_cases):
 if __name__ == "__main__":
     test_cases = [
         # voc, random_val, topp, topk, temperature
-        (512, 0.92, 0.8, 3, 0.5),
-        (4096, 0.95, 0.9, 5, 1.0),
-        (16384, 0.85, 0.85, 10, 2.0),
-        (512, 0.92, 0, 3, 0.5),
-        (4096, 0.95, 0.9, 1, 1.0),
-        (16384, 0.85, 0, 1, 2.0),
-        (16384, 0.85, 0, 1, 2.0),
-        (32000, 0.8, 0.8, 50, 1.0),
-        (32000, 0.8, 1.0, 25, 1.0),
+        (512, 0.8, 0.8, 3, 0.5),
+        (4096, 0.05, 0.9, 5, 1.0),
+        (16384, 0.15, 0.85, 10, 2.0),
+        (512, 0.08, 0, 3, 0.5),
+        (4096, 0.5, 0.9, 1, 1.0),
+        (16384, 0.15, 0, 1, 2.0),
+        (16384, 0.15, 0, 1, 2.0),
+        (32000, 0.08, 0.8, 50, 1.0),
+        (32000, 0.08, 1.0, 25, 1.0),
+        # (119696, 0.01, 1.0, 100, 1.0),
     ]
     args = get_args()
@@ -228,4 +226,4 @@ def test_ascend(lib, test_cases):
         test_ascend(lib, test_cases)
     if not (args.cpu or args.cuda or args.bang or args.ascend):
         test_cpu(lib, test_cases)
-    print("Test passed!")
+    print("\033[92mTest passed!\033[0m")
diff --git a/operatorspy/tests/rearrange.py b/operatorspy/tests/rearrange.py
index 005b9d95..1e8cf504 100644
--- a/operatorspy/tests/rearrange.py
+++ b/operatorspy/tests/rearrange.py
@@ -104,6 +104,17 @@ def test_ascend(lib, test_cases):
         test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride)
     destroy_handle(lib, handle)
 
+def test_teco(lib, test_cases):
+    import torch_sdaa
+
+    device = DeviceEnum.DEVICE_TECO
+    handle = create_handle(lib, device)
+    for test_case in test_cases:
+        x_shape, x_stride = test_case[0]
+        y_shape, y_stride = test_case[1]
+        test(lib, handle, "sdaa", x_shape, x_stride, y_shape, y_stride)
+    destroy_handle(lib, handle)
+
 if __name__ == "__main__":
     args = get_args()
     test_cases = [
@@ -140,3 +151,5 @@ def test_ascend(lib, test_cases):
         test_bang(lib, test_cases)
     if args.ascend:
         test_ascend(lib, test_cases)
+    if args.teco:
+        test_teco(lib, test_cases)
diff --git a/operatorspy/tests/rms_norm.py b/operatorspy/tests/rms_norm.py
index 2241e745..53d774a1 100644
--- a/operatorspy/tests/rms_norm.py
+++ b/operatorspy/tests/rms_norm.py
@@ -77,7 +77,6 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float
             None,
         )
     )
-
     assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3)
     check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor))
     print("Test passed!")
@@ -104,6 +103,14 @@ def test_bang(lib, test_cases):
         test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype)
     destroy_handle(lib, handle)
 
+def test_sdaa(lib, test_cases):
+    import torch_sdaa
+    device = DeviceEnum.DEVICE_TECO
+    handle = create_handle(lib, device)
+    for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
+        test(lib, handle, "sdaa", y_shape, x_shape, w_shape, dtype, w_dtype)
+    destroy_handle(lib, handle)
+
 def test_ascend(lib, test_cases):
     import torch_npu
     device = DeviceEnum.DEVICE_ASCEND
@@ -158,6 +165,8 @@ def test_ascend(lib, test_cases):
         test_cuda(lib, test_cases)
     if args.bang:
         test_bang(lib, test_cases)
+    if args.teco:
+        test_sdaa(lib, test_cases)
     if args.ascend:
         test_ascend(lib, test_cases)
     if not (args.cpu or args.cuda or args.bang or args.ascend):
diff --git a/operatorspy/tests/swiglu.py b/operatorspy/tests/swiglu.py
index 57e4e3b9..6baf7358 100644
--- a/operatorspy/tests/swiglu.py
+++ b/operatorspy/tests/swiglu.py
@@ -79,7 +79,6 @@ def test_out_of_place(
             descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
         )
     )
-
     assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2)
     print("out-of-place Test passed!")
 
@@ -125,7 +124,6 @@ def test_in_place1(
             descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None
         )
    )
-
     assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2)
     print("in-place1 Test passed!")
 
@@ -234,6 +232,19 @@ def test_ascend(lib, test_cases):
         test_in_place2(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize)
     destroy_handle(lib, handle)
 
+def test_teco(lib, test_cases):
+    import torch_sdaa
+    device = DeviceEnum.DEVICE_TECO
+    handle = create_handle(lib, device)
+
+    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
+        test_out_of_place(
+            lib, handle, "sdaa", shape, a_stride, b_stride, c_stride, dtype
+        )
+        test_in_place1(lib, handle, "sdaa", shape, a_stride, b_stride, dtype)
+        test_in_place2(lib, handle, "sdaa", shape, a_stride, b_stride, dtype)
+
+    destroy_handle(lib, handle)
 
 if __name__ == "__main__":
@@ -278,3 +289,5 @@ def test_ascend(lib, test_cases):
         test_bang(lib, test_cases)
     if args.ascend:
         test_ascend(lib, test_cases)
+    if args.teco:
+        test_teco(lib, test_cases)
diff --git a/operatorspy/tests/test_utils.py b/operatorspy/tests/test_utils.py
index a00a91ec..471f2326 100644
--- a/operatorspy/tests/test_utils.py
+++ b/operatorspy/tests/test_utils.py
@@ -22,5 +22,10 @@ def get_args():
         action="store_true",
         help="Run ASCEND NPU test",
     )
+    parser.add_argument(
+        "--teco",
+        action="store_true",
+        help="Run TECO SDAA test",
+    )
 
     return parser.parse_args()
diff --git a/src/devices/handle.cc b/src/devices/handle.cc
index 97126a9d..ef56b2f8 100644
--- a/src/devices/handle.cc
+++ b/src/devices/handle.cc
@@ -11,7 +11,9 @@
 #ifdef ENABLE_ASCEND_NPU
 #include "./ascend/ascend_handle.h"
 #endif
-
+#ifdef ENABLE_TECO_SDAA
+#include "./teco/teco_handle.h"
+#endif
 
 __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id) {
     if (handle_ptr == nullptr) {
@@ -40,6 +42,11 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device d
         case DevAscendNpu: {
             return createAscendHandle((AscendHandle_t *) handle_ptr, device_id);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return createTecoHandle((TecoHandle_t *) handle_ptr, device_id);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -68,6 +75,11 @@ __C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
         case DevAscendNpu: {
             return deleteAscendHandle((AscendHandle_t) handle);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return deleteTecoHandle((TecoHandle_t) handle);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
diff --git a/src/devices/teco/common_teco.cc b/src/devices/teco/common_teco.cc
new file mode 100644
index 00000000..2de5722d
--- /dev/null
+++ b/src/devices/teco/common_teco.cc
@@ -0,0 +1,96 @@
+#include "common_teco.h"
+
+void const **convertToBatch(void const *data, int batch, int stride, size_t typeSize) {
+    void const **output = (void const **) malloc(batch * sizeof(void const *));
+    if (output == NULL) {
+        return NULL;
+    }
+
+    const uint8_t *charData = (const uint8_t *) data;
+
+    for (int i = 0; i < batch; i++) {
+        output[i] = (const void *) (charData + i * stride * typeSize);
+    }
+
+    return output;
+}
+
+bool is_contiguous(MatrixInfo desc) {
+    if (desc.ei != 1) {
+        return false;
+    } else
+        return true;
+}
+
+infiniopStatus_t restoreTensor(MatrixInfo desc, void *data, tecodnnDataType_t datatype) {
+    tecodnnHandle_t tecodnn_handle;
+    tecodnnCreate(&tecodnn_handle);
+    tecodnnTensorDescriptor_t src, dst;
+    tecodnnCreateTensorDescriptor(&src);
+    tecodnnCreateTensorDescriptor(&dst);
+    int *dst_strides = new int[desc.ndim];
+    int *src_strides = new int[desc.ndim];
+    int *shape = new int[desc.ndim];
+    dst_strides[0] = desc.cols;
+    dst_strides[1] = 1;
+    src_strides[0] = desc.ld;
+    src_strides[1] = desc.ei;
+    shape[0] = desc.rows;
+    shape[1] = desc.cols;
+    size_t size = shape[1] * shape[0];
+    if (datatype == TECODNN_DATA_HALF)
+        size *= sizeof(uint16_t);
+    else
+        size *= sizeof(uint32_t);
+    void *temp;
+    sdaaMalloc(&temp, size);
+    tecodnnSetTensorNdDescriptor(src, datatype, desc.ndim, shape, dst_strides);
+    tecodnnSetTensorNdDescriptor(dst, datatype, desc.ndim, shape, src_strides);
+    tecodnnCopyStride(tecodnn_handle, src, data, dst, temp);
+    sdaaMemcpy(data, temp, size, sdaaMemcpyDeviceToDevice);
+    sdaaFree(temp);
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t toContiguous(MatrixInfo desc, void *data, tecodnnDataType_t datatype) {
+    tecodnnHandle_t tecodnn_handle;
+    tecodnnCreate(&tecodnn_handle);
+    tecodnnTensorDescriptor_t src, dst;
+    tecodnnCreateTensorDescriptor(&src);
+    tecodnnCreateTensorDescriptor(&dst);
+    int *dst_strides = new int[desc.ndim];
+    int *src_strides = new int[desc.ndim];
+    int *shape = new int[desc.ndim];
+    dst_strides[0] = desc.cols;
+    dst_strides[1] = 1;
+    src_strides[0] = desc.ld;
+    src_strides[1] = desc.ei;
+    shape[0] = desc.rows;
+    shape[1] = desc.cols;
+    size_t size = shape[1] * shape[0];
+    if (datatype == TECODNN_DATA_HALF) {
+        size *= sizeof(uint16_t);
+    } else {
+        size *= sizeof(uint32_t);
+    }
+    void *temp;
+    sdaaMalloc(&temp, size);
+    tecodnnSetTensorNdDescriptor(src, datatype, desc.ndim, shape, src_strides);
+    tecodnnSetTensorNdDescriptor(dst, datatype, desc.ndim, shape, dst_strides);
+    tecodnnCopyStride(tecodnn_handle, src, data, dst, temp);
+    sdaaMemcpy(data, temp, size, sdaaMemcpyDeviceToDevice);
+    sdaaFree(temp);
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t toTecodnnTensorDescriptor(infiniopTensorDescriptor_t src, tecodnnTensorDescriptor_t des) {
+    tecodnnDataType_t data_type;
+    if (src->dt == F16)
+        data_type = TECODNN_DATA_HALF;
+    else
+        return STATUS_BAD_TENSOR_DTYPE;
+    tecodnnSetTensor4dDescriptor(des, TECODNN_TENSOR_NCHW, data_type, src->shape[0], src->shape[1], 1, 1);
+    return STATUS_SUCCESS;
+}
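The pointer-array layout that `convertToBatch` produces is easiest to see on the host. A minimal, SDK-free sketch (the helper name `convert_to_batch`, the float payload, and the shapes are illustrative, not from the patch):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdlib>

// Same contract as convertToBatch above: element i points to the start of
// batch i, where consecutive batches are `stride` elements apart.
void const **convert_to_batch(void const *data, int batch, int stride, size_t type_size) {
    void const **out = (void const **) malloc(batch * sizeof(void const *));
    if (out == NULL) return NULL;
    const uint8_t *bytes = (const uint8_t *) data;
    for (int i = 0; i < batch; i++)
        out[i] = bytes + (size_t) i * stride * type_size;
    return out;
}

int main() {
    float data[2 * 3 * 4] = {};  // 2 batches of a 3x4 matrix
    void const **ptrs = convert_to_batch(data, /*batch=*/2, /*stride=*/12, sizeof(float));
    assert(ptrs[1] == (void const *) (data + 12));  // batch 1 starts 12 floats in
    free(ptrs);
    return 0;
}
```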
diff --git a/src/devices/teco/common_teco.h b/src/devices/teco/common_teco.h
new file mode 100644
index 00000000..ba2d8ed2
--- /dev/null
+++ b/src/devices/teco/common_teco.h
@@ -0,0 +1,65 @@
+#ifndef _COMMON_TECO_
+#define _COMMON_TECO_
+
+#include <sdaa_runtime.h>
+#include <tecoblas.h>
+#include <tecodnn.h>
+#include <cstdio>
+#include <cstdlib>
+#include "device.h"
+#include "operators.h"
+#include <cstdint>
+#define CHECK_TECOBLAS(expression)                                                               \
+    {                                                                                            \
+        tecoblasStatus_t status = (expression);                                                  \
+        if (status != TECOBLAS_STATUS_SUCCESS) {                                                 \
+            fprintf(stderr, "Error at line %d: %s\n", __LINE__, tecoblasGetErrorString(status)); \
+            exit(EXIT_FAILURE);                                                                  \
+        }                                                                                        \
+    }
+
+typedef struct MatrixInfo {
+    int ndim;
+    int batch;
+    int64_t stride;
+    int rows;
+    int cols;
+    int ld;
+    int ei;
+
+    MatrixInfo() {}
+
+    MatrixInfo(infiniopTensorDescriptor_t layout, infiniopStatus_t *status) {
+        if (layout->ndim == 2) {
+            this->ndim = 2;
+            this->batch = 1;
+            this->stride = 0;
+            this->rows = layout->shape[0];
+            this->cols = layout->shape[1];
+            this->ld = layout->strides[0];
+            this->ei = layout->strides[1];
+        } else if (layout->ndim == 3) {
+            this->ndim = 3;
+            this->batch = layout->shape[0];
+            this->stride = this->batch == 1 ? 0 : layout->strides[0];
+            this->rows = layout->shape[1];
+            this->cols = layout->shape[2];
+            this->ld = layout->strides[1];
+            this->ei = layout->strides[2];
+        } else {
+            *status = STATUS_BAD_TENSOR_SHAPE;
+            return;
+        }
+
+        *status = STATUS_SUCCESS;
+    }
+
+} MatrixInfo;
+
+void const **convertToBatch(void const *data, int batch, int stride, size_t typeSize);
+bool is_contiguous(MatrixInfo desc);
+infiniopStatus_t toContiguous(MatrixInfo desc, void *data, tecodnnDataType_t datatype);
+infiniopStatus_t restoreTensor(MatrixInfo desc, void *data, tecodnnDataType_t datatype);
+
+infiniopStatus_t toTecodnnTensorDescriptor(infiniopTensorDescriptor_t src, tecodnnTensorDescriptor_t des);
+
+#endif
\ No newline at end of file
diff --git a/src/devices/teco/teco_handle.cc b/src/devices/teco/teco_handle.cc
new file mode 100644
index 00000000..c816808d
--- /dev/null
+++ b/src/devices/teco/teco_handle.cc
@@ -0,0 +1,22 @@
+#include "teco_handle.h"
+
+infiniopStatus_t createTecoHandle(TecoHandle_t *handle_ptr, int device_id) {
+    uint32_t device_count;
+    sdaaGetDeviceCount(reinterpret_cast<int *>(&device_count));
+    if (device_id >= static_cast<int>(device_count)) {
+        return STATUS_BAD_DEVICE;
+    }
+
+    sdaaSetDevice(device_id);
+    sdaaStream_t stream;
+    sdaaStreamCreate(&stream);
+    *handle_ptr = new TecoContext{DevTecoSDAA, device_id, stream};
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t deleteTecoHandle(TecoHandle_t handle_ptr) {
+    sdaaStreamDestroy(handle_ptr->stream);
+    delete handle_ptr;
+    return STATUS_SUCCESS;
+}
diff --git a/src/devices/teco/teco_handle.h b/src/devices/teco/teco_handle.h
new file mode 100644
index 00000000..43b0794e
--- /dev/null
+++ b/src/devices/teco/teco_handle.h
@@ -0,0 +1,17 @@
+#ifndef __TECO_HANDLE__
+#define __TECO_HANDLE__
+#include "common_teco.h"
+#include "status.h"
+#include "../pool.h"
+struct TecoContext {
+    Device device;
+    int device_id;
+    sdaaStream_t stream;
+};
+typedef struct TecoContext *TecoHandle_t;
+
+infiniopStatus_t createTecoHandle(TecoHandle_t *handle_ptr, int device_id);
+
+infiniopStatus_t deleteTecoHandle(TecoHandle_t handle_ptr);
+
+#endif
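createTecoHandle pairs a device check with stream creation, and the caller owns the context until deleteTecoHandle. A hypothetical RAII wrapper (not part of this patch, using only the functions declared above) makes that ownership explicit:

```cpp
#include "teco_handle.h"

// Hypothetical convenience wrapper around the create/delete pair above;
// shown only to document the intended ownership of TecoContext.
class ScopedTecoHandle {
    TecoHandle_t handle_ = nullptr;

public:
    explicit ScopedTecoHandle(int device_id) {
        if (createTecoHandle(&handle_, device_id) != STATUS_SUCCESS)
            handle_ = nullptr;
    }
    ~ScopedTecoHandle() {
        if (handle_) deleteTecoHandle(handle_);  // also destroys the owned stream
    }
    TecoHandle_t get() const { return handle_; }
    ScopedTecoHandle(const ScopedTecoHandle &) = delete;
    ScopedTecoHandle &operator=(const ScopedTecoHandle &) = delete;
};
```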
diff --git a/src/ops/add/teco/add_tecodnn.cpp b/src/ops/add/teco/add_tecodnn.cpp
new file mode 100644
index 00000000..0ca25c3d
--- /dev/null
+++ b/src/ops/add/teco/add_tecodnn.cpp
@@ -0,0 +1,18 @@
+#include "add_tecodnn.h"
+
+infiniopStatus_t tecoCreateAddDescriptor(TecoHandle_t handle, AddTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
+    return STATUS_SUCCESS;
+}
+
+template <typename Tdata>
+infiniopStatus_t add_teco(AddTecoDescriptor_t desc, void *c, void const *a, void const *b) {
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoAdd(AddTecoDescriptor_t desc, void *c, const void *a, const void *b, void *stream) {
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoDestroyAddDescriptor(AddTecoDescriptor_t desc) {
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/add/teco/add_tecodnn.h b/src/ops/add/teco/add_tecodnn.h
new file mode 100644
index 00000000..0b7a729c
--- /dev/null
+++ b/src/ops/add/teco/add_tecodnn.h
@@ -0,0 +1,40 @@
+#ifndef __TECO_ADD_H__
+#define __TECO_ADD_H__
+
+#include "operators.h"
+#include <sdaa_runtime.h>
+#include <tecoblas.h>
+#include <tecodnn.h>
+#include "../../../devices/teco/teco_handle.h"
+
+struct AddTecoDescriptor {
+    Device device;
+    int device_id;
+    tecodnnHandle_t handle;
+    sdaaStream_t stream;
+    tecoblasOperation_t transa, transb;
+    int m, n, k;
+    float alpha, beta;
+    int lda, ldb, ldc;
+    int batch;
+    long long int strideA, strideB, strideC;
+};
+
+typedef struct AddTecoDescriptor *AddTecoDescriptor_t;
+
+
+infiniopStatus_t tecoCreateAddDescriptor(TecoHandle_t handle,
+                                         AddTecoDescriptor_t *desc_ptr,
+                                         infiniopTensorDescriptor_t c_desc,
+                                         infiniopTensorDescriptor_t a_desc,
+                                         infiniopTensorDescriptor_t b_desc);
+
+infiniopStatus_t tecoAdd(AddTecoDescriptor_t desc,
+                         void *c,
+                         const void *a,
+                         const void *b,
+                         void *stream);
+
+infiniopStatus_t tecoDestroyAddDescriptor(AddTecoDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/src/ops/matmul/operator.cc b/src/ops/matmul/operator.cc
index 444168b6..52a99e81 100644
--- a/src/ops/matmul/operator.cc
+++ b/src/ops/matmul/operator.cc
@@ -14,6 +14,9 @@
 #ifdef ENABLE_ASCEND_NPU
 #include "ascend/matmul_aclnn.h"
 #endif
+#ifdef ENABLE_TECO_SDAA
+#include "teco/matmul_tecoblas.h"
+#endif
 
 __C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle,
                                                     infiniopMatmulDescriptor_t *desc_ptr,
@@ -48,6 +51,17 @@ __C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle,
                                               beta,
                                               1);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoCreateMatmulDescriptor((TecoHandle_t) handle,
+                                              (MatmulTecoDescriptor_t *) desc_ptr,
+                                              c_desc,
+                                              alpha,
+                                              a_desc,
+                                              b_desc,
+                                              beta);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -75,8 +89,15 @@ __C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t d
             return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t) desc,
                                                size);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoGetMatmulWorkspaceSize((MatmulTecoDescriptor_t) desc,
+                                              size);
+        }
 #endif
     }
+
     return STATUS_BAD_DEVICE;
 }
 
@@ -104,6 +125,17 @@ __C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, void *works
                            a,
                            b,
                            stream);
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoMatmul((MatmulTecoDescriptor_t) desc,
+                              workspace,
+                              workspace_size,
+                              c,
+                              a,
+                              b,
+                              stream);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -130,6 +162,11 @@ __C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t
         case DevAscendNpu: {
             return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t) desc);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoDestroyMatmulDescriptor((MatmulTecoDescriptor_t) desc);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
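The matmul descriptor constructor in the next file infers the BLAS operation and leading dimension from row-major strides. The rule it applies, extracted into a standalone hypothetical helper (`classify` is not in the patch; it just restates the branch logic below):

```cpp
#include <cstdint>
#include <optional>

enum class Op { N, T };  // stands in for TECOBLAS_OP_N / TECOBLAS_OP_T

struct Layout2D { Op trans; int64_t ld; };

// Mirrors the classification in tecoCreateMatmulDescriptor: a row-contiguous
// matrix (inner stride 1) is passed as-is with ld = row stride; a
// column-contiguous one is passed transposed with ld = column stride;
// anything else is rejected (STATUS_BAD_TENSOR_SHAPE in the patch).
std::optional<Layout2D> classify(int64_t rows, int64_t cols,
                                 int64_t stride_row, int64_t stride_col) {
    if (stride_col == 1 && stride_row >= cols) return Layout2D{Op::N, stride_row};
    if (stride_row == 1 && stride_col >= rows) return Layout2D{Op::T, stride_col};
    return std::nullopt;
}
```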
diff --git a/src/ops/matmul/teco/matmul_tecoblas.cc b/src/ops/matmul/teco/matmul_tecoblas.cc
new file mode 100644
index 00000000..b1b1d9ad
--- /dev/null
+++ b/src/ops/matmul/teco/matmul_tecoblas.cc
@@ -0,0 +1,166 @@
+#include "matmul_tecoblas.h"
+
+infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, float alpha, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, float beta) {
+    infiniopStatus_t status = STATUS_SUCCESS;
+    tecoblasDataType_t datatype;
+    tecoblasOperation_t transA, transB, transC;
+    uint64_t m, k, n;
+    long long int lda, ldb, ldc;
+    long long int batch, batch_count;
+    long int strideA = 1, strideB = 1, strideC = 1;
+    if (a_desc->ndim == 2 && b_desc->ndim == 2) {
+        batch = 0;
+        batch_count = 1;
+    } else if (a_desc->ndim == 3 && b_desc->ndim == 3) {
+        batch = 1;
+        batch_count = a_desc->shape[0];
+        strideA = a_desc->strides[0];
+        strideB = b_desc->strides[0];
+        strideC = c_desc->strides[0];
+    } else {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    /*MatrixA*/
+    if (a_desc->strides[1 + batch] == 1 && (uint64_t) a_desc->strides[0 + batch] >= a_desc->shape[1 + batch]) {
+        transA = TECOBLAS_OP_N;
+        m = a_desc->shape[0 + batch];
+        k = a_desc->shape[1 + batch];
+        lda = a_desc->strides[0 + batch];
+    } else if (a_desc->strides[0 + batch] == 1 && (uint64_t) a_desc->strides[1 + batch] >= a_desc->shape[0 + batch]) {
+        transA = TECOBLAS_OP_T;
+        m = a_desc->shape[0 + batch];
+        k = a_desc->shape[1 + batch];
+        lda = a_desc->strides[1 + batch];
+    } else {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    /*MatrixB*/
+    if (b_desc->strides[1 + batch] == 1 && (uint64_t) b_desc->strides[0 + batch] >= b_desc->shape[1 + batch]) {
+        transB = TECOBLAS_OP_N;
+        k = b_desc->shape[0 + batch];
+        n = b_desc->shape[1 + batch];
+        ldb = b_desc->strides[0 + batch];
+    } else if (b_desc->strides[0 + batch] == 1 && (uint64_t) b_desc->strides[1 + batch] >= b_desc->shape[0 + batch]) {
+        transB = TECOBLAS_OP_T;
+        k = b_desc->shape[0 + batch];
+        n = b_desc->shape[1 + batch];
+        ldb = b_desc->strides[1 + batch];
+    } else {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    /*MatrixC*/
+    if (c_desc->strides[1 + batch] == 1 && (uint64_t) c_desc->strides[0 + batch] >= c_desc->shape[1 + batch]) {
+        transC = TECOBLAS_OP_N;
+        m = c_desc->shape[0 + batch];
+        n = c_desc->shape[1 + batch];
+        ldc = c_desc->strides[0 + batch];
+    } else if (c_desc->strides[0 + batch] == 1 && (uint64_t) c_desc->strides[1 + batch] >= c_desc->shape[0 + batch]) {
+        transC = TECOBLAS_OP_T;
+        m = c_desc->shape[0 + batch];
+        n = c_desc->shape[1 + batch];
+        ldc = c_desc->strides[1 + batch];
+    } else {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    if (a_desc->dt == F16 && b_desc->dt == F16) {
+        datatype = TECOBLAS_DATA_HALF;
+    } else if (a_desc->dt == F32 && b_desc->dt == F32) {
+        datatype = TECOBLAS_DATA_FLOAT;
+    } else {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    tecoblasHandle_t tecoblas_handle;
+    tecoblasCreate(&tecoblas_handle);
+
+    *desc_ptr = new MatmulTecoDescriptor{
+        handle->device,
+        handle->device_id,
+        tecoblas_handle,
+        handle->stream,
+        datatype,
+        transA,
+        transB,
+        transC,
+        m,
+        k,
+        n,
+        alpha,
+        beta,
+        lda,
+        ldb,
+        ldc,
+        batch,
+        batch_count,
+        strideA,
+        strideB,
+        strideC,
+    };
+    tecoblasSetStream((*desc_ptr)->handle, (*desc_ptr)->stream);
+
+    return status;
+}
+
+infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, uint64_t *size) {
+    tecoblasAPIName_t apiName;
+    if (desc->batch == 0) {
+        if (desc->datatype == TECOBLAS_DATA_HALF)
+            apiName = TECOBLAS_HGEMM;
+        else
+            apiName = TECOBLAS_SGEMM;
+    } else {
+        if (desc->datatype == TECOBLAS_DATA_HALF)
+            apiName = TECOBLAS_HGEMM_STRIDED_BATCHED;
+        else
+            apiName = TECOBLAS_SGEMM_STRIDED_BATCHED;
+    }
+    CHECK_TECOBLAS(tecoblasGetWorkspaceSize(
+        desc->handle,
+        desc->transa,
+        desc->transb,
+        desc->m,
+        desc->n,
+        desc->k,
+        desc->alpha,
+        desc->datatype,
+        desc->lda,
+        desc->strideA,
+        desc->datatype,
+        desc->ldb,
+        desc->strideB,
+        desc->beta,
+        desc->datatype,
+        desc->ldc,
+        desc->strideC,
+        desc->batch_count,
+        apiName,
+        reinterpret_cast<size_t *>(size)))
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoMatmul(MatmulTecoDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, const void *a, const void *b, void *stream) {
+    tecoblasSetStream(desc->handle, desc->stream);
+    tecoblasSetWorkspace(desc->handle, workspace, workspace_size);
+    if (desc->batch == 0) {
+        if (desc->datatype == TECOBLAS_DATA_HALF)
+            CHECK_TECOBLAS(tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->ldb, desc->beta, c, desc->ldc))
+        else
+            CHECK_TECOBLAS(tecoblasSgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->ldb, desc->beta, c, desc->ldc))
+    } else {
+        if (desc->datatype == TECOBLAS_DATA_HALF)
+            CHECK_TECOBLAS(tecoblasHgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, desc->strideA, b, desc->ldb, desc->strideB, desc->beta, c, desc->ldc, desc->strideC, desc->batch_count))
+        else
+            CHECK_TECOBLAS(tecoblasSgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, desc->strideA, b, desc->ldb, desc->strideB, desc->beta, c, desc->ldc, desc->strideC, desc->batch_count))
+    }
+    sdaaStreamSynchronize(desc->stream);
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoDestroyMatmulDescriptor(MatmulTecoDescriptor_t desc) {
+    return STATUS_SUCCESS;
+}
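Sketch of the call sequence the file above assumes (error handling and tensor-descriptor construction elided; the allocator choice is illustrative). Note that tecoMatmul synchronizes its own stream before returning, which is why freeing the workspace immediately afterwards is safe:

```cpp
#include "matmul_tecoblas.h"

// Hypothetical driver; `handle` comes from createTecoHandle.
infiniopStatus_t run_matmul(TecoHandle_t handle,
                            infiniopTensorDescriptor_t c_desc,
                            infiniopTensorDescriptor_t a_desc,
                            infiniopTensorDescriptor_t b_desc,
                            void *c, const void *a, const void *b) {
    MatmulTecoDescriptor_t desc;
    infiniopStatus_t s = tecoCreateMatmulDescriptor(handle, &desc, c_desc, 1.0f, a_desc, b_desc, 0.0f);
    if (s != STATUS_SUCCESS) return s;

    uint64_t workspace_size = 0;
    tecoGetMatmulWorkspaceSize(desc, &workspace_size);  // must precede tecoMatmul

    void *workspace = nullptr;
    sdaaMalloc(&workspace, workspace_size);             // device-side scratch for tecoblas

    s = tecoMatmul(desc, workspace, workspace_size, c, a, b, /*stream=*/nullptr);

    sdaaFree(workspace);
    tecoDestroyMatmulDescriptor(desc);
    return s;
}
```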
diff --git a/src/ops/matmul/teco/matmul_tecoblas.h b/src/ops/matmul/teco/matmul_tecoblas.h
new file mode 100644
index 00000000..2b5ec78c
--- /dev/null
+++ b/src/ops/matmul/teco/matmul_tecoblas.h
@@ -0,0 +1,44 @@
+#ifndef __TECO_MATMUL_H__
+#define __TECO_MATMUL_H__
+#include "operators.h"
+#include <sdaa_runtime.h>
+#include <tecoblas.h>
+#include "../../../devices/teco/teco_handle.h"
+struct MatmulTecoDescriptor {
+    Device device;
+    int device_id;
+    tecoblasHandle_t handle;
+    sdaaStream_t stream;
+    tecoblasDataType_t datatype;
+    tecoblasOperation_t transa, transb, transc;
+    uint64_t m, k, n;
+    float alpha, beta;
+    long long int lda, ldb, ldc;
+    long long int batch, batch_count;
+    long int strideA, strideB, strideC;
+};
+
+typedef struct MatmulTecoDescriptor *MatmulTecoDescriptor_t;
+
+infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle,
+                                            MatmulTecoDescriptor_t *desc_ptr,
+                                            infiniopTensorDescriptor_t c_desc,
+                                            float alpha,
+                                            infiniopTensorDescriptor_t a_desc,
+                                            infiniopTensorDescriptor_t b_desc,
+                                            float beta);
+
+infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc,
+                                            uint64_t *size);
+
+infiniopStatus_t tecoMatmul(MatmulTecoDescriptor_t desc,
+                            void *workspace,
+                            uint64_t workspace_size,
+                            void *c,
+                            const void *a,
+                            const void *b,
+                            void *stream);
+
+infiniopStatus_t tecoDestroyMatmulDescriptor(MatmulTecoDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/src/ops/rearrange/operator.cc b/src/ops/rearrange/operator.cc
index a1084d48..f0aef0af 100644
--- a/src/ops/rearrange/operator.cc
+++ b/src/ops/rearrange/operator.cc
@@ -17,6 +17,9 @@
 #ifdef ENABLE_ASCEND_NPU
 #include "ascend/rearrange_aclnn.h"
 #endif
+#ifdef ENABLE_TECO_SDAA
+#include "teco/rearrange_tecodnn.h"
+#endif
 
 __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
     infiniopHandle_t handle,
@@ -46,6 +49,13 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
                                                  dst,
                                                  src);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA:
+            return tecoCreateRearrangeDescriptor((TecoHandle_t) handle,
+                                                 (RearrangeTecoDescriptor_t *) desc_ptr,
+                                                 dst,
+                                                 src);
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -75,6 +85,14 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void
                                   src,
                                   stream);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoRearrange((RearrangeTecoDescriptor_t) desc,
+                                 dst,
+                                 src,
+                                 stream);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -101,6 +119,11 @@ __C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescrip
         case DevAscendNpu: {
             return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t) desc);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoDestroyRearrangeDescriptor((RearrangeTecoDescriptor_t) desc);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
diff --git a/src/ops/rearrange/teco/rearrange_tecodnn.cc b/src/ops/rearrange/teco/rearrange_tecodnn.cc
new file mode 100644
index 00000000..00abbc2a
--- /dev/null
+++ b/src/ops/rearrange/teco/rearrange_tecodnn.cc
@@ -0,0 +1,51 @@
+#include "rearrange_tecodnn.h"
+
+infiniopStatus_t tecoCreateRearrangeDescriptor(TecoHandle_t handle, RearrangeTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) {
+    tecodnnHandle_t tecodnn_handle;
+    tecodnnCreate(&tecodnn_handle);
+
+    tecodnnTensorDescriptor_t srcDesc, dstDesc;
+    tecodnnCreateTensorDescriptor(&srcDesc);
+    tecodnnCreateTensorDescriptor(&dstDesc);
+
+    int nbDims = dst->ndim;
+
+    int *shape = new int[nbDims];
+    int *src_strides = new int[nbDims];
+    int *dst_strides = new int[nbDims];
+    for (size_t i = 0; i < (size_t) nbDims; i++) {
+        shape[i] = dst->shape[i];
+        src_strides[i] = src->strides[i];
+        dst_strides[i] = dst->strides[i];
+    }
+
+    tecodnnSetTensorNdDescriptor(srcDesc, TECODNN_DATA_HALF, nbDims, shape, src_strides);
+    tecodnnSetTensorNdDescriptor(dstDesc, TECODNN_DATA_HALF, nbDims, shape, dst_strides);
+
+
+    *desc_ptr = new RearrangeTecoDescriptor{
+        DevTecoSDAA,
+        handle->device_id,
+        handle->stream,
+        tecodnn_handle,
+        nbDims,
+        shape,
+        src_strides,
+        dst_strides,
+        srcDesc,
+        dstDesc,
+    };
+
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoRearrange(RearrangeTecoDescriptor_t desc, void *dst, void const *src, void *stream) {
+    tecodnnCopyStride(desc->handle, desc->srcDesc, src, desc->dstDesc, dst);
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoDestroyRearrangeDescriptor(RearrangeTecoDescriptor_t desc) {
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/rearrange/teco/rearrange_tecodnn.h b/src/ops/rearrange/teco/rearrange_tecodnn.h
new file mode 100644
index 00000000..2df2b073
--- /dev/null
+++ b/src/ops/rearrange/teco/rearrange_tecodnn.h
@@ -0,0 +1,33 @@
+#ifndef __TECO_REARRANGE_H__
+#define __TECO_REARRANGE_H__
+
+#include "operators.h"
+#include <sdaa_runtime.h>
+#include <tecodnn.h>
+#include "../../../devices/teco/teco_handle.h"
+struct RearrangeTecoDescriptor {
+    Device device;
+    int device_id;
+    sdaaStream_t stream;
+    tecodnnHandle_t handle;
+    int nbDims;
+    int *shape, *src_strides, *dst_strides;
+    tecodnnTensorDescriptor_t srcDesc, dstDesc;
+};
+
+typedef struct RearrangeTecoDescriptor *RearrangeTecoDescriptor_t;
+
+infiniopStatus_t tecoCreateRearrangeDescriptor(TecoHandle_t handle,
+                                               RearrangeTecoDescriptor_t *desc_ptr,
+                                               infiniopTensorDescriptor_t dst,
+                                               infiniopTensorDescriptor_t src);
+
+infiniopStatus_t tecoRearrange(RearrangeTecoDescriptor_t desc,
+                               void *dst,
+                               void const *src,
+                               void *stream);
+
+infiniopStatus_t tecoDestroyRearrangeDescriptor(RearrangeTecoDescriptor_t desc);
+
+#endif
\ No newline at end of file
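For reference, a host-side sketch of the strided copy that tecodnnCopyStride is asked to perform above (not part of the patch; f16 payloads treated as opaque uint16_t, strides in elements, all extents assumed positive):

```cpp
#include <cstdint>
#include <vector>

// Copy an N-d tensor element-wise from one stride layout to another,
// matching shapes on both sides.
void copy_strided(const uint16_t *src, uint16_t *dst,
                  const std::vector<int> &shape,
                  const std::vector<int> &src_strides,
                  const std::vector<int> &dst_strides) {
    if (shape.empty()) return;
    std::vector<int> idx(shape.size(), 0);
    while (true) {
        int64_t s = 0, d = 0;
        for (size_t k = 0; k < shape.size(); k++) {
            s += (int64_t) idx[k] * src_strides[k];
            d += (int64_t) idx[k] * dst_strides[k];
        }
        dst[d] = src[s];
        // odometer-style increment over the multi-index
        size_t k = shape.size();
        while (k > 0) {
            if (++idx[k - 1] < shape[k - 1]) break;
            idx[k - 1] = 0;
            k--;
        }
        if (k == 0) return;  // wrapped past the most-significant digit: done
    }
}
```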
diff --git a/src/ops/rms_norm/operator.cc b/src/ops/rms_norm/operator.cc
index 9aa4b206..4cdac453 100644
--- a/src/ops/rms_norm/operator.cc
+++ b/src/ops/rms_norm/operator.cc
@@ -17,6 +17,10 @@
 #ifdef ENABLE_ASCEND_NPU
 #include "ascend/rms_norm_aclnn.h"
 #endif
+#ifdef ENABLE_TECO_SDAA
+#include "teco/rms_norm_teco.h"
+#endif
+
 
 __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
     infiniopHandle_t handle,
@@ -49,6 +53,11 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
                                                w_desc,
                                                epsilon);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoCreateRMSNormDescriptor((TecoHandle_t) handle, (RMSNormTecoDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -76,6 +85,11 @@ __C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t
             return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t) desc,
                                                 size);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoGetRMSNormWorkspaceSize((RMSNormTecoDescriptor_t) desc, size);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -109,6 +123,17 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
                                 w,
                                 stream);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoRMSNorm((RMSNormTecoDescriptor_t) desc,
+                               workspace,
+                               workspace_size,
+                               y,
+                               x,
+                               w,
+                               stream);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -136,6 +161,11 @@ __C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_
             return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t) desc);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoDestroyRMSNormDescriptor((RMSNormTecoDescriptor_t) desc);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
diff --git a/src/ops/rms_norm/teco/rms_norm_teco.cc b/src/ops/rms_norm/teco/rms_norm_teco.cc
new file mode 100644
index 00000000..feb2869f
--- /dev/null
+++ b/src/ops/rms_norm/teco/rms_norm_teco.cc
@@ -0,0 +1,90 @@
+#include "rms_norm_teco.h"
+
+
+infiniopStatus_t tecoCreateRMSNormDescriptor(TecoHandle_t handle, RMSNormTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, infiniopTensorDescriptor_t w_desc, float epsilon) {
+    if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    auto n = y_desc->shape[0],
+         c = y_desc->shape[1];
+    unsigned long h = 1,
+                  w = 1;
+
+    if (x_desc->shape[0] != n || x_desc->shape[1] != c || w_desc->shape[0] != c) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    tecodnnHandle_t tecodnn_handle;
+    tecodnnCreate(&tecodnn_handle);
+    // sdaaStream_t stream;
+    // sdaaStreamCreate(&stream);
+    tecodnnTensorDescriptor_t x_desc_teco, y_desc_teco, w_desc_teco, rms_desc_teco;
+    tecodnnCreateTensorDescriptor(&x_desc_teco);
+    tecodnnCreateTensorDescriptor(&y_desc_teco);
+    tecodnnCreateTensorDescriptor(&w_desc_teco);
+    tecodnnCreateTensorDescriptor(&rms_desc_teco);
+    // toTecodnnTensorDescriptor(x_desc, x_desc_teco);
+    // toTecodnnTensorDescriptor(y_desc, y_desc_teco);
+    // toTecodnnTensorDescriptor(w_desc, w_desc_teco);
+    // tecodnnSetTensor4dDescriptor(x_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_HALF, x_desc->shape[0], 1, 1, x_desc->shape[1]);
+    // tecodnnSetTensor4dDescriptor(y_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_HALF, y_desc->shape[0], 1, 1, y_desc->shape[1]);
+    // if (w_desc->dt == F16)
+    //     tecodnnSetTensor4dDescriptor(w_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_HALF, 1, 1, 1, w_desc->shape[0]);
+    // if (w_desc->dt == F32)
+    //     tecodnnSetTensor4dDescriptor(w_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_FLOAT, 1, 1, 1, w_desc->shape[0]);
+    // tecodnnSetTensor4dDescriptor(rms_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_FLOAT, n, 1, 1, 1);
+
+    if (w_desc->dt == F16) {
+        tecodnnSetTensor4dDescriptor(x_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_HALF, n, h, w, c);
+        tecodnnSetTensor4dDescriptor(y_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_HALF, n, h, w, c);
+        tecodnnSetTensor4dDescriptor(w_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_HALF, 1, 1, 1, c);
+        tecodnnSetTensor4dDescriptor(rms_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_FLOAT, n, h, w, 1);
+    }
+
+    if (w_desc->dt == F32) {
+        tecodnnSetTensor4dDescriptor(x_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_HALF, n, h, w, c);
+        tecodnnSetTensor4dDescriptor(y_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_HALF, n, h, w, c);
+        tecodnnSetTensor4dDescriptor(w_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_FLOAT, 1, 1, 1, c);
+        tecodnnSetTensor4dDescriptor(rms_desc_teco, TECODNN_TENSOR_NCHW, TECODNN_DATA_FLOAT, n, h, w, 1);
+    }
+    *desc_ptr = new RMSNormTecoDescriptor{
+        handle->device,
+        tecodnn_handle,
+        handle->stream,
+        epsilon,
+        x_desc_teco,
+        y_desc_teco,
+        w_desc_teco,
+        rms_desc_teco,
+        n,
+        c,
+    };
+    tecodnnSetStream((*desc_ptr)->handle, (*desc_ptr)->stream);
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoGetRMSNormWorkspaceSize(RMSNormTecoDescriptor_t desc, uint64_t *size) {
+    *size = (desc->n) * (desc->c) * 4;
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoRMSNorm(RMSNormTecoDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void const *w, void *stream) {
+    tecodnnSetStream(desc->handle, desc->stream);
+    tecodnnStatus_t status;
+
+    // void *rms = malloc(workspace_size * sizeof(uint16_t));
+    status = tecodnnRMSNormForward(desc->handle, desc->eps, desc->xDesc, x, desc->wDesc, w, desc->yDesc, y, desc->rmsDesc, workspace);
+    sdaaStreamSynchronize(desc->stream);
+    if (status != TECODNN_STATUS_SUCCESS) {
+        printf("%s\n", tecodnnGetErrorString(status));
+        return STATUS_EXECUTION_FAILED;
+    } else {
+        return STATUS_SUCCESS;
+    }
+}
+
+infiniopStatus_t tecoDestroyRMSNormDescriptor(RMSNormTecoDescriptor_t desc) {
+    delete desc;
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/rms_norm/teco/rms_norm_teco.h b/src/ops/rms_norm/teco/rms_norm_teco.h
new file mode 100644
index 00000000..28a2de32
--- /dev/null
+++ b/src/ops/rms_norm/teco/rms_norm_teco.h
@@ -0,0 +1,35 @@
+#ifndef __TECO_RMS_NORM_H__
+#define __TECO_RMS_NORM_H__
+
+#include "operators.h"
+#include <sdaa_runtime.h>
+#include <tecodnn.h>
+#include "../../../devices/teco/teco_handle.h"
+
+struct RMSNormTecoDescriptor {
+    Device device;
+    tecodnnHandle_t handle;
+    sdaaStream_t stream;
+    float eps;
+    tecodnnTensorDescriptor_t xDesc, yDesc, wDesc, rmsDesc;
+    unsigned long n, c;
+};
+
+typedef struct RMSNormTecoDescriptor *RMSNormTecoDescriptor_t;
+
+infiniopStatus_t tecoCreateRMSNormDescriptor(TecoHandle_t handle, RMSNormTecoDescriptor_t *desc_ptr,
+                                             infiniopTensorDescriptor_t y_desc,
+                                             infiniopTensorDescriptor_t x_desc,
+                                             infiniopTensorDescriptor_t w_desc, float epsilon);
+
+infiniopStatus_t tecoGetRMSNormWorkspaceSize(RMSNormTecoDescriptor_t desc, uint64_t *size);
+
+infiniopStatus_t tecoRMSNorm(RMSNormTecoDescriptor_t desc,
+                             void *workspace,
+                             uint64_t workspace_size,
+                             void *y, void const *x, void const *w,
+                             void *stream);
+
+infiniopStatus_t tecoDestroyRMSNormDescriptor(RMSNormTecoDescriptor_t desc);
+
+#endif
\ No newline at end of file
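For reference, the scalar definition the test suite (rms_norm.py) checks against, and which tecodnnRMSNormForward is assumed to implement; the rmsDesc output holds one float per row, so the n*c*4-byte workspace above is a generous over-allocation of the n*4 bytes actually needed:

```cpp
#include <cmath>
#include <cstddef>

// Scalar reference: for each row of x (length c),
// y[j] = x[j] * w[j] / sqrt(mean(x^2) + eps). Plain float for clarity;
// the teco path runs x and y in f16 with a float per-row rms value.
void rms_norm_row(float *y, const float *x, const float *w, size_t c, float eps) {
    float mean_sq = 0.0f;
    for (size_t j = 0; j < c; j++)
        mean_sq += x[j] * x[j];
    mean_sq /= (float) c;
    float inv_rms = 1.0f / std::sqrt(mean_sq + eps);
    for (size_t j = 0; j < c; j++)
        y[j] = x[j] * w[j] * inv_rms;
}
```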
diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cu b/src/ops/rotary_embedding/cuda/rotary_embedding.cu
index a5f32a97..62579c3d 100644
--- a/src/ops/rotary_embedding/cuda/rotary_embedding.cu
+++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cu
@@ -53,6 +53,8 @@ infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc,
     if (t == nullptr || pos_ids == nullptr || sin_table == nullptr || cos_table == nullptr)
         return STATUS_BAD_PARAM;
 
+    checkCudaError(cudaSetDevice(desc->device_id));
+
     if (dtype_eq(desc->dtype, F16)) {
         rotary_embedding_nv_gpu_f16(desc,
                                     reinterpret_cast<half *>(t),
diff --git a/src/ops/swiglu/cuda/swiglu.cu b/src/ops/swiglu/cuda/swiglu.cu
index a17e994b..c02ce186 100644
--- a/src/ops/swiglu/cuda/swiglu.cu
+++ b/src/ops/swiglu/cuda/swiglu.cu
@@ -59,6 +59,8 @@ infiniopStatus_t cudaSwiGLU(SwiGLUCudaDescriptor_t desc,
                             void const *a,
                             void const *b,
                             void *stream) {
+    checkCudaError(cudaSetDevice(desc->device_id));
+
     if (dtype_eq(desc->dtype, F16)) {
         swiglu_nv_gpu_f16(desc, c, a, b, stream);
         return STATUS_SUCCESS;
diff --git a/src/ops/swiglu/cuda/swiglu.cuh b/src/ops/swiglu/cuda/swiglu.cuh
index eed0be5b..9b3bdcb5 100644
--- a/src/ops/swiglu/cuda/swiglu.cuh
+++ b/src/ops/swiglu/cuda/swiglu.cuh
@@ -6,6 +6,7 @@
 
 struct SwiGLUCudaDescriptor {
     Device device;
+    int device_id;
     DT dtype;
     uint64_t seq_len;
     uint64_t di;
diff --git a/src/ops/swiglu/cuda/swiglu_cuda.cc b/src/ops/swiglu/cuda/swiglu_cuda.cc
index 1f5eb944..16d70503 100644
--- a/src/ops/swiglu/cuda/swiglu_cuda.cc
+++ b/src/ops/swiglu/cuda/swiglu_cuda.cc
@@ -35,6 +35,7 @@ infiniopStatus_t cudaCreateSwiGLUDescriptor(CudaHandle_t handle,
     }
 
     *desc_ptr = new SwiGLUCudaDescriptor{DevNvGpu,
+                                         handle->device_id,
                                          dtype,
                                          seq_len,
                                          di,
diff --git a/src/ops/swiglu/operator.cc b/src/ops/swiglu/operator.cc
index b0bcb35c..3d2c32ee 100644
--- a/src/ops/swiglu/operator.cc
+++ b/src/ops/swiglu/operator.cc
@@ -14,6 +14,9 @@
 #ifdef ENABLE_ASCEND_NPU
 #include "ascend/swiglu.h"
 #endif
+#ifdef ENABLE_TECO_SDAA
+#include "teco/swiglu_sdaa.h"
+#endif
 
 __C infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle,
                                                     infiniopSwiGLUDescriptor_t *desc_ptr,
@@ -45,6 +48,14 @@ __C infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle,
                                              c_desc,
                                              a_desc,
                                              b_desc);
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA:
+            return tecoCreateSwiGLUDescriptor((TecoHandle_t) handle,
+                                              (SwiGLUTecoDescriptor_t *) desc_ptr,
+                                              c_desc,
+                                              a_desc,
+                                              b_desc);
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -72,6 +83,10 @@ __C infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc,
 #ifdef ENABLE_ASCEND_NPU
         case DevAscendNpu:
             return ascendSwiGLU((SwiGLUAscendDescriptor_t) desc, c, a, b, stream);
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA:
+            return tecoSwiGLU((SwiGLUTecoDescriptor_t) desc, c, a, b, stream);
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -95,6 +110,10 @@ __C infiniopStatus_t infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t
 #ifdef ENABLE_ASCEND_NPU
         case DevAscendNpu:
             return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t) desc);
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA:
+            return tecoDestroySwiGLUDescriptor((SwiGLUTecoDescriptor_t) desc);
 #endif
     }
     return STATUS_BAD_DEVICE;
diff --git a/src/ops/swiglu/teco/swiglu_sdaa.h b/src/ops/swiglu/teco/swiglu_sdaa.h
new file mode 100644
index 00000000..f41f325f
--- /dev/null
+++ b/src/ops/swiglu/teco/swiglu_sdaa.h
@@ -0,0 +1,32 @@
+#ifndef __SDAA_SWIGLU_H__
+#define __SDAA_SWIGLU_H__
+#include "operators.h"
+#include <sdaa_runtime.h>
+#include "../../../devices/teco/teco_handle.h"
+struct SwiGLUTecoDescriptor {
+    Device device;
+    int device_id;
+    sdaaStream_t stream;
+    uint64_t rows, cols;
+    int64_t lda, ldb, ldc;
+};
+
+typedef struct SwiGLUTecoDescriptor *SwiGLUTecoDescriptor_t;
+
+
+infiniopStatus_t tecoCreateSwiGLUDescriptor(TecoHandle_t handle,
+                                            SwiGLUTecoDescriptor_t *desc_ptr,
+                                            infiniopTensorDescriptor_t c_desc,
+                                            infiniopTensorDescriptor_t a_desc,
+                                            infiniopTensorDescriptor_t b_desc);
+
+infiniopStatus_t tecoSwiGLU(SwiGLUTecoDescriptor_t desc,
+                            void *c,
+                            void const *a,
+                            void const *b,
+                            void *stream);
+
+infiniopStatus_t tecoDestroySwiGLUDescriptor(SwiGLUTecoDescriptor_t desc);
+
+#endif
\ No newline at end of file
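The SDAA kernel in the next file vectorizes the following elementwise definition over halfv16 lanes, with a scalar tail loop for the last few columns. The scalar form it implements (plain C++ reference, not part of the patch):

```cpp
#include <cmath>
#include <cstddef>

// Reference for the kernel below: SwiGLU(a, b) = a * silu(b), with
// silu(b) = b / (1 + exp(-b)). Row strides lda/ldb/ldc are in elements.
void swiglu_ref(float *c, const float *a, const float *b,
                size_t rows, size_t cols,
                size_t lda, size_t ldb, size_t ldc) {
    for (size_t i = 0; i < rows; i++)
        for (size_t j = 0; j < cols; j++) {
            float bv = b[i * ldb + j];
            c[i * ldc + j] = a[i * lda + j] * bv / (1.0f + std::exp(-bv));
        }
}
```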
diff --git a/src/ops/swiglu/teco/swiglu_sdaa.scpp b/src/ops/swiglu/teco/swiglu_sdaa.scpp
new file mode 100644
index 00000000..c97c6410
--- /dev/null
+++ b/src/ops/swiglu/teco/swiglu_sdaa.scpp
@@ -0,0 +1,70 @@
+#include "swiglu_sdaa.h"
+__local__ halfv16 tempa, tempb, tempc;
+
+__device__ void silu_halfv16(halfv16 *c, halfv16 *a, halfv16 *b) {
+    floatv16 one_v = simd_stretch(1.0f);
+    floatv16 a_silu = simd_div(simd_cvt_h2f(*b), simd_add(one_v, simd_exp(0 - simd_cvt_h2f(*b))));
+    halfv16 out = simd_cvt_f2h(simd_mul(simd_cvt_h2f(*a), a_silu));
+    *c = out;
+}
+
+__device__ void silu_half(half *c, const half *a, const half *b) {
+    *c = (*b) * (*a) / (1.0 + expf(0 - *b));
+}
+
+__global__ void swiglu(half *c, half const *a, half const *b, size_t rows, size_t cols, size_t lda, size_t ldb, size_t ldc) {
+    int vector_size = 16;
+    for (size_t i = 0; i < rows / threadDim + 1; i++) {
+        if (threadIdx < rows - i * threadDim) {
+            size_t j = 0;
+            for (; j < cols / vector_size; j++) {
+                simd_load(tempa, a + (threadIdx + i * threadDim) * lda + j * vector_size);
+                simd_load(tempb, b + (threadIdx + i * threadDim) * ldb + j * vector_size);
+                silu_halfv16(&tempc, &tempa, &tempb);
+                simd_store(tempc, c + (threadIdx + i * threadDim) * ldc + j * vector_size);
+            }
+            for (size_t k = 0; k < cols - j * vector_size; k++) {
+                silu_half(
+                    c + (threadIdx + i * threadDim) * ldc + j * vector_size + k,
+                    a + (threadIdx + i * threadDim) * lda + j * vector_size + k,
+                    b + (threadIdx + i * threadDim) * ldb + j * vector_size + k);
+            }
+        }
+    }
+}
+
+infiniopStatus_t tecoCreateSwiGLUDescriptor(TecoHandle_t handle,
+                                            SwiGLUTecoDescriptor_t *desc_ptr,
+                                            infiniopTensorDescriptor_t c_desc,
+                                            infiniopTensorDescriptor_t a_desc,
+                                            infiniopTensorDescriptor_t b_desc) {
+    *desc_ptr = new SwiGLUTecoDescriptor{
+        handle->device,
+        handle->device_id,
+        handle->stream,
+        a_desc->shape[0],
+        a_desc->shape[1],
+        a_desc->strides[0],
+        b_desc->strides[0],
+        c_desc->strides[0],
+    };
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoSwiGLU(SwiGLUTecoDescriptor_t desc,
+                            void *c,
+                            void const *a,
+                            void const *b,
+                            void *stream) {
+    auto a_ptr = reinterpret_cast<half const *>(a);
+    auto b_ptr = reinterpret_cast<half const *>(b);
+    auto c_ptr = reinterpret_cast<half *>(c);
+    swiglu<<<1>>>(c_ptr, a_ptr, b_ptr, desc->rows, desc->cols, desc->lda, desc->ldb, desc->ldc);
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoDestroySwiGLUDescriptor(SwiGLUTecoDescriptor_t desc) {
+    return STATUS_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/ops/utils.h b/src/ops/utils.h
index b48cf419..f0e64fee 100644
--- a/src/ops/utils.h
+++ b/src/ops/utils.h
@@ -4,6 +4,7 @@
 #include "data_type.h"
 #include "tensor.h"
 #include
+#include
 #include
 #include
 #include
diff --git a/wget-log b/wget-log
new file mode 100644
index 00000000..e6c76450
--- /dev/null
+++ b/wget-log
@@ -0,0 +1,11 @@
+--2024-11-25 07:56:20--  https://raw.githubusercontent.com/tboox/xmake/master/scripts/get.sh
+Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
+Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 8113 (7.9K) [text/plain]
+Saving to: 'STDOUT'
+
+     -           0%[                            ]       0  --.-KB/s
+     -         100%[============================>]   7.92K  --.-KB/s    in 0.07s
+
+2024-11-25 07:56:21 (119 KB/s) - written to stdout [8113/8113]
+
diff --git a/xmake.lua b/xmake.lua
index 327e91ef..64b406ff 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -40,6 +40,14 @@ option("ascend-npu")
     add_defines("ENABLE_ASCEND_NPU")
 option_end()
 
+option("teco")
+    set_default(false)
+    set_showmenu(true)
+    set_description("Enable or disable Teco kernel")
+    add_defines("ENABLE_TECO_SDAA")
+option_end()
+
+
 if is_mode("debug") then
     add_cxflags("-g -O0")
     add_defines("DEBUG_MODE")
@@ -212,6 +220,52 @@ if has_config("ascend-npu") then
 target_end()
 end
 
+if has_config("teco") then
+
+    add_defines("ENABLE_TECO_SDAA")
+    add_includedirs("/opt/tecoai/include")
+    add_linkdirs("/opt/tecoai/lib64")
+    add_links("libsdaart.so")
+    add_links("libtecoblas.so")
+    add_links("libtecodnn.so")
+
+    rule("scpp")
+        set_extensions(".scpp")
+
+        on_load(function (target)
+            target:add("includedirs", "include")
+        end)
+
+        on_build_file(function (target, sourcefile)
+            local objectfile = target:objectfile(sourcefile)
+            os.mkdir(path.directory(objectfile))
+
+            local cc = "/opt/tecoai/bin/tecocc"
+
+            local includedirs = table.concat(target:get("includedirs"), " ")
+            local args = {sourcefile, "-o", objectfile, "-O2", "-fPIC", "-Wall", "-Werror", "-std=c++17", "-pthread", "-c"}
+
+            for _, includedir in ipairs(target:get("includedirs")) do
+                table.insert(args, "-I" .. includedir)
+            end
+
+            os.execv(cc, args)
+            table.insert(target:objectfiles(), objectfile)
+        end)
+
+    rule_end()
+
+
+    target("teco")
+        set_kind("static")
+        set_languages("cxx17")
+        add_files("src/devices/teco/*.cc", "src/ops/*/teco/*.cc")
+        add_files("src/ops/*/teco/*.scpp", {rule = "scpp"})
+        add_cxflags("-lstdc++ -Wall -Werror -fPIC")
+    target_end()
+
+end
+
 target("infiniop")
     set_kind("shared")
 
@@ -227,6 +281,9 @@ target("infiniop")
     if has_config("ascend-npu") then
         add_deps("ascend-npu")
     end
+    if has_config("teco") then
+        add_deps("teco")
+    end
     set_languages("cxx17")
     add_files("src/devices/handle.cc")
     add_files("src/ops/*/operator.cc")
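With the `teco` option enabled, the new backend is reachable through the existing infiniop entry points. A minimal smoke test (hypothetical `main`; the header that declares infiniopCreateHandle is assumed, and the library is assumed to have been configured with the new `teco` option):

```cpp
#include "device.h"
#include "operators.h"  // assumed to declare infiniopCreateHandle; adjust to the repo's actual header
#include <cstdio>

int main() {
    infiniopHandle_t handle = nullptr;
    // Routed to createTecoHandle via the DevTecoSDAA case added in handle.cc.
    if (infiniopCreateHandle(&handle, DevTecoSDAA, /*device_id=*/0) != STATUS_SUCCESS) {
        std::printf("no Teco SDAA device available\n");
        return 1;
    }
    std::printf("Teco SDAA handle created\n");
    infiniopDestroyHandle(handle);
    return 0;
}
```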