1 change: 1 addition & 0 deletions .github/workflows/main.yaml
@@ -23,6 +23,7 @@ jobs:

- name: Install Python dependencies
run: |
pip install numpy
pip install torch

- name: Install xmake
1 change: 1 addition & 0 deletions include/device.h
@@ -6,6 +6,7 @@ enum DeviceEnum {
DevNvGpu,
DevCambriconMlu,
DevAscendNpu,
DevTecoSDAA,
};

typedef enum DeviceEnum Device;
78 changes: 78 additions & 0 deletions include/infinirt.h
@@ -0,0 +1,78 @@
#ifndef INFINI_RUNTIME_H
#define INFINI_RUNTIME_H

#if defined(_WIN32)
#define __export __declspec(dllexport)
#elif defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
#define __export __attribute__((visibility("default")))
#else
#define __export
#endif

#ifdef __cplusplus
#define __C extern "C"
#else
#define __C
#endif
#include <stddef.h>
#include <stdint.h>

typedef enum
{
DEVICE_CPU,
DEVICE_NVIDIA,
DEVICE_CAMBRICON,
DEVICE_ASCEND,
DEVICE_TECO,
} DeviceType;

typedef enum
{
INFINIRT_STATUS_SUCCESS = 0,
INFINIRT_STATUS_EXECUTION_FAILED = 1,
INFINIRT_STATUS_BAD_DEVICE = 2,
INFINIRT_STATUS_DEVICE_NOT_SUPPORTED = 3,
INFINIRT_STATUS_DEVICE_MISMATCH = 4,
INFINIRT_STATUS_INVALID_ARGUMENT = 5,
INFINIRT_STATUS_ILLEGAL_MEMORY_ACCESS = 6,
INFINIRT_STATUS_NOT_READY = 7,
} infinirtStatus_t;

__C __export infinirtStatus_t infinirtInit(DeviceType device);

// Device
__C __export infinirtStatus_t infinirtDeviceSynchronize(DeviceType device, uint32_t deviceId);

// Stream
struct infinirtStream;
typedef struct infinirtStream *infinirtStream_t;
#define INFINIRT_NULL_STREAM nullptr
__C __export infinirtStatus_t infinirtStreamCreate(infinirtStream_t *pStream, DeviceType device, uint32_t deviceId);
__C __export infinirtStatus_t infinirtStreamDestroy(infinirtStream_t stream);
__C __export infinirtStatus_t infinirtStreamSynchronize(infinirtStream_t stream);
__C __export infinirtStatus_t infinirtGetRawStream(void** ptr, infinirtStream_t stream);
__C __export infinirtStatus_t infinirtGetStreamDeviceInfo(DeviceType* deviceType, uint32_t *deviceId, infinirtStream_t stream);

// Event
struct infinirtEvent;
typedef struct infinirtEvent *infinirtEvent_t;
__C __export infinirtStatus_t infinirtEventCreate(infinirtEvent_t *pEvent, DeviceType device, uint32_t deviceId);
__C __export infinirtStatus_t infinirtEventRecord(infinirtEvent_t event, infinirtStream_t stream);
__C __export infinirtStatus_t infinirtEventQuery(infinirtEvent_t event);
__C __export infinirtStatus_t infinirtEventSynchronize(infinirtEvent_t event);
__C __export infinirtStatus_t infinirtEventDestroy(infinirtEvent_t event);
__C __export infinirtStatus_t infinirtStreamWaitEvent(infinirtEvent_t event, infinirtStream_t stream);

// Memory
__C __export infinirtStatus_t infinirtMalloc(void **pMemory, DeviceType device, uint32_t deviceId, size_t size);
__C __export infinirtStatus_t infinirtMallocAsync(void **pMemory, DeviceType device, uint32_t deviceId, size_t size, infinirtStream_t stream);
__C __export infinirtStatus_t infinirtMallocHost(void **pMemory, DeviceType device, uint32_t deviceId, size_t size);
__C __export infinirtStatus_t infinirtFree(void *ptr, DeviceType device, uint32_t deviceId);
__C __export infinirtStatus_t infinirtFreeAsync(void *ptr, DeviceType device, uint32_t deviceId, infinirtStream_t stream);
__C __export infinirtStatus_t infinirtFreeHost(void *ptr, DeviceType device, uint32_t deviceId);
__C __export infinirtStatus_t infinirtMemcpyH2D(void *dst, DeviceType device, uint32_t deviceId, const void *src, size_t size);
__C __export infinirtStatus_t infinirtMemcpyH2DAsync(void *dst, DeviceType device, uint32_t deviceId, const void *src, size_t size, infinirtStream_t stream);
__C __export infinirtStatus_t infinirtMemcpyD2H(void *dst, const void* src, DeviceType device, uint32_t deviceId, size_t size);
__C __export infinirtStatus_t infinirtMemcpy(void *dst, const void* src, DeviceType device, uint32_t deviceId, size_t size);
__C __export infinirtStatus_t infinirtMemcpyAsync(void *dst, const void* src, DeviceType device, uint32_t deviceId, size_t size, infinirtStream_t stream);
#endif
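
For context, a minimal usage sketch of the runtime API declared above (not part of this PR). The NVIDIA backend, device id 0, and the buffer size are illustrative assumptions; error handling is shown only for the init call.

#include <stdio.h>
#include <string.h>
#include "infinirt.h"

int main(void) {
    // Initialize the runtime for one backend (assumption: NVIDIA, device 0).
    if (infinirtInit(DEVICE_NVIDIA) != INFINIRT_STATUS_SUCCESS) {
        fprintf(stderr, "infinirtInit failed\n");
        return 1;
    }

    // Create a stream on device 0.
    infinirtStream_t stream;
    infinirtStreamCreate(&stream, DEVICE_NVIDIA, 0);

    // Allocate device memory and copy a host buffer to it asynchronously.
    float host[1024];
    memset(host, 0, sizeof(host));
    void *dev = NULL;
    infinirtMallocAsync(&dev, DEVICE_NVIDIA, 0, sizeof(host), stream);
    infinirtMemcpyH2DAsync(dev, DEVICE_NVIDIA, 0, host, sizeof(host), stream);

    // Record an event after the copy and wait for it to complete.
    infinirtEvent_t event;
    infinirtEventCreate(&event, DEVICE_NVIDIA, 0);
    infinirtEventRecord(event, stream);
    infinirtEventSynchronize(event);

    // Release resources once the stream has drained.
    infinirtFreeAsync(dev, DEVICE_NVIDIA, 0, stream);
    infinirtStreamSynchronize(stream);
    infinirtEventDestroy(event);
    infinirtStreamDestroy(stream);
    return 0;
}
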
1 change: 1 addition & 0 deletions operatorspy/devices.py
@@ -3,3 +3,4 @@ class DeviceEnum:
DEVICE_CUDA = 1
DEVICE_BANG = 2
DEVICE_ASCEND = 3
DEVICE_TECO = 4
1 change: 1 addition & 0 deletions operatorspy/liboperators.py
@@ -43,6 +43,7 @@ def find_library_in_ld_path(library_name):
paths = ld_library_path.split(os.pathsep)
for path in paths:
full_path = os.path.join(path, library_name)
print(full_path)
if os.path.isfile(full_path):
return full_path
return None
48 changes: 43 additions & 5 deletions operatorspy/tests/matmul.py
@@ -77,13 +77,15 @@ def test(

ans = matmul(c, beta, a, b, alpha)


if a_stride is not None:
a = rearrange_tensor(a, a_stride)
if b_stride is not None:
b = rearrange_tensor(b, b_stride)
if c_stride is not None:
c = rearrange_tensor(c, c_stride)

ans = matmul(c, beta, a, b, alpha)

a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib)
c_tensor = to_tensor(c, lib)
@@ -99,7 +101,7 @@ def test(
beta
)
)

print(a.stride(),b.stride(),c.stride())
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
@@ -117,8 +119,7 @@ def test(
None,
)
)

assert torch.allclose(c, ans, atol=0, rtol=1e-2)
assert torch.allclose(c, ans, atol=0, rtol=1e-3)

if PROFILE:
for i in range(NUM_PRERUN):
@@ -157,6 +158,7 @@ def test(
print(f" lib time: {elapsed :6f}")

check_error(lib.infiniopDestroyMatmulDescriptor(descriptor))
print("Test passed!")


def test_cpu(lib, test_cases):
@@ -292,6 +294,40 @@ def test_ascend(lib, test_cases):

destroy_handle(lib, handle)

def test_sdaa(lib, test_cases):
import torch_sdaa

device = DeviceEnum.DEVICE_TECO
handle = create_handle(lib, device)

for (
alpha,
beta,
a_shape,
b_shape,
c_shape,
a_stride,
b_stride,
c_stride,
dtype,
) in test_cases:
test(
lib,
handle,
"sdaa",
alpha,
beta,
a_shape,
b_shape,
c_shape,
a_stride,
b_stride,
c_stride,
dtype,
)

destroy_handle(lib, handle)

if __name__ == "__main__":
test_cases = [
# alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride, dtype
@@ -352,4 +388,6 @@ def test_ascend(lib, test_cases):
test_ascend(lib, test_cases)
if not (args.cpu or args.cuda or args.bang or args.ascend):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
if args.teco:
test_sdaa(lib,test_cases)
print("Test passed!")
33 changes: 33 additions & 0 deletions operatorspy/tests/mlp.py
@@ -240,6 +240,37 @@ def test_bang(lib, test_cases):

destroy_handle(lib, handle)

def test_sdaa(lib, test_cases):
import torch_sdaa

device = DeviceEnum.DEVICE_TECO
handle = create_handle(lib, device)

for (
num_tokens,
hidden_size,
intermediate_size,
alpha,
residual,
dtype,
x_stride,
y_stride,
) in test_cases:
test(
lib,
handle,
"sdaa",
num_tokens,
hidden_size,
intermediate_size,
alpha,
residual,
dtype,
x_stride,
y_stride,
)

destroy_handle(lib, handle)

if __name__ == "__main__":
test_cases = [
@@ -307,4 +338,6 @@ def test_bang(lib, test_cases):
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
if args.teco:
test_sdaa(lib,test_cases)
print("Test passed!")
36 changes: 17 additions & 19 deletions operatorspy/tests/random_sample.py
@@ -63,8 +63,6 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
else:
end = topk



sum_s = 0
for i in range(end):
sum_s += dataNp[i]
@@ -78,12 +76,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):

def random_sample_0(data):
return torch.argmax(data)

def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16):
print(
f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}"
)

data = torch.rand((voc), dtype=x_dtype).to(torch_device)
data = torch.arange(voc).float() * 0.0001
_perm = torch.randperm(voc)
data = data[_perm].to(x_dtype).to(torch_device)
if(topp > 0 and topk > 1):
ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu")
else:
@@ -130,12 +130,9 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
if torch_device == "npu":
torch.npu.synchronize()

assert indices[0].type(ans.dtype) == ans or abs(data[indices[0]] - data[ans]) == 0.0, "compute error"



assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]]
check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor))

print("Test passed!")

def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
@@ -176,15 +173,16 @@ def test_ascend(lib, test_cases):
if __name__ == "__main__":
test_cases = [
# voc, random_val, topp, topk, temperature
(512, 0.92, 0.8, 3, 0.5),
(4096, 0.95, 0.9, 5, 1.0),
(16384, 0.85, 0.85, 10, 2.0),
(512, 0.92, 0, 3, 0.5),
(4096, 0.95, 0.9, 1, 1.0),
(16384, 0.85, 0, 1, 2.0),
(16384, 0.85, 0, 1, 2.0),
(32000, 0.8, 0.8, 50, 1.0),
(32000, 0.8, 1.0, 25, 1.0),
(512, 0.8, 0.8, 3, 0.5),
(4096, 0.05, 0.9, 5, 1.0),
(16384, 0.15, 0.85, 10, 2.0),
(512, 0.08, 0, 3, 0.5),
(4096, 0.5, 0.9, 1, 1.0),
(16384, 0.15, 0, 1, 2.0),
(16384, 0.15, 0, 1, 2.0),
(32000, 0.08, 0.8, 50, 1.0),
(32000, 0.08, 1.0, 25, 1.0),
# (119696, 0.01, 1.0, 100, 1.0),
]

args = get_args()
@@ -228,4 +226,4 @@ def test_ascend(lib, test_cases):
test_ascend(lib, test_cases)
if not (args.cpu or args.cuda or args.bang or args.ascend):
test_cpu(lib, test_cases)
print("Test passed!")
print("\033[92mTest passed!\033[0m")
13 changes: 13 additions & 0 deletions operatorspy/tests/rearrange.py
@@ -104,6 +104,17 @@ def test_ascend(lib, test_cases):
test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride)
destroy_handle(lib, handle)

def test_teco(lib, test_cases):
import torch_sdaa

device = DeviceEnum.DEVICE_TECO
handle = create_handle(lib, device)
for test_case in test_cases:
x_shape, x_stride = test_case[0]
y_shape, y_stride = test_case[1]
test(lib, handle, "sdaa", x_shape, x_stride, y_shape, y_stride)
destroy_handle(lib, handle)

if __name__ == "__main__":
args = get_args()
test_cases = [
@@ -140,3 +151,5 @@ def test_ascend(lib, test_cases):
test_bang(lib, test_cases)
if args.ascend:
test_ascend(lib, test_cases)
if args.teco:
test_teco(lib, test_cases)
11 changes: 10 additions & 1 deletion operatorspy/tests/rms_norm.py
@@ -77,7 +77,6 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float
None,
)
)

assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3)
check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor))
print("Test passed!")
@@ -104,6 +103,14 @@ def test_bang(lib, test_cases):
test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype)
destroy_handle(lib, handle)

def test_sdaa(lib, test_cases):
import torch_sdaa
device = DeviceEnum.DEVICE_TECO
handle = create_handle(lib, device)
for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
test(lib, handle, "sdaa", y_shape, x_shape, w_shape, dtype, w_dtype)
destroy_handle(lib, handle)

def test_ascend(lib, test_cases):
import torch_npu
device = DeviceEnum.DEVICE_ASCEND
@@ -158,6 +165,8 @@ def test_ascend(lib, test_cases):
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if args.teco:
test_sdaa(lib,test_cases)
if args.ascend:
test_ascend(lib, test_cases)
if not (args.cpu or args.cuda or args.bang or args.ascend):