Skip to content

Commit 1679773

Browse files
committed
UCT: Fixed the last code style issues, removed topo and rebuild.
1 parent ba8923d commit 1679773

File tree

10 files changed

+179
-2046
lines changed

10 files changed

+179
-2046
lines changed

configure.ac

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,6 @@ AS_IF([test "x$with_docs_only" = xyes],
232232
AM_CONDITIONAL([HAVE_LCOV], [false])
233233
AM_CONDITIONAL([HAVE_ZE], [false])
234234
AM_CONDITIONAL([HAVE_GAUDI], [false])
235-
AM_CONDITIONAL([ENABLE_GAUDI_TOPO_API], [false])
236235
],
237236
[
238237
AM_CONDITIONAL([DOCS_ONLY], [false])

src/uct/gaudi/Makefile.am

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,12 @@ libuct_gaudi_la_LDFLAGS = $(GAUDI_LDFLAGS) $(GAUDI_LIBS) -version-info $(SOVERS
1616

1717
noinst_HEADERS = \
1818
base/scal.h \
19-
base/gaudi_base.h \
20-
base/gaudi_topo.h
19+
base/gaudi_base.h
2120

2221
EXTRA_DIST =
2322

2423
libuct_gaudi_la_SOURCES = \
25-
base/gaudi_base.c \
26-
base/gaudi_topo.c
24+
base/gaudi_base.c
2725

2826
noinst_HEADERS += \
2927
gaudi_gdr/gaudi_gdr_md.h \

src/uct/gaudi/base/gaudi_base.c

Lines changed: 145 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,22 @@
88
#endif
99

1010
#include "gaudi_base.h"
11-
#include <uct/gaudi/gaudi_gdr/gaudi_gdr_md.h>
11+
12+
#include <ucs/arch/atomic.h>
13+
#include <ucs/sys/sys.h>
14+
#include <ucs/sys/string.h>
1215
#include <ucs/sys/module.h>
1316
#include <ucs/memory/numa.h>
14-
#include <ucs/sys/sys.h>
1517
#include <ucs/sys/topo/base/topo.h>
16-
#include <pthread.h>
18+
#include <uct/gaudi/gaudi_gdr/gaudi_gdr_md.h>
1719

1820
#include <inttypes.h>
1921
#include <fcntl.h>
22+
#include <pthread.h>
2023
#include <hlthunk.h>
2124
#include <synapse_api.h>
2225

26+
2327
int uct_gaudi_base_get_fd(int device_id, bool *fd_created)
2428
{
2529
synDeviceInfo deviceInfo;
@@ -39,6 +43,9 @@ int uct_gaudi_base_get_fd(int device_id, bool *fd_created)
3943
return fd;
4044
}
4145

46+
if (fd_created != NULL) {
47+
*fd_created = false;
48+
}
4249
return deviceInfo.fd;
4350
}
4451

@@ -56,7 +63,7 @@ void uct_gaudi_base_close_dmabuf_fd(int fd)
5663
}
5764
}
5865

59-
ucs_status_t uct_gaudi_base_get_sysdev(int fd, ucs_sys_device_t* sys_dev)
66+
ucs_status_t uct_gaudi_base_get_sysdev(int fd, ucs_sys_device_t *sys_dev)
6067
{
6168
ucs_status_t status;
6269
char pci_bus_id[13];
@@ -152,3 +159,137 @@ uct_gaudi_base_query_devices(uct_md_h md,
152159
UCT_DEVICE_TYPE_ACC, sys_dev,
153160
tl_devices_p, num_tl_devices_p);
154161
}
162+
163+
static void
164+
uct_gaudi_base_configure_sys_device_from_fd(int fd, int index,
165+
ucs_sys_device_t *sys_dev_p)
166+
{
167+
ucs_status_t status;
168+
struct hlthunk_hw_ip_info hw_ip;
169+
const unsigned sys_device_priority = 10;
170+
char device_name[16];
171+
int rc;
172+
173+
ucs_assert(fd >= 0);
174+
175+
status = uct_gaudi_base_get_sysdev(fd, sys_dev_p);
176+
if (status != UCS_OK) {
177+
goto err;
178+
}
179+
180+
memset(&hw_ip, 0, sizeof(hw_ip));
181+
rc = hlthunk_get_hw_ip_info(fd, &hw_ip);
182+
if (rc) {
183+
ucs_error("failed to get hw_ip info for fd %d (rc=%d)", fd, rc);
184+
goto err;
185+
}
186+
187+
status = ucs_topo_sys_device_set_user_value(*sys_dev_p, hw_ip.module_id);
188+
if (status != UCS_OK) {
189+
ucs_error("failed to set user value %u for sys_dev %d", hw_ip.module_id,
190+
*sys_dev_p);
191+
goto err;
192+
}
193+
194+
ucs_snprintf_safe(device_name, sizeof(device_name), "GAUDI_%d", index);
195+
status = ucs_topo_sys_device_set_name(*sys_dev_p, device_name,
196+
sys_device_priority);
197+
if (status != UCS_OK) {
198+
ucs_warn("failed to set name for index %d: %s", index,
199+
ucs_status_string(status));
200+
}
201+
202+
status = ucs_topo_sys_device_enable_aux_path(*sys_dev_p);
203+
if (status != UCS_OK) {
204+
ucs_debug("no aux path for %s: %s", device_name,
205+
ucs_status_string(status));
206+
}
207+
208+
ucs_debug("registered %s (sys_dev %d)", device_name, *sys_dev_p);
209+
210+
return;
211+
212+
err:
213+
*sys_dev_p = UCS_SYS_DEVICE_ID_UNKNOWN;
214+
}
215+
216+
static int uct_gaudi_base_open_minor(int id)
217+
{
218+
char buf[64];
219+
int fd;
220+
ucs_snprintf_safe(buf, sizeof(buf), HLTHUNK_DEV_NAME_CONTROL, id);
221+
fd = open(buf, O_RDWR | O_CLOEXEC, 0);
222+
return (fd >= 0) ? fd : -errno;
223+
}
224+
225+
/* device discovery - enumerate all gaudi devices and register with topology */
226+
ucs_status_t uct_gaudi_base_discover_devices(void)
227+
{
228+
static pthread_mutex_t discovery_mutex = PTHREAD_MUTEX_INITIALIZER;
229+
static uint32_t discovery_done = 0;
230+
231+
ucs_status_t status = UCS_OK;
232+
ucs_sys_device_t sys_dev;
233+
234+
int device_count = 0;
235+
int discovered_devices = 0;
236+
int i, fd;
237+
238+
/* check if already discovered - use atomic load for memory ordering */
239+
if (ucs_atomic_fadd32(&discovery_done, 0)) {
240+
return UCS_OK;
241+
}
242+
243+
pthread_mutex_lock(&discovery_mutex);
244+
245+
/* double-check after acquiring lock */
246+
if (ucs_atomic_fadd32(&discovery_done, 0)) {
247+
goto out;
248+
}
249+
250+
ucs_debug("starting gaudi device discovery");
251+
252+
/* we do not know what minor is in use, so try them all. */
253+
for (i = 0; i < HLTHUNK_MAX_MINOR; i++) {
254+
/* open the control device instead of the actual hardware device, */
255+
/* because the real device node may be busy or in use by another process. */
256+
fd = uct_gaudi_base_open_minor(i);
257+
if (fd < 0) {
258+
continue;
259+
}
260+
261+
uct_gaudi_base_configure_sys_device_from_fd(fd, discovered_devices,
262+
&sys_dev);
263+
close(fd);
264+
265+
if (sys_dev != UCS_SYS_DEVICE_ID_UNKNOWN) {
266+
discovered_devices++;
267+
}
268+
}
269+
270+
/* extra measure: compare with reported count */
271+
device_count = hlthunk_get_device_count(HLTHUNK_DEVICE_DONT_CARE);
272+
if (device_count >= 0 && device_count != discovered_devices) {
273+
ucs_warn("gaudi discovery mismatch: discovered=%d, driver=%d",
274+
discovered_devices, device_count);
275+
}
276+
277+
if (discovered_devices > 0) {
278+
ucs_debug("discovered %d gaudi devices", discovered_devices);
279+
status = UCS_OK;
280+
281+
ucs_atomic_add32(&discovery_done, 1);
282+
} else {
283+
ucs_debug("no gaudi devices found");
284+
status = UCS_ERR_NO_DEVICE;
285+
}
286+
287+
out:
288+
pthread_mutex_unlock(&discovery_mutex);
289+
return status;
290+
}
291+
292+
UCS_MODULE_INIT()
293+
{
294+
return UCS_OK;
295+
}

src/uct/gaudi/base/gaudi_base.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,21 @@
66
#ifndef UCT_GAUDI_BASE_H_
77
#define UCT_GAUDI_BASE_H_
88

9-
#include <stdbool.h>
9+
#include "scal.h"
10+
1011
#include <uct/base/uct_iface.h>
1112
#include <uct/base/uct_md.h>
12-
#include "scal.h"
13+
14+
#include <stdbool.h>
15+
1316

1417
int uct_gaudi_base_get_fd(int device_id, bool *fd_created);
1518

1619
void uct_gaudi_base_close_fd(int fd, bool fd_created);
1720

1821
void uct_gaudi_base_close_dmabuf_fd(int fd);
1922

20-
ucs_status_t uct_gaudi_base_get_sysdev(int fd, ucs_sys_device_t* sys_dev);
23+
ucs_status_t uct_gaudi_base_get_sysdev(int fd, ucs_sys_device_t *sys_dev);
2124

2225
ucs_status_t uct_gaudi_base_get_info(int fd,
2326
uint64_t *device_base_allocated_address,
@@ -28,4 +31,6 @@ ucs_status_t
2831
uct_gaudi_base_query_devices(uct_md_h md,
2932
uct_tl_device_resource_t **tl_devices_p,
3033
unsigned *num_tl_devices_p);
34+
35+
ucs_status_t uct_gaudi_base_discover_devices(void);
3136
#endif

0 commit comments

Comments
 (0)