88#endif
99
1010#include "gaudi_base.h"
11- #include <uct/gaudi/gaudi_gdr/gaudi_gdr_md.h>
11+
12+ #include <ucs/arch/atomic.h>
13+ #include <ucs/sys/sys.h>
14+ #include <ucs/sys/string.h>
1215#include <ucs/sys/module.h>
1316#include <ucs/memory/numa.h>
14- #include <ucs/sys/sys.h>
1517#include <ucs/sys/topo/base/topo.h>
16- #include <pthread .h>
18+ #include <uct/gaudi/gaudi_gdr/gaudi_gdr_md .h>
1719
1820#include <inttypes.h>
1921#include <fcntl.h>
22+ #include <pthread.h>
2023#include <hlthunk.h>
2124#include <synapse_api.h>
2225
26+
2327int uct_gaudi_base_get_fd (int device_id , bool * fd_created )
2428{
2529 synDeviceInfo deviceInfo ;
@@ -39,6 +43,9 @@ int uct_gaudi_base_get_fd(int device_id, bool *fd_created)
3943 return fd ;
4044 }
4145
46+ if (fd_created != NULL ) {
47+ * fd_created = false;
48+ }
4249 return deviceInfo .fd ;
4350}
4451
@@ -56,7 +63,7 @@ void uct_gaudi_base_close_dmabuf_fd(int fd)
5663 }
5764}
5865
59- ucs_status_t uct_gaudi_base_get_sysdev (int fd , ucs_sys_device_t * sys_dev )
66+ ucs_status_t uct_gaudi_base_get_sysdev (int fd , ucs_sys_device_t * sys_dev )
6067{
6168 ucs_status_t status ;
6269 char pci_bus_id [13 ];
@@ -152,3 +159,137 @@ uct_gaudi_base_query_devices(uct_md_h md,
152159 UCT_DEVICE_TYPE_ACC , sys_dev ,
153160 tl_devices_p , num_tl_devices_p );
154161}
162+
163+ static void
164+ uct_gaudi_base_configure_sys_device_from_fd (int fd , int index ,
165+ ucs_sys_device_t * sys_dev_p )
166+ {
167+ ucs_status_t status ;
168+ struct hlthunk_hw_ip_info hw_ip ;
169+ const unsigned sys_device_priority = 10 ;
170+ char device_name [16 ];
171+ int rc ;
172+
173+ ucs_assert (fd >= 0 );
174+
175+ status = uct_gaudi_base_get_sysdev (fd , sys_dev_p );
176+ if (status != UCS_OK ) {
177+ goto err ;
178+ }
179+
180+ memset (& hw_ip , 0 , sizeof (hw_ip ));
181+ rc = hlthunk_get_hw_ip_info (fd , & hw_ip );
182+ if (rc ) {
183+ ucs_error ("failed to get hw_ip info for fd %d (rc=%d)" , fd , rc );
184+ goto err ;
185+ }
186+
187+ status = ucs_topo_sys_device_set_user_value (* sys_dev_p , hw_ip .module_id );
188+ if (status != UCS_OK ) {
189+ ucs_error ("failed to set user value %u for sys_dev %d" , hw_ip .module_id ,
190+ * sys_dev_p );
191+ goto err ;
192+ }
193+
194+ ucs_snprintf_safe (device_name , sizeof (device_name ), "GAUDI_%d" , index );
195+ status = ucs_topo_sys_device_set_name (* sys_dev_p , device_name ,
196+ sys_device_priority );
197+ if (status != UCS_OK ) {
198+ ucs_warn ("failed to set name for index %d: %s" , index ,
199+ ucs_status_string (status ));
200+ }
201+
202+ status = ucs_topo_sys_device_enable_aux_path (* sys_dev_p );
203+ if (status != UCS_OK ) {
204+ ucs_debug ("no aux path for %s: %s" , device_name ,
205+ ucs_status_string (status ));
206+ }
207+
208+ ucs_debug ("registered %s (sys_dev %d)" , device_name , * sys_dev_p );
209+
210+ return ;
211+
212+ err :
213+ * sys_dev_p = UCS_SYS_DEVICE_ID_UNKNOWN ;
214+ }
215+
216+ static int uct_gaudi_base_open_minor (int id )
217+ {
218+ char buf [64 ];
219+ int fd ;
220+ ucs_snprintf_safe (buf , sizeof (buf ), HLTHUNK_DEV_NAME_CONTROL , id );
221+ fd = open (buf , O_RDWR | O_CLOEXEC , 0 );
222+ return (fd >= 0 ) ? fd : - errno ;
223+ }
224+
225+ /* device discovery - enumerate all gaudi devices and register with topology */
226+ ucs_status_t uct_gaudi_base_discover_devices (void )
227+ {
228+ static pthread_mutex_t discovery_mutex = PTHREAD_MUTEX_INITIALIZER ;
229+ static uint32_t discovery_done = 0 ;
230+
231+ ucs_status_t status = UCS_OK ;
232+ ucs_sys_device_t sys_dev ;
233+
234+ int device_count = 0 ;
235+ int discovered_devices = 0 ;
236+ int i , fd ;
237+
238+ /* check if already discovered - use atomic load for memory ordering */
239+ if (ucs_atomic_fadd32 (& discovery_done , 0 )) {
240+ return UCS_OK ;
241+ }
242+
243+ pthread_mutex_lock (& discovery_mutex );
244+
245+ /* double-check after acquiring lock */
246+ if (ucs_atomic_fadd32 (& discovery_done , 0 )) {
247+ goto out ;
248+ }
249+
250+ ucs_debug ("starting gaudi device discovery" );
251+
252+ /* we do not know what minor is in use, so try them all. */
253+ for (i = 0 ; i < HLTHUNK_MAX_MINOR ; i ++ ) {
254+ /* open the control device instead of the actual hardware device, */
255+ /* because the real device node may be busy or in use by another process. */
256+ fd = uct_gaudi_base_open_minor (i );
257+ if (fd < 0 ) {
258+ continue ;
259+ }
260+
261+ uct_gaudi_base_configure_sys_device_from_fd (fd , discovered_devices ,
262+ & sys_dev );
263+ close (fd );
264+
265+ if (sys_dev != UCS_SYS_DEVICE_ID_UNKNOWN ) {
266+ discovered_devices ++ ;
267+ }
268+ }
269+
270+ /* extra measure: compare with reported count */
271+ device_count = hlthunk_get_device_count (HLTHUNK_DEVICE_DONT_CARE );
272+ if (device_count >= 0 && device_count != discovered_devices ) {
273+ ucs_warn ("gaudi discovery mismatch: discovered=%d, driver=%d" ,
274+ discovered_devices , device_count );
275+ }
276+
277+ if (discovered_devices > 0 ) {
278+ ucs_debug ("discovered %d gaudi devices" , discovered_devices );
279+ status = UCS_OK ;
280+
281+ ucs_atomic_add32 (& discovery_done , 1 );
282+ } else {
283+ ucs_debug ("no gaudi devices found" );
284+ status = UCS_ERR_NO_DEVICE ;
285+ }
286+
287+ out :
288+ pthread_mutex_unlock (& discovery_mutex );
289+ return status ;
290+ }
291+
292+ UCS_MODULE_INIT ()
293+ {
294+ return UCS_OK ;
295+ }
0 commit comments