diff --git a/debian/rdma-core.install b/debian/rdma-core.install index efed75905..c671c2293 100644 --- a/debian/rdma-core.install +++ b/debian/rdma-core.install @@ -22,6 +22,7 @@ lib/udev/rules.d/90-rdma-ulp-modules.rules lib/udev/rules.d/90-rdma-umad.rules usr/lib/truescale-serdes.cmds usr/sbin/iwpmd +usr/sbin/rdma_topo usr/sbin/rdma-ndd usr/share/doc/rdma-core/70-persistent-ipoib.rules usr/share/doc/rdma-core/MAINTAINERS diff --git a/kernel-boot/CMakeLists.txt b/kernel-boot/CMakeLists.txt index 83d0a274f..9bf7540eb 100644 --- a/kernel-boot/CMakeLists.txt +++ b/kernel-boot/CMakeLists.txt @@ -41,6 +41,9 @@ rdma_subst_install(FILES "persistent-ipoib.rules.in" DESTINATION "${CMAKE_INSTALL_DOCDIR}" PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ) +install(FILES "rdma_topo" + DESTINATION "${CMAKE_INSTALL_SBINDIR}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") # Create an installed executable (under /usr/lib/udev) diff --git a/kernel-boot/rdma_topo b/kernel-boot/rdma_topo new file mode 100755 index 000000000..eba238b42 --- /dev/null +++ b/kernel-boot/rdma_topo @@ -0,0 +1,769 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Linux-OpenIB +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES +# PYTHON_ARGCOMPLETE_OK +from __future__ import annotations +import argparse +import collections +import importlib +import inspect +import itertools +import os +import re +import subprocess +import sys +import tempfile + +from typing import * + +BDF_RE = re.compile(r"^([0-9a-f]+?):([0-9a-f]{2}?):([0-9a-f]{2}?)\.([0-9a-f])$") +KERNEL_ACS_ISOLATED = "xx111x1" +pci_vendors = { + "MELLANOX": 0x15B3, + "NVIDIA": 0x10DE, +} + + +class CommandError(Exception): + pass + + +def sysfs_read_str(sysdir: str, fn: str) -> str: + """Read the entire content of a sysfs file to a string""" + with open(os.path.join(sysdir, fn)) as F: + return F.read().strip() + + +def sysfs_read_link(sysdir: str, fn: str) -> str: + """Read a link in sysfs to an absolute path string""" + return os.readlink(os.path.join(sysdir, fn)) + + +def PCI_VDEVICE(vendor: str, device_id: int) -> re.Pattern: + """Match a Vendor and device ID""" + vendor_id = pci_vendors[vendor] + return re.compile(rf"^pci:v{vendor_id:08X}d{device_id:08X}.*$") + + +def PCI_DEVICE_CLASS(cid: int) -> re.Pattern: + """Match by exact programming class using the int coding from the kernel""" + class_id = (cid >> 16) & 0xFF + subclass_id = (cid >> 8) & 0xFF + progif = cid & 0xFF + return re.compile(rf"^pci:.*bc{class_id:02X}sc{subclass_id:02X}i{progif:02X}.*$") + + +def PCI_NVGPU() -> re.Pattern: + """Match all NVIDIA GPUs""" + vendor_id = pci_vendors["NVIDIA"] + class_id = 0x03 + return re.compile(rf"^pci:v{vendor_id:08X}.*bc{class_id:02X}.*$") + + +# Table of modalias matches to the device_type string +pci_device_types = { + PCI_VDEVICE("NVIDIA", 0x22B1): "grace_rp", # NVIDIA Grace PCI Root Port Bridge + PCI_VDEVICE("NVIDIA", 0x22B2): "grace_rp", # NVIDIA Grace PCI Root Port Bridge + PCI_VDEVICE("NVIDIA", 0x22B8): "grace_rp", # NVIDIA Grace PCI Root Port Bridge + PCI_VDEVICE("MELLANOX", 0x1021): "cx_nic", # ConnectX-7 + PCI_VDEVICE("MELLANOX", 0x1023): "cx_nic", # ConnectX-8 + PCI_VDEVICE("MELLANOX", 0xA2DC): "bf3_nic", # BlueField-3 + PCI_VDEVICE("MELLANOX", 0x2100): "cx_dma", # ConnectX-8 DMA Controller + PCI_VDEVICE("MELLANOX", 0x197B): "bf3_switch", # USP/DSP of a BF3 switch + PCI_VDEVICE("MELLANOX", 0x197C): "cx_switch", # USP/DSP of a CX switch + PCI_DEVICE_CLASS(0x010802): "nvme", + PCI_NVGPU(): "nvgpu", +} + + +class PCIBDF( + collections.namedtuple("PCIBDF", 
["segment", "bus", "device", "function"]) +): + """Bus Device Function for a PCI device""" + + def as_pci(self): + return f"{self.segment}:{self.bus}:{self.device}.{self.function}" + + def __str__(self): + return self.as_pci() + + def __repr__(self): + return f"PCIBDF({self.segment}, {self.bus}, {self.device}, {self.function})" + + +def to_pcibdf(s: str) -> Optional[PCIBDF]: + g = BDF_RE.match(s) + if not g: + return None + return PCIBDF(*g.groups()) + + +class PCIDevice(object): + device_type = "" + vpd_v3: str = None + parent: PCIDevice = None + lspci_data: str = None + + def __init__(self, sysdir: str, bdf: PCIBDF): + self.sysdir = sysdir + self.bdf = bdf + try: + self.iommu_group = int( + os.path.split(sysfs_read_link(sysdir, "iommu_group"))[-1] + ) + except FileNotFoundError: + self.iommu_group = None + + try: + self.numa_node = int(sysfs_read_str(sysdir, "numa_node")) + except FileNotFoundError: + self.numa_node = None + + self.modalias = sysfs_read_str(sysdir, "modalias") + for k, v in pci_device_types.items(): + if k.match(self.modalias): + self.device_type = v + break + + sysdir = os.path.realpath(sysdir) + parent = os.path.basename(os.path.dirname(sysdir)) + self.parent_bdf = to_pcibdf(parent) + self.children: Set[PCIDevice] = set() + + def finish_loading(self): + """Do more expensive parsing operations""" + if self.device_type == "cx_nic" or self.device_type == "cx_dma": + self.vpd_v3 = self.parse_vpd_v3() + if "switch" in self.device_type or self.device_type == "grace_rp": + self.has_acs = self.parse_has_acs() + + def iterdownstream(self) -> Generator[PCIDevice, None, None]: + """Iterate over all downstream devices of this device recursively""" + for pdev in self.children: + yield pdev + yield from pdev.iterdownstream() + + def iterfulltree(self): + for pdev in self.iterupstream_path(): + if not pdev.parent: + yield from pdev.iterdownstream() + + def iterupstream_path(self): + """Iterate over each step along the upstream path from the devices + parent to the root.""" + pdev = self.parent + while pdev: + yield pdev + pdev = pdev.parent + + def __repr__(self): + return f"PCIDevice({self.bdf})" + + def lspci(self): + """Fetch the verbose output of lspci""" + vpdfn = os.path.join(self.sysdir, "vpd") + if os.path.exists(vpdfn) and not os.access(vpdfn, os.R_OK): + raise CommandError( + f"Need access to the PCI VPD information in {vpdfn}, are you root?" 
+ ) + + if not self.lspci_data: + self.lspci_data = subprocess.check_output( + ["lspci", "-s", f"{self.bdf.as_pci()}", "-vv"] + ).decode() + return self.lspci_data + + def parse_vpd_v3(self): + """Use lspci to parse the VPD and get the V3 UUID, this only works as + root on non-secure boot systems.""" + g = re.search( + r"Capabilities: \[.+?\] Vital Product Data$.*Read-only fields:$.*\[V3\] Vendor specific: (.*?)$.*End$", + self.lspci(), + re.DOTALL | re.MULTILINE, + ) + if not g: + return None + return g.group(1) + + def parse_has_acs(self): + """True if the device has an ACS capability""" + return bool( + re.search( + r"Capabilities: \[.+?\] Access Control Services$", + self.lspci(), + re.DOTALL | re.MULTILINE, + ) + ) + + def parse_vpd_name(self): + g = re.search( + r"Capabilities: \[.+?\] Vital Product Data$.*Product Name: (.*?)$.*End$", + self.lspci(), + re.DOTALL | re.MULTILINE, + ) + if not g: + return None + return g.group(1).strip() + + def read_config(self, regname: str): + """Use setpci to read a register""" + return int( + subprocess.check_output( + ["setpci", "-r", "-s", str(self.bdf), "ECAP_ACS+0x6.w"] + ) + .decode() + .strip(), + 16, + ) + + def get_subsystems(self): + """Return a list of subsystem the PCI device is connected to""" + res: Dict[str, Set[str]] = collections.defaultdict(set) + for fn in os.listdir(self.sysdir): + if fn in {"drm", "infiniband", "net", "nvme"}: + res[fn].update(os.listdir(os.path.join(self.sysdir, fn))) + return res + + +class NVCX_Complex(object): + """Hold the related PCI functions together. A complex includes a CX PF, a CX + DMA function, an GPU and related PCI switches in the DMA function + segment.""" + + def __init__(self, cx_pfs: Set[PCIDevice], cx_dma: PCIDevice, nvgpu: PCIDevice): + self.cx_pfs = cx_pfs - {cx_dma} + self.cx_pf = sorted(self.cx_pfs, key=lambda x: x.bdf)[0] + self.cx_dma = cx_dma + self.nvgpu = nvgpu + + # Identify the switch ports that are part of the shared path that + # handles the P2P traffic + self.shared_usp = self.__find_shared_usp() + for pdev in self.cx_dma.iterupstream_path(): + if pdev in self.shared_usp.children: + self.cx_dma_dsp = pdev + for pdev in self.nvgpu.iterupstream_path(): + if pdev in self.shared_usp.children: + self.nvgpu_dsp = pdev + + # There can be a NVMe device connected to the CX NIC as well. For NVMe + # it is best to match with GPUs on the same socket, so a NUMA aware + # approach would be fine, but also the GPU/NIC/NVMe could be + # consistently paired based on the physical layout. 
+ self.nvmes: Set[PCIDevice] = set() + for pdev in self.cx_pf.iterfulltree(): + if pdev.device_type == "nvme": + self.nvmes.add(pdev) + + def __find_shared_usp(self) -> PCIDevice: + """Find the USP that is shared by both devices, the immediate downstream + bus is the point in the topology where P2P traffic will switch from an + upstream to downstream direction.""" + common_path = set(self.cx_dma.iterupstream_path()).intersection( + set(self.nvgpu.iterupstream_path()) + ) + assert common_path + + for pdev in self.cx_dma.iterupstream_path(): + if pdev in common_path: + assert pdev.device_type == "cx_switch" + for i in pdev.children: + assert i.device_type == "cx_switch" + return pdev + + def get_subsystems(self): + subsystems: Dict[str, Set[str]] = collections.defaultdict(set) + for pdev in itertools.chain(self.cx_pfs, [self.nvgpu, self.cx_dma], self.nvmes): + for k, v in pdev.get_subsystems().items(): + subsystems[k].update(v) + return subsystems + + +def check_parent(pdev: PCIDevice, parent_type: str): + if not pdev or not pdev.parent: + return None + if pdev.parent.device_type != parent_type: + return None + return pdev.parent + + +class PCITopo(object): + """Load the PCI topology from sysfs and organize it""" + + def __init__(self): + self.devices = self.__load_devices("/sys/bus/pci/devices/") + self.has_cx_dma = any( + pdev.device_type == "cx_dma" for pdev in self.devices.values() + ) + if self.has_cx_dma: + for pdev in self.devices.values(): + pdev.finish_loading() + self.__build_topo() + + def __load_devices(self, sysdir: str): + res: Dict[PCIBDF, PCIDevice] = {} + for fn in os.listdir(sysdir): + bdf = to_pcibdf(fn) + if not bdf: + continue + assert bdf not in res + res[bdf] = PCIDevice(os.path.join(sysdir, fn), bdf) + return res + + def __get_nvcx_complex(self, cx_dma: PCIDevice): + """Match the topology for the switch complex using a CX DMA function and a + single GPU. 
It has two nested switches: + + RP --> SW -> CX_DMA + -> SW -> GPU + """ + assert cx_dma.device_type == "cx_dma" + if not cx_dma.vpd_v3: + raise ValueError(f"CX DMA function {cx_dma} does not have a VPD V3 UUID") + + # The DMA and PF are matched using the UUID from the VPD + cx_pfs = self.vpd_v3s.get(cx_dma.vpd_v3) + if cx_pfs is None: + raise ValueError( + f"CX DMA function {cx_dma} does not have a matching PF, V3 UUID matching failed" + ) + return None + + # Path from the DMA to the root port + cx_dma_dsp = check_parent(cx_dma, "cx_switch") + cx_usp = check_parent(cx_dma_dsp, "cx_switch") + grace_rp = check_parent(cx_usp, "grace_rp") + if not grace_rp: + raise ValueError( + f"CX DMA function {cx_dma} has an unrecognized upstream path" + ) + + # Path from the GPU to the root port + nvgpus = [ + pdev for pdev in grace_rp.iterdownstream() if pdev.device_type == "nvgpu" + ] + if len(nvgpus) != 1: + raise ValueError(f"CX DMA function {cx_dma} does not have a nearby GPU") + nvgpu = nvgpus[0] + nvgpu_dsp2 = check_parent(nvgpu, "cx_switch") + nvgpu_usp2 = check_parent(nvgpu_dsp2, "cx_switch") + nvgpu_dsp1 = check_parent(nvgpu_usp2, "cx_switch") + if cx_usp != check_parent(nvgpu_dsp1, "cx_switch"): + raise ValueError( + f"CX DMA function {cx_dma} has an unrecognized upstream path from the GPU" + ) + + # Sanity check there is nothing unexpected in the topology + alldevs = { + cx_dma, + cx_dma_dsp, + cx_usp, + nvgpu, + nvgpu_dsp2, + nvgpu_usp2, + nvgpu_dsp1, + } + topodevs = set(grace_rp.iterdownstream()) + if alldevs != topodevs: + raise ValueError( + f"CX DMA function {cx_dma} has unexpected PCI devices in the topology" + ) + return NVCX_Complex(cx_pfs, cx_dma, nvgpu) + + def __build_topo(self): + """Collect cross-device information together and build the NVCX_Complex + objects for the cx_dma functions""" + self.vpd_v3s: Dict[str, Set[PCIDevice]] = collections.defaultdict(set) + for pdev in self.devices.values(): + if pdev.parent_bdf: + pdev.parent = self.devices[pdev.parent_bdf] + pdev.parent.children.add(pdev) + + # Many PCI functions may share the same V3 + if pdev.vpd_v3: + self.vpd_v3s[pdev.vpd_v3].add(pdev) + + self.nvcxs: List[NVCX_Complex] = [] + for pdev in self.devices.values(): + if pdev.device_type == "cx_dma": + nvcx = self.__get_nvcx_complex(pdev) + self.nvcxs.append(nvcx) + self.nvcxs.sort(key=lambda x: x.cx_pf.bdf) + + def compute_acs(self): + """Return a dictionary of PCI devices and the ACS mask the device should + have""" + acs: Dict[PCIDevice, str] = {} + for nvcx in self.nvcxs: + # For the DSP in the shared switch toward the CX8 DMA Direct interface: + # Enable these bits: + # bit-4 : ACS Upstream Forwarding + # bit-3 : ACS P2P Completion Redirect + # bit-0 : ACS Source Validation + # Disable these bits: + # bit-2 : ACS P2P Request Redirect + assert nvcx.cx_dma_dsp.has_acs + acs[nvcx.cx_dma_dsp] = "xx110x1" + + # For the DSP in the shared switch toward the GPU: + # Enable the following bits: + # bit-4 : ACS Upstream Forwarding + # bit-2 : ACS P2P Request Redirect + # bit-0 : ACS Source Validation + # Disable the following bits: + # bit-3 : ACS P2P Completion Redirect + assert nvcx.nvgpu_dsp.has_acs + acs[nvcx.nvgpu_dsp] = "xx101x1" + + # Disable ACS SV on the root port, this forces the entire segment + # into one iommu_group and avoids kernel bugs building groups for + # irregular ACS. 
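+            # The "xx111x0" mask used for the root port below forces UF
+            # (bit-4), CR (bit-3) and RR (bit-2) on, matching the kernel
+            # default, while forcing Source Validation (bit-0) off. The 'x'
+            # positions leave the current register value untouched when the
+            # mask is applied by combine_acs().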
+ for pdev in nvcx.cx_dma_dsp.iterupstream_path(): + if not pdev.parent: + assert pdev.has_acs + acs[pdev] = "xx111x0" + + # For all other CX bridges set kernel's default ACS enable + # Enable these bits: + # bit-4 : ACS Upstream Forwarding + # bit-3 : ACS P2P Completion Redirect + # bit-2 : ACS P2P Request Redirect + # bit-0 : ACS Source Validation + # Which match the kernel default + for pdev in self.devices.values(): + if ( + pdev not in acs + and ("switch" in pdev.device_type or "grace_rp" in pdev.device_type) + and pdev.has_acs + ): + acs[pdev] = KERNEL_ACS_ISOLATED + return acs + + +# ------------------------------------------------------------------- +def print_list(title: str, items: list[str]): + if not items: + return + if len(items) > 1: + title = title + "s" + list_str = ", ".join(sorted(items)) + print(f"\t{title}: {list_str}") + + +def args_topology(parser): + parser.add_argument( + "-j", + "--json", + action="store_true", + dest="json", + help="Output in machine readable JSON format", + ) + + +def topo_json(topo: PCITopo): + import json + + jtop = [] + for nvcx in topo.nvcxs: + jnvcx = { + "rdma_nic_pf_bdf": str(nvcx.cx_pf.bdf), + "rdma_dma_bdf": str(nvcx.cx_dma.bdf), + "gpu_bdf": str(nvcx.nvgpu.bdf), + "subsystems": {}, + } + devname = nvcx.cx_pf.parse_vpd_name() + if devname: + jnvcx["rdma_nic_vpd_name"] = nvcx.cx_pf.parse_vpd_name() + if nvcx.cx_pf.numa_node is not None: + jnvcx["numa_node"] = nvcx.cx_pf.numa_node + if nvcx.nvmes: + jnvcx["nvme_bdf"] = str(next(iter(nvcx.nvmes)).bdf) + + for pdev in sorted( + itertools.chain(nvcx.cx_pfs, [nvcx.nvgpu, nvcx.cx_dma], nvcx.nvmes), + key=lambda x: x.bdf, + ): + subsys = pdev.get_subsystems() + if subsys: + jnvcx["subsystems"][str(pdev.bdf)] = { + subsys: list(devs) for subsys, devs in subsys.items() + } + jtop.append(jnvcx) + print(json.dumps(jtop, indent=4)) + + +def cmd_topology(args): + """List the ConnectX NICs in the system with the corresponding NIC + function, DMA Direct function and associated GPU.""" + topo = PCITopo() + if not topo.has_cx_dma: + raise CommandError("No ConnectX DMA Direct functions detected") + + if args.json: + return topo_json(topo) + + for nvcx in topo.nvcxs: + print( + f"RDMA NIC={nvcx.cx_pf.bdf}, GPU={nvcx.nvgpu.bdf}, RDMA DMA Function={nvcx.cx_dma.bdf}" + ) + + devname = nvcx.cx_pf.parse_vpd_name() + if devname: + print(f"\t{devname}") + + if nvcx.cx_pf.numa_node is not None: + print(f"\tNUMA Node: {nvcx.cx_pf.numa_node}") + + if len(nvcx.cx_pfs): + print_list("NIC PCI device", [str(I.bdf) for I in nvcx.cx_pfs]) + + subsystems = nvcx.get_subsystems() + print_list("RDMA device", subsystems["infiniband"]) + print_list("Net device", subsystems["net"]) + print_list("DRM device", subsystems["drm"]) + print_list("NVMe device", subsystems["nvme"]) +cmd_topology.__aliases__ = ("topo",) + +# ------------------------------------------------------------------- +def update_file(fn: str, new_content: str): + """Make fn have new_content. 
If fn already has new_content nothing is + done.""" + try: + with open(fn, "rt") as F: + old = F.read() + if old == new_content: + return False + except FileNotFoundError: + pass + with tempfile.NamedTemporaryFile(dir=os.path.dirname(fn), mode="wt") as F: + F.write(new_content) + F.flush() + os.chmod(F.name, 0o644) + try: + os.link(F.name, fn) + except FileExistsError: + os.unlink(fn) + os.link(F.name, fn) + return True + + +def args_write_grub_acs(parser): + parser.add_argument( + "-n", + "--dry-run", + action="store_true", + dest="dry_run", + help="Output the grub configuration to stdout and make no changes", + ) + parser.add_argument( + "--output", + action="store", + default="/etc/default/grub.d/config-acs.cfg", + help="Grub dropin file to use for the kernel command line", + ) + + +def cmd_write_grub_acs(args): + """Generate a grub dropin file to have the kernel commandline set the + required ACS flags during system boot. This is the recommended way to + configure ACS on systems but requires a compatible kernel. + + If the system does not have any need of ACS flags the dropin file will be + removed. This command is intended for Debian style systems with a + /etc/default/grub.d and update-grub command.""" + topo = PCITopo() + if not topo.has_cx_dma: + if args.dry_run: + raise CommandError("No ConnectX DMA Direct functions detected") + if os.path.exists(args.output): + os.unlink(args.output) + return + + acs = topo.compute_acs() + config_acs = [ + f"{acs}@{pdev.bdf}" + for pdev, acs in sorted(acs.items(), key=lambda x: x[0].bdf) + if acs != KERNEL_ACS_ISOLATED + ] + acs_arg = ";".join(config_acs) + grub_conf = [ + f"# Generated by {sys.argv[0]} do not change. ACS settings for RDMA GPU Direct", + f'GRUB_CMDLINE_LINUX="$GRUB_CMDLINE_LINUX pci=config_acs=\\"{acs_arg}\\""', + ] + grub_conf = "\n".join(grub_conf) + + if args.dry_run: + print(grub_conf) + return + + try: + os.makedirs(os.path.dirname(args.output)) + except FileExistsError: + pass + if update_file(args.output, grub_conf + "\n"): + subprocess.check_call(["update-grub"]) + + +# ------------------------------------------------------------------- +def combine_acs(cur_acs, new_acs): + for idx, val in enumerate(new_acs[::-1]): + if val == "1": + cur_acs = cur_acs | (1 << idx) + elif val == "0": + cur_acs = cur_acs & (0xFFFF ^ (1 << idx)) + return cur_acs + + +def args_setpci_acs(parser): + parser.add_argument( + "-n", + "--dry-run", + action="store_true", + dest="dry_run", + help="Output the setpci commands to stdout and make no changes", + ) + + +def cmd_setpci_acs(args): + """Execute a series of set_pci commands that will immediately change the ACS + settings to the required values. This is compatible with older kernels, but + is not recommended. The kernel must boot with ACS enabled and the GPU driver + must have the NVreg_GrdmaPciTopoCheckOverride=1 reg key set to disable + safety checks that old kernels cannot support. + + NOTE: In this configuration unprivileged userspace can trigger platform RAS + failures, use with caution! 
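+
+    For example, a dry run (-n) prints one setpci command for each device
+    whose ACS control register needs to change, in the form:
+
+        setpci -r -s 0009:01:00.0 ECAP_ACS+0x6.w=0019
+
+    where the BDF and register value above are purely illustrative; the real
+    values are computed from the local PCI topology.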
+ """ + topo = PCITopo() + acs = topo.compute_acs() + cmds: List[List[str]] = [] + for pdev, acs in sorted(acs.items(), key=lambda x: x[0].bdf): + cur_acs = pdev.read_config("ECAP_ACS+0x6.w") + new_acs = combine_acs(cur_acs, acs) + if new_acs == cur_acs: + continue + + cmd = ["setpci", "-r", "-s", str(pdev.bdf), f"ECAP_ACS+0x6.w={new_acs:04x}"] + cmds.append(cmd) + if args.dry_run: + for cmd in cmds: + print(" ".join(cmd)) + return + for cmd in cmds: + subprocess.check_call(cmd) + + +# ------------------------------------------------------------------- +def args_check(parser): + pass + + +def check_ok(msg: str): + print(f"OK\t{msg}") + + +def check_fail(msg: str): + print(f"FAIL\t{msg}") + sys.exit(100) + + +def cmd_check(args): + """Check that the running kernel and PCI environment are setup correctly for + GPU Direct with ConnectX DMA Direct PCI functions.""" + topo = PCITopo() + if not topo.has_cx_dma: + raise CommandError("No ConnectX DMA Direct functions detected") + check_ok("All ConnectX DMA functions have correct PCI topology") + + acs = topo.compute_acs() + for pdev, acs in sorted(acs.items(), key=lambda x: x[0].bdf): + cur_acs = pdev.read_config("ECAP_ACS+0x6.w") + new_acs = combine_acs(cur_acs, acs) + if new_acs == cur_acs: + check_ok( + f"ACS for {pdev.device_type} {pdev.bdf} has correct values {cur_acs:07b} = {acs}" + ) + else: + check_fail( + f"ACS for {pdev.device_type} {pdev.bdf} has incorrect values {cur_acs:07b} != {acs}, (0x{cur_acs:x} != 0x{new_acs:x})" + ) + + # Correct iommu_groups are required to avoid NVreg_GrdmaPciTopoCheckOverride + for nvcx in topo.nvcxs: + if ( + nvcx.cx_dma.iommu_group == nvcx.nvgpu.iommu_group + and nvcx.cx_dma.iommu_group is not None + ): + check_ok( + f"Kernel iommu_group for DMA {nvcx.cx_dma.bdf} and GPU {nvcx.nvgpu.bdf} are both {nvcx.cx_dma.iommu_group}" + ) + else: + check_fail( + f"Kernel iommu_group for DMA {nvcx.cx_dma.bdf} and GPU {nvcx.nvgpu.bdf} are not equal {nvcx.cx_dma.iommu_group} != {nvcx.nvgpu.iommu_group}" + ) + + +# ------------------------------------------------------------------- +def load_all_commands(name): + module = importlib.import_module(name) + for k in dir(module): + fn = getattr(module, k) + argsfn = getattr(module, "args_" + k[4:], None) + if argsfn is None or not k.startswith("cmd_") or not inspect.isfunction(fn): + continue + yield (k, fn, argsfn) + + +def get_cmd_aliases(fn): + if hasattr(fn, "__aliases__"): + return fn.__aliases__ + return () + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""NVIDIA ConnectX GPU Direct ACS tool for Direct NIC platforms + +This tool is used to view and control the PCI Access Control Flags (ACS) related +to the Direct NIC topology on supported NVIDIA platforms with ConnectX and +Blackwell family GPUs. + +Direct NIC platforms have a unique multipath PCI topology where the ConnectX +has a main PCI function and a related DMA Direct function linked to the GPU. + +This platform requires specific ACS flags in the PCI topology for reliable +operation, this tool helps users generate ACS settings for the local system. 
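+
+Typical usage:
+
+    rdma_topo topology          Show the NIC, DMA function and GPU groupings
+    rdma_topo check             Verify the ACS settings and iommu_groups
+    rdma_topo write-grub-acs    Generate a grub dropin with the required ACS settings
+    rdma_topo setpci-acs -n     Show the setpci commands that would apply the ACS settings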
+""", + ) + subparsers = parser.add_subparsers(title="Sub Commands", dest="command") + subparsers.required = True + + commands = [I for I in load_all_commands(__name__)] + commands.sort() + + # build sub parsers for all the loaded commands + for k, fn, argsfn in commands: + sparser = subparsers.add_parser( + k[4:].replace("_", "-"), aliases=get_cmd_aliases(fn), help=fn.__doc__ + ) + sparser.required = True + argsfn(sparser) + sparser.set_defaults(func=fn) + + try: + import argcomplete + + argcomplete.autocomplete(parser) + except ImportError: + pass + + # argparse will set 'func' to the cmd_* that executes this command + args = parser.parse_args() + try: + args.func(args) + except CommandError as e: + print(f"E: {e}") + sys.exit(100) + + +main() diff --git a/redhat/rdma-core.spec b/redhat/rdma-core.spec index a57743d12..e3b07f13c 100644 --- a/redhat/rdma-core.spec +++ b/redhat/rdma-core.spec @@ -446,6 +446,7 @@ fi %{_libexecdir}/truescale-serdes.cmds %{_sbindir}/rdma-ndd %{_unitdir}/rdma-ndd.service +%{_sbindir}/rdma_topo %{_mandir}/man7/rxe* %{_mandir}/man8/rdma-ndd.* %license COPYING.* diff --git a/suse/rdma-core.spec b/suse/rdma-core.spec index 869fa2580..aa8df4a5f 100644 --- a/suse/rdma-core.spec +++ b/suse/rdma-core.spec @@ -663,6 +663,7 @@ done %{_modprobedir}/50-libmlx4.conf %{_libexecdir}/mlx4-setup.sh %{_libexecdir}/truescale-serdes.cmds +%{_sbindir}/rdma_topo %license COPYING.* %if 0%{?suse_version} < 1600 %{_sbindir}/rcrdma