@@ -49,14 +49,25 @@ type deviceInfo struct {
4949 ResourceName string
5050}
5151
// deviceInfoCacheKey is the composite map key used by deviceInfoCache. Two keys
// are equal only when both the driver/pool string and the device name match.
type deviceInfoCacheKey struct {
	// driverPool is the driver name and pool name joined as "driver/pool".
	driverPool string
	// deviceName is the name of the device within that driver/pool.
	deviceName string
}
57+
5258type ClientInterface interface {
53- GetPodResourceMap (pod * v1.Pod , resourceMap map [string ]* types.ResourceInfo ) error
59+ GetPodResourceMap (ctx context. Context , pod * v1.Pod , resourceMap map [string ]* types.ResourceInfo ) error
5460}
5561
5662type draClient struct {
5763 client resourcev1.ResourceV1Interface
58- // One driver/pool may span multiple ResourceSlice objects (e.g. per NUMA zone).
59- resourceSliceCache map [string ][]* resourcev1api.ResourceSlice
64+ // deviceInfoCache stores lightweight device attributes extracted from ResourceSlices.
65+ // Keys are (driver/pool, deviceName); only the two attributes Multus reads are kept,
66+ // so full ResourceSlice objects (~400KB each) are GC'd immediately after listing.
67+ deviceInfoCache map [deviceInfoCacheKey ]* deviceInfo
68+ // populatedDrivers tracks which "nodeName/driverName" combinations have already been
69+ // fetched from the API, preventing redundant List calls within a client's lifetime.
70+ populatedDrivers map [string ]bool
6071 // Keys are namespace/claimName (ResourceClaim is namespaced).
6172 resourceClaimCache map [string ]* resourcev1api.ResourceClaim
6273}
@@ -65,17 +76,20 @@ func NewClient(client resourcev1.ResourceV1Interface) ClientInterface {
6576 logging .Debugf ("NewClient: creating new DRA client" )
6677 return & draClient {
6778 client : client ,
68- resourceSliceCache : make (map [string ][]* resourcev1api.ResourceSlice ),
79+ deviceInfoCache : make (map [deviceInfoCacheKey ]* deviceInfo ),
80+ populatedDrivers : make (map [string ]bool ),
6981 resourceClaimCache : make (map [string ]* resourcev1api.ResourceClaim ),
7082 }
7183}
7284
73- func (d * draClient ) GetPodResourceMap (pod * v1.Pod , resourceMap map [string ]* types.ResourceInfo ) error {
85+ func (d * draClient ) GetPodResourceMap (ctx context. Context , pod * v1.Pod , resourceMap map [string ]* types.ResourceInfo ) error {
7486 logging .Verbosef ("GetPodResourceMap: processing DRA resources for pod %s/%s" , pod .Namespace , pod .Name )
7587
76- ctx , cancel := context .WithTimeout (context . Background () , 20 * time .Second )
88+ ctx , cancel := context .WithTimeout (ctx , 20 * time .Second )
7789 defer cancel ()
7890
91+ nodeName := pod .Spec .NodeName
92+
7993 for _ , claimResource := range pod .Status .ResourceClaimStatuses {
8094 if claimResource .ResourceClaimName == nil {
8195 logging .Errorf ("GetPodResourceMap: resource claim status has nil ResourceClaimName" )
@@ -111,7 +125,7 @@ func (d *draClient) GetPodResourceMap(pod *v1.Pod, resourceMap map[string]*types
111125 logging .Debugf ("GetPodResourceMap: processing device allocation - driver: %s, pool: %s, device: %s, request: %s" ,
112126 result .Driver , result .Pool , result .Device , result .Request )
113127
114- info , err := d .getDeviceInfo (ctx , result )
128+ info , err := d .getDeviceInfo (ctx , nodeName , result )
115129 if err != nil {
116130 if errors .Is (err , errDeviceNotInAnySlice ) {
117131 logging .Warningf (
@@ -151,7 +165,7 @@ func (d *draClient) GetPodResourceMap(pod *v1.Pod, resourceMap map[string]*types
151165 }
152166
153167 if pod .Status .ExtendedResourceClaimStatus != nil {
154- if err := d .processExtendedResourceClaimStatus (ctx , pod , resourceMap ); err != nil {
168+ if err := d .processExtendedResourceClaimStatus (ctx , nodeName , pod , resourceMap ); err != nil {
155169 return err
156170 }
157171 }
@@ -164,7 +178,7 @@ func (d *draClient) GetPodResourceMap(pod *v1.Pod, resourceMap map[string]*types
164178// processExtendedResourceClaimStatus fills the resource map for pods that use
165179// the extended resource feature gate (pod.Status.ExtendedResourceClaimStatus).
166180// Keys come from requestMappings[].resourceName (same as NAD annotation).
167- func (d * draClient ) processExtendedResourceClaimStatus (ctx context.Context , pod * v1.Pod , resourceMap map [string ]* types.ResourceInfo ) error {
181+ func (d * draClient ) processExtendedResourceClaimStatus (ctx context.Context , nodeName string , pod * v1.Pod , resourceMap map [string ]* types.ResourceInfo ) error {
168182 extStatus := pod .Status .ExtendedResourceClaimStatus
169183 claimName := extStatus .ResourceClaimName
170184 claimCacheKey := namespacedClaimCacheKey (pod .Namespace , claimName )
@@ -201,7 +215,7 @@ func (d *draClient) processExtendedResourceClaimStatus(ctx context.Context, pod
201215
202216 resourceMapKey := mapping .ResourceName
203217 for _ , result := range results {
204- info , err := d .getDeviceInfo (ctx , result )
218+ info , err := d .getDeviceInfo (ctx , nodeName , result )
205219 if err != nil {
206220 logging .Errorf ("GetPodResourceMap: failed to get device info for extended resource claim %s request %s: %v" , claimName , mapping .RequestName , err )
207221 return err
@@ -234,82 +248,79 @@ func (d *draClient) processExtendedResourceClaimStatus(ctx context.Context, pod
234248 return nil
235249}
236250
237- func (d * draClient ) getDeviceInfo (ctx context.Context , result resourcev1api.DeviceRequestAllocationResult ) (* deviceInfo , error ) {
238- key := fmt .Sprintf ("%s/%s" , result .Driver , result .Pool )
239- logging .Debugf ("getDeviceInfo: looking up device for driver/pool: %s, device: %s" , key , result .Device )
251+ // ensureDriverCachePopulated lists ResourceSlices for the given node and driver (using server-side
252+ // field selectors) and extracts only the two attributes Multus needs into deviceInfoCache.
253+ // Full ResourceSlice objects are discarded after extraction, keeping memory usage minimal.
254+ // Subsequent calls for the same node/driver combination are no-ops.
255+ func (d * draClient ) ensureDriverCachePopulated (ctx context.Context , nodeName , driverName string ) error {
256+ populatedKey := nodeName + "/" + driverName
257+ if d .populatedDrivers [populatedKey ] {
258+ return nil
259+ }
240260
241- resourceSlices , ok := d .resourceSliceCache [key ]
242- if ! ok {
243- logging .Debugf ("getDeviceInfo: resource slices for %s not in cache, fetching from API" , key )
244- // TODO: Use server-side field selector once spec.driver is supported by the API.
245- // Currently, ResourceSlice does not support field selection on spec.driver,
246- // requiring client-side filtering which may impact performance in very large clusters.
247- listOptions := metav1.ListOptions {}
248- allResourceSlices , err := d .client .ResourceSlices ().List (ctx , listOptions )
249- if err != nil {
250- logging .Errorf ("getDeviceInfo: failed to list resource slices: %v" , err )
251- return nil , err
252- }
261+ listOptions := metav1.ListOptions {}
262+ if nodeName != "" && driverName != "" {
263+ listOptions .FieldSelector = fmt .Sprintf ("spec.nodeName=%s,spec.driver=%s" , nodeName , driverName )
264+ } else if nodeName != "" {
265+ listOptions .FieldSelector = fmt .Sprintf ("spec.nodeName=%s" , nodeName )
266+ }
253267
254- var matchingSlices []* resourcev1api.ResourceSlice
255- for i := range allResourceSlices .Items {
256- slice := & allResourceSlices .Items [i ]
257- if slice .Spec .Driver == result .Driver && slice .Spec .Pool .Name == result .Pool {
258- matchingSlices = append (matchingSlices , slice )
268+ logging .Debugf ("ensureDriverCachePopulated: listing ResourceSlices (fieldSelector=%q)" , listOptions .FieldSelector )
269+ slices , err := d .client .ResourceSlices ().List (ctx , listOptions )
270+ if err != nil {
271+ logging .Errorf ("ensureDriverCachePopulated: failed to list resource slices: %v" , err )
272+ return err
273+ }
274+ logging .Debugf ("ensureDriverCachePopulated: listed %d ResourceSlice(s) for node=%q driver=%q" , len (slices .Items ), nodeName , driverName )
275+
276+ for i := range slices .Items {
277+ slice := & slices .Items [i ]
278+ driverPool := fmt .Sprintf ("%s/%s" , slice .Spec .Driver , slice .Spec .Pool .Name )
279+ for _ , device := range slice .Spec .Devices {
280+ key := deviceInfoCacheKey {driverPool : driverPool , deviceName : device .Name }
281+ info := & deviceInfo {}
282+ if attr , ok := device .Attributes [multusDeviceIDAttr ]; ok && attr .StringValue != nil {
283+ info .DeviceID = * attr .StringValue
284+ }
285+ if attr , ok := device .Attributes [multusResourceNameAttr ]; ok && attr .StringValue != nil {
286+ info .ResourceName = * attr .StringValue
287+ }
288+ if info .DeviceID != "" {
289+ d .deviceInfoCache [key ] = info
259290 }
260291 }
261-
262- if len (matchingSlices ) == 0 {
263- listErr := fmt .Errorf ("no resource slice found for driver/pool %s" , key )
264- logging .Errorf ("getDeviceInfo: %v" , listErr )
265- return nil , listErr
266- }
267- resourceSlices = matchingSlices
268- d .resourceSliceCache [key ] = resourceSlices
269- logging .Debugf ("getDeviceInfo: cached %d resource slices for %s" , len (resourceSlices ), key )
270- } else {
271- logging .Debugf ("getDeviceInfo: using cached %d resource slices for %s" , len (resourceSlices ), key )
272292 }
273293
274- for _ , resourceSlice := range resourceSlices {
275- logging .Debugf ("getDeviceInfo: searching for device %s in slice %s with %d devices" , result .Device , resourceSlice .Name , len (resourceSlice .Spec .Devices ))
276- for _ , device := range resourceSlice .Spec .Devices {
277- if device .Name != result .Device {
278- continue
279- }
280- logging .Debugf ("getDeviceInfo: found device %s, checking attributes" , device .Name )
294+ d .populatedDrivers [populatedKey ] = true
295+ return nil
296+ }
281297
282- devIDAttr , exists := device .Attributes [multusDeviceIDAttr ]
283- if ! exists {
284- logging .Warningf (
285- "getDeviceInfo: device %q (driver %q, pool %q) has no %q in ResourceSlice; skipping allocation result" ,
286- device .Name , result .Driver , result .Pool , multusDeviceIDAttr )
287- return nil , fmt .Errorf ("%w: device %q present in slice but missing %q" ,
288- errDeviceNotInAnySlice , device .Name , multusDeviceIDAttr )
289- }
298+ func (d * draClient ) getDeviceInfo (ctx context.Context , nodeName string , result resourcev1api.DeviceRequestAllocationResult ) (* deviceInfo , error ) {
299+ driverPool := fmt .Sprintf ("%s/%s" , result .Driver , result .Pool )
300+ logging .Debugf ("getDeviceInfo: looking up device for driver/pool: %s, device: %s" , driverPool , result .Device )
290301
291- if devIDAttr .StringValue == nil {
292- logging .Warningf (
293- "getDeviceInfo: device %q (driver %q, pool %q) has %q with nil StringValue; skipping allocation result" ,
294- device .Name , result .Driver , result .Pool , multusDeviceIDAttr )
295- return nil , fmt .Errorf ("%w: device %q has nil StringValue for %q" ,
296- errDeviceNotInAnySlice , device .Name , multusDeviceIDAttr )
297- }
298- info := & deviceInfo {DeviceID : * devIDAttr .StringValue }
302+ if err := d .ensureDriverCachePopulated (ctx , nodeName , result .Driver ); err != nil {
303+ return nil , err
304+ }
299305
300- if resNameAttr , ok := device .Attributes [multusResourceNameAttr ]; ok && resNameAttr .StringValue != nil {
301- info .ResourceName = * resNameAttr .StringValue
302- logging .Debugf ("getDeviceInfo: device %s has %s %s" , device .Name , multusResourceNameAttr , info .ResourceName )
303- }
306+ key := deviceInfoCacheKey {driverPool : driverPool , deviceName : result .Device }
307+ info , ok := d .deviceInfoCache [key ]
308+ if ! ok {
309+ notFoundErr := fmt .Errorf ("%w: device %s not found for claim resource %s/%s in any matching resource slice" ,
310+ errDeviceNotInAnySlice , result .Device , result .Driver , result .Pool )
311+ logging .Errorf ("getDeviceInfo: %v" , notFoundErr )
312+ return nil , notFoundErr
313+ }
304314
305- logging .Verbosef ("getDeviceInfo: successfully retrieved info for device %s (driver/pool: %s): deviceID=%s, resourceName=%s" ,
306- result .Device , key , info .DeviceID , info .ResourceName )
307- return info , nil
308- }
315+ if info .DeviceID == "" {
316+ logging .Warningf (
317+ "getDeviceInfo: device %q (driver %q, pool %q) has no %q in ResourceSlice; skipping allocation result" ,
318+ result .Device , result .Driver , result .Pool , multusDeviceIDAttr )
319+ return nil , fmt .Errorf ("%w: device %q present in slice but missing %q" ,
320+ errDeviceNotInAnySlice , result .Device , multusDeviceIDAttr )
309321 }
310322
311- notFoundErr := fmt .Errorf ("%w: device %s not found for claim resource %s/%s in any matching resource slice" ,
312- errDeviceNotInAnySlice , result .Device , result .Driver , result .Pool )
313- logging .Errorf ("getDeviceInfo: %v" , notFoundErr )
314- return nil , notFoundErr
323+ logging .Verbosef ("getDeviceInfo: successfully retrieved info for device %s (driver/pool: %s): deviceID=%s, resourceName=%s" ,
324+ result .Device , driverPool , info .DeviceID , info .ResourceName )
325+ return info , nil
315326}
0 commit comments