1
16
17 package memorymanager
18
19 import (
20 "fmt"
21 "reflect"
22 "sort"
23
24 cadvisorapi "github.com/google/cadvisor/info/v1"
25
26 v1 "k8s.io/api/core/v1"
27 "k8s.io/apimachinery/pkg/api/resource"
28 utilfeature "k8s.io/apiserver/pkg/util/feature"
29 "k8s.io/klog/v2"
30 podutil "k8s.io/kubernetes/pkg/api/v1/pod"
31 corehelper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
32 v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
33 "k8s.io/kubernetes/pkg/features"
34 "k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
35 "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
36 "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
37 "k8s.io/kubernetes/pkg/kubelet/metrics"
38 "k8s.io/kubernetes/pkg/kubelet/types"
39 )
40
// policyTypeStatic is the name reported by the static memory manager policy.
const policyTypeStatic policyType = "Static"

// systemReservedMemory holds system reserved memory amounts, keyed first by an
// integer node ID and then by resource name.
type systemReservedMemory map[int]map[v1.ResourceName]uint64

// reusableMemory holds, per pod UID and per NUMA affinity bitmask string, the
// amount of memory of each resource type that can be reused.
type reusableMemory map[string]map[string]map[v1.ResourceName]uint64
45
46
// staticPolicy is the implementation of the Policy interface whose name is
// policyTypeStatic.
type staticPolicy struct {
	// machineInfo describes the machine; its Topology is used to build the
	// default machine state.
	machineInfo *cadvisorapi.MachineInfo
	// systemReserved is the memory reserved for the system, per node and resource.
	systemReserved systemReservedMemory
	// affinity is the topology manager store queried for a container's hint.
	affinity topologymanager.Store
	// initContainersReusableMemory tracks memory allocated to regular init
	// containers that later containers of the same pod may reuse.
	// Restartable init containers are never recorded here.
	initContainersReusableMemory reusableMemory
}

// Compile-time check that staticPolicy satisfies the Policy interface.
var _ Policy = &staticPolicy{}
62
63
64 func NewPolicyStatic(machineInfo *cadvisorapi.MachineInfo, reserved systemReservedMemory, affinity topologymanager.Store) (Policy, error) {
65 var totalSystemReserved uint64
66 for _, node := range reserved {
67 if _, ok := node[v1.ResourceMemory]; !ok {
68 continue
69 }
70 totalSystemReserved += node[v1.ResourceMemory]
71 }
72
73
74 if totalSystemReserved <= 0 {
75 return nil, fmt.Errorf("[memorymanager] you should specify the system reserved memory")
76 }
77
78 return &staticPolicy{
79 machineInfo: machineInfo,
80 systemReserved: reserved,
81 affinity: affinity,
82 initContainersReusableMemory: reusableMemory{},
83 }, nil
84 }
85
// Name returns the policy name, i.e. the string form of policyTypeStatic.
func (p *staticPolicy) Name() string {
	return string(policyTypeStatic)
}
89
90 func (p *staticPolicy) Start(s state.State) error {
91 if err := p.validateState(s); err != nil {
92 klog.ErrorS(err, "Invalid state, please drain node and remove policy state file")
93 return err
94 }
95 return nil
96 }
97
98
// Allocate reserves memory for the given container and records the resulting
// memory blocks and machine state in s.
//
// Pods that are not in the Guaranteed QoS class are ignored. The call is a
// no-op (apart from refreshing the reusable memory bookkeeping) when the
// container already has memory blocks in the state. The named result rerr
// exists so the deferred function can count failed pinning requests.
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
	// allocate the memory only for guaranteed pods
	if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
		return nil
	}

	podUID := string(pod.UID)
	klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
	// every request is counted; errors are counted on the way out
	metrics.MemoryManagerPinningRequestTotal.Inc()
	defer func() {
		if rerr != nil {
			metrics.MemoryManagerPinningErrorsTotal.Inc()
		}
	}()
	if blocks := s.GetMemoryBlocks(podUID, container.Name); blocks != nil {
		p.updatePodReusableMemory(pod, container, blocks)

		klog.InfoS("Container already present in state, skipping", "pod", klog.KObj(pod), "containerName", container.Name)
		return nil
	}

	// ask the topology manager store for the affinity stored for this container
	hint := p.affinity.GetAffinity(podUID, container.Name)
	klog.InfoS("Got topology affinity", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name, "hint", hint)

	requestedResources, err := getRequestedResources(pod, container)
	if err != nil {
		return err
	}

	machineState := s.GetMachineState()
	bestHint := &hint
	// the stored hint carries no NUMA affinity, so fall back to a hint
	// calculated by this policy from the current machine state
	if hint.NUMANodeAffinity == nil {
		defaultHint, err := p.getDefaultHint(machineState, pod, requestedResources)
		if err != nil {
			return err
		}

		if !defaultHint.Preferred && bestHint.Preferred {
			return fmt.Errorf("[memorymanager] failed to find the default preferred hint")
		}
		bestHint = defaultHint
	}

	// the chosen affinity does not have enough free memory for the request,
	// so look for a hint that contains the current one and does satisfy it
	if !isAffinitySatisfyRequest(machineState, bestHint.NUMANodeAffinity, requestedResources) {
		extendedHint, err := p.extendTopologyManagerHint(machineState, pod, requestedResources, bestHint.NUMANodeAffinity)
		if err != nil {
			return err
		}

		if !extendedHint.Preferred && bestHint.Preferred {
			return fmt.Errorf("[memorymanager] failed to find the extended preferred hint")
		}
		bestHint = extendedHint
	}

	var containerBlocks []state.Block
	maskBits := bestHint.NUMANodeAffinity.GetBits()
	for resourceName, requestedSize := range requestedResources {
		// the block always records the full requested size
		containerBlocks = append(containerBlocks, state.Block{
			NUMAAffinity: maskBits,
			Size:         requestedSize,
			Type:         resourceName,
		})

		// only the part not covered by reusable init container memory is
		// taken from the machine state
		podReusableMemory := p.getPodReusableMemory(pod, bestHint.NUMANodeAffinity, resourceName)
		if podReusableMemory >= requestedSize {
			requestedSize = 0
		} else {
			requestedSize -= podReusableMemory
		}

		// update the memory state of the nodes in the affinity
		p.updateMachineState(machineState, maskBits, resourceName, requestedSize)
	}

	p.updatePodReusableMemory(pod, container, containerBlocks)

	s.SetMachineState(machineState)
	s.SetMemoryBlocks(podUID, container.Name, containerBlocks)

	// shrink the blocks recorded for earlier regular init containers by the
	// size of the blocks just allocated; a shrunk block may end up with size 0
	p.updateInitContainersMemoryBlocks(s, pod, container, containerBlocks)

	return nil
}
196
197 func (p *staticPolicy) updateMachineState(machineState state.NUMANodeMap, numaAffinity []int, resourceName v1.ResourceName, requestedSize uint64) {
198 for _, nodeID := range numaAffinity {
199 machineState[nodeID].NumberOfAssignments++
200 machineState[nodeID].Cells = numaAffinity
201
202
203 if requestedSize == 0 {
204 continue
205 }
206
207
208 nodeResourceMemoryState := machineState[nodeID].MemoryMap[resourceName]
209 if nodeResourceMemoryState.Free <= 0 {
210 continue
211 }
212
213
214 if nodeResourceMemoryState.Free >= requestedSize {
215 nodeResourceMemoryState.Reserved += requestedSize
216 nodeResourceMemoryState.Free -= requestedSize
217 requestedSize = 0
218 continue
219 }
220
221
222 requestedSize -= nodeResourceMemoryState.Free
223 nodeResourceMemoryState.Reserved += nodeResourceMemoryState.Free
224 nodeResourceMemoryState.Free = 0
225 }
226 }
227
228 func (p *staticPolicy) getPodReusableMemory(pod *v1.Pod, numaAffinity bitmask.BitMask, resourceName v1.ResourceName) uint64 {
229 podReusableMemory, ok := p.initContainersReusableMemory[string(pod.UID)]
230 if !ok {
231 return 0
232 }
233
234 numaReusableMemory, ok := podReusableMemory[numaAffinity.String()]
235 if !ok {
236 return 0
237 }
238
239 return numaReusableMemory[resourceName]
240 }
241
242
243 func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerName string) {
244 blocks := s.GetMemoryBlocks(podUID, containerName)
245 if blocks == nil {
246 return
247 }
248
249 klog.InfoS("RemoveContainer", "podUID", podUID, "containerName", containerName)
250 s.Delete(podUID, containerName)
251
252
253 machineState := s.GetMachineState()
254 for _, b := range blocks {
255 releasedSize := b.Size
256 for _, nodeID := range b.NUMAAffinity {
257 machineState[nodeID].NumberOfAssignments--
258
259
260 if machineState[nodeID].NumberOfAssignments == 0 {
261 machineState[nodeID].Cells = []int{nodeID}
262 }
263
264
265 if releasedSize == 0 {
266 continue
267 }
268
269 nodeResourceMemoryState := machineState[nodeID].MemoryMap[b.Type]
270
271
272 if nodeResourceMemoryState.Reserved == 0 {
273 continue
274 }
275
276
277
278 if nodeResourceMemoryState.Reserved < releasedSize {
279 releasedSize -= nodeResourceMemoryState.Reserved
280 nodeResourceMemoryState.Free += nodeResourceMemoryState.Reserved
281 nodeResourceMemoryState.Reserved = 0
282 continue
283 }
284
285
286 nodeResourceMemoryState.Free += releasedSize
287 nodeResourceMemoryState.Reserved -= releasedSize
288 releasedSize = 0
289 }
290 }
291
292 s.SetMachineState(machineState)
293 }
294
295 func regenerateHints(pod *v1.Pod, ctn *v1.Container, ctnBlocks []state.Block, reqRsrc map[v1.ResourceName]uint64) map[string][]topologymanager.TopologyHint {
296 hints := map[string][]topologymanager.TopologyHint{}
297 for resourceName := range reqRsrc {
298 hints[string(resourceName)] = []topologymanager.TopologyHint{}
299 }
300
301 if len(ctnBlocks) != len(reqRsrc) {
302 klog.ErrorS(nil, "The number of requested resources by the container differs from the number of memory blocks", "containerName", ctn.Name)
303 return nil
304 }
305
306 for _, b := range ctnBlocks {
307 if _, ok := reqRsrc[b.Type]; !ok {
308 klog.ErrorS(nil, "Container requested resources do not have resource of this type", "containerName", ctn.Name, "type", b.Type)
309 return nil
310 }
311
312 if b.Size != reqRsrc[b.Type] {
313 klog.ErrorS(nil, "Memory already allocated with different numbers than requested", "podUID", pod.UID, "type", b.Type, "containerName", ctn.Name, "requestedResource", reqRsrc[b.Type], "allocatedSize", b.Size)
314 return nil
315 }
316
317 containerNUMAAffinity, err := bitmask.NewBitMask(b.NUMAAffinity...)
318 if err != nil {
319 klog.ErrorS(err, "Failed to generate NUMA bitmask")
320 return nil
321 }
322
323 klog.InfoS("Regenerating TopologyHints, resource was already allocated to pod", "resourceName", b.Type, "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", ctn.Name)
324 hints[string(b.Type)] = append(hints[string(b.Type)], topologymanager.TopologyHint{
325 NUMANodeAffinity: containerNUMAAffinity,
326 Preferred: true,
327 })
328 }
329 return hints
330 }
331
332 func getPodRequestedResources(pod *v1.Pod) (map[v1.ResourceName]uint64, error) {
333
334 reqRsrcsByInitCtrs := make(map[v1.ResourceName]uint64)
335
336 reqRsrcsByRestartableInitCtrs := make(map[v1.ResourceName]uint64)
337 for _, ctr := range pod.Spec.InitContainers {
338 reqRsrcs, err := getRequestedResources(pod, &ctr)
339
340 if err != nil {
341 return nil, err
342 }
343 for rsrcName, qty := range reqRsrcs {
344 if _, ok := reqRsrcsByInitCtrs[rsrcName]; !ok {
345 reqRsrcsByInitCtrs[rsrcName] = uint64(0)
346 }
347
348
349
350 if types.IsRestartableInitContainer(&ctr) {
351 reqRsrcsByRestartableInitCtrs[rsrcName] += qty
352 } else if reqRsrcsByRestartableInitCtrs[rsrcName]+qty > reqRsrcsByInitCtrs[rsrcName] {
353 reqRsrcsByInitCtrs[rsrcName] = reqRsrcsByRestartableInitCtrs[rsrcName] + qty
354 }
355 }
356 }
357
358 reqRsrcsByAppCtrs := make(map[v1.ResourceName]uint64)
359 for _, ctr := range pod.Spec.Containers {
360 reqRsrcs, err := getRequestedResources(pod, &ctr)
361
362 if err != nil {
363 return nil, err
364 }
365 for rsrcName, qty := range reqRsrcs {
366 if _, ok := reqRsrcsByAppCtrs[rsrcName]; !ok {
367 reqRsrcsByAppCtrs[rsrcName] = uint64(0)
368 }
369
370 reqRsrcsByAppCtrs[rsrcName] += qty
371 }
372 }
373
374 reqRsrcs := make(map[v1.ResourceName]uint64)
375 for rsrcName := range reqRsrcsByAppCtrs {
376
377 reqRsrcsByLongRunningCtrs := reqRsrcsByAppCtrs[rsrcName] + reqRsrcsByRestartableInitCtrs[rsrcName]
378 reqRsrcs[rsrcName] = reqRsrcsByLongRunningCtrs
379
380 if reqRsrcs[rsrcName] < reqRsrcsByInitCtrs[rsrcName] {
381 reqRsrcs[rsrcName] = reqRsrcsByInitCtrs[rsrcName]
382 }
383 }
384 return reqRsrcs, nil
385 }
386
387 func (p *staticPolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
388 if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
389 return nil
390 }
391
392 reqRsrcs, err := getPodRequestedResources(pod)
393 if err != nil {
394 klog.ErrorS(err, "Failed to get pod requested resources", "pod", klog.KObj(pod), "podUID", pod.UID)
395 return nil
396 }
397
398 for _, ctn := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
399 containerBlocks := s.GetMemoryBlocks(string(pod.UID), ctn.Name)
400
401
402
403 if containerBlocks != nil {
404 return regenerateHints(pod, &ctn, containerBlocks, reqRsrcs)
405 }
406 }
407
408
409 return p.calculateHints(s.GetMachineState(), pod, reqRsrcs)
410 }
411
412
413
414
415 func (p *staticPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
416 if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
417 return nil
418 }
419
420 requestedResources, err := getRequestedResources(pod, container)
421 if err != nil {
422 klog.ErrorS(err, "Failed to get container requested resources", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name)
423 return nil
424 }
425
426 containerBlocks := s.GetMemoryBlocks(string(pod.UID), container.Name)
427
428
429
430 if containerBlocks != nil {
431 return regenerateHints(pod, container, containerBlocks, requestedResources)
432 }
433
434 return p.calculateHints(s.GetMachineState(), pod, requestedResources)
435 }
436
437 func getRequestedResources(pod *v1.Pod, container *v1.Container) (map[v1.ResourceName]uint64, error) {
438 requestedResources := map[v1.ResourceName]uint64{}
439 resources := container.Resources.Requests
440
441
442
443
444 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
445 if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
446 resources = cs.AllocatedResources
447 }
448 }
449 for resourceName, quantity := range resources {
450 if resourceName != v1.ResourceMemory && !corehelper.IsHugePageResourceName(resourceName) {
451 continue
452 }
453 requestedSize, succeed := quantity.AsInt64()
454 if !succeed {
455 return nil, fmt.Errorf("[memorymanager] failed to represent quantity as int64")
456 }
457 requestedResources[resourceName] = uint64(requestedSize)
458 }
459 return requestedResources, nil
460 }
461
// calculateHints builds, for every requested resource, the list of NUMA node
// masks that can hold the request given machineState.
//
// A mask is kept only if it has enough allocatable and enough free (plus pod
// reusable) memory for every requested resource and if it does not conflict
// with the groups the nodes already belong to. A hint is marked preferred when
// its mask has the smallest node count among all masks with enough allocatable
// memory. The same mask list is recorded under every requested resource name.
func (p *staticPolicy) calculateHints(machineState state.NUMANodeMap, pod *v1.Pod, requestedResources map[v1.ResourceName]uint64) map[string][]topologymanager.TopologyHint {
	var numaNodes []int
	for n := range machineState {
		numaNodes = append(numaNodes, n)
	}
	sort.Ints(numaNodes)

	// start from the widest possible affinity: all NUMA nodes
	minAffinitySize := len(numaNodes)

	hints := map[string][]topologymanager.TopologyHint{}
	// presumably the callback runs for every combination of numaNodes — confirm
	// against bitmask.IterateBitMasks
	bitmask.IterateBitMasks(numaNodes, func(mask bitmask.BitMask) {
		maskBits := mask.GetBits()
		singleNUMAHint := len(maskBits) == 1

		totalFreeSize := map[v1.ResourceName]uint64{}
		totalAllocatableSize := map[v1.ResourceName]uint64{}
		// sum free and allocatable memory over all nodes of the mask
		for _, nodeID := range maskBits {
			for resourceName := range requestedResources {
				if _, ok := totalFreeSize[resourceName]; !ok {
					totalFreeSize[resourceName] = 0
				}
				totalFreeSize[resourceName] += machineState[nodeID].MemoryMap[resourceName].Free

				if _, ok := totalAllocatableSize[resourceName]; !ok {
					totalAllocatableSize[resourceName] = 0
				}
				totalAllocatableSize[resourceName] += machineState[nodeID].MemoryMap[resourceName].Allocatable
			}
		}

		// the mask must have enough allocatable memory for every requested type
		for resourceName, requestedSize := range requestedResources {
			if totalAllocatableSize[resourceName] < requestedSize {
				return
			}
		}

		// track the smallest node count that could satisfy the request; this is
		// updated before the group and free-memory checks below
		if mask.Count() < minAffinitySize {
			minAffinitySize = mask.Count()
		}

		// the node is already grouped with other nodes, so it can not be used
		// for a single NUMA node allocation
		if singleNUMAHint && len(machineState[maskBits[0]].Cells) > 1 {
			return
		}

		for _, nodeID := range maskBits {
			// the node already has memory assignments
			if !singleNUMAHint && machineState[nodeID].NumberOfAssignments > 0 {
				// the node is used on its own, so it can not join a multi NUMA
				// node allocation
				if len(machineState[nodeID].Cells) == 1 {
					return
				}

				// the node is used with a different group of nodes than the
				// one this mask describes
				if !areGroupsEqual(machineState[nodeID].Cells, maskBits) {
					return
				}
			}
		}

		// the mask must have enough free memory for every requested type,
		// counting the pod's reusable init container memory as available
		for resourceName, requestedSize := range requestedResources {
			podReusableMemory := p.getPodReusableMemory(pod, mask, resourceName)
			if totalFreeSize[resourceName]+podReusableMemory < requestedSize {
				return
			}
		}

		// record the mask as a hint for every requested memory type; the
		// preferred flag is filled in after all masks were visited
		for resourceName := range requestedResources {
			if _, ok := hints[string(resourceName)]; !ok {
				hints[string(resourceName)] = []topologymanager.TopologyHint{}
			}
			hints[string(resourceName)] = append(hints[string(resourceName)], topologymanager.TopologyHint{
				NUMANodeAffinity: mask,
				Preferred:        false,
			})
		}
	})

	// minAffinitySize is final only now, so the preferred flag is set in a
	// second pass over the collected hints
	for resourceName := range requestedResources {
		for i, hint := range hints[string(resourceName)] {
			hints[string(resourceName)][i].Preferred = p.isHintPreferred(hint.NUMANodeAffinity.GetBits(), minAffinitySize)
		}
	}

	return hints
}
556
// isHintPreferred reports whether a hint is preferred: its mask must contain
// exactly minAffinitySize NUMA nodes.
func (p *staticPolicy) isHintPreferred(maskBits []int, minAffinitySize int) bool {
	return len(maskBits) == minAffinitySize
}
560
// areGroupsEqual reports whether group1 and group2 contain the same elements
// with the same multiplicity, regardless of order.
//
// The arguments are not modified: the comparison works on sorted copies, so
// callers may safely pass slices that are part of shared state.
func areGroupsEqual(group1, group2 []int) bool {
	// groups of different sizes can never be equal, no need to sort them
	if len(group1) != len(group2) {
		return false
	}

	sorted1 := append([]int(nil), group1...)
	sorted2 := append([]int(nil), group2...)
	sort.Ints(sorted1)
	sort.Ints(sorted2)

	for i, elm := range sorted1 {
		if sorted2[i] != elm {
			return false
		}
	}
	return true
}
576
577 func (p *staticPolicy) validateState(s state.State) error {
578 machineState := s.GetMachineState()
579 memoryAssignments := s.GetMemoryAssignments()
580
581 if len(machineState) == 0 {
582
583 if len(memoryAssignments) != 0 {
584 return fmt.Errorf("[memorymanager] machine state can not be empty when it has memory assignments")
585 }
586
587 defaultMachineState := p.getDefaultMachineState()
588 s.SetMachineState(defaultMachineState)
589
590 return nil
591 }
592
593
594 expectedMachineState := p.getDefaultMachineState()
595 for pod, container := range memoryAssignments {
596 for containerName, blocks := range container {
597 for _, b := range blocks {
598 requestedSize := b.Size
599 for _, nodeID := range b.NUMAAffinity {
600 nodeState, ok := expectedMachineState[nodeID]
601 if !ok {
602 return fmt.Errorf("[memorymanager] (pod: %s, container: %s) the memory assignment uses the NUMA that does not exist", pod, containerName)
603 }
604
605 nodeState.NumberOfAssignments++
606 nodeState.Cells = b.NUMAAffinity
607
608 memoryState, ok := nodeState.MemoryMap[b.Type]
609 if !ok {
610 return fmt.Errorf("[memorymanager] (pod: %s, container: %s) the memory assignment uses memory resource that does not exist", pod, containerName)
611 }
612
613 if requestedSize == 0 {
614 continue
615 }
616
617
618 if memoryState.Free <= 0 {
619 continue
620 }
621
622
623 if memoryState.Free >= requestedSize {
624 memoryState.Reserved += requestedSize
625 memoryState.Free -= requestedSize
626 requestedSize = 0
627 continue
628 }
629
630
631 requestedSize -= memoryState.Free
632 memoryState.Reserved += memoryState.Free
633 memoryState.Free = 0
634 }
635 }
636 }
637 }
638
639
640
641
642
643 if !areMachineStatesEqual(machineState, expectedMachineState) {
644 return fmt.Errorf("[memorymanager] the expected machine state is different from the real one")
645 }
646
647 return nil
648 }
649
650 func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
651 if len(ms1) != len(ms2) {
652 klog.ErrorS(nil, "Node states are different", "lengthNode1", len(ms1), "lengthNode2", len(ms2))
653 return false
654 }
655
656 for nodeID, nodeState1 := range ms1 {
657 nodeState2, ok := ms2[nodeID]
658 if !ok {
659 klog.ErrorS(nil, "Node state does not have node ID", "nodeID", nodeID)
660 return false
661 }
662
663 if nodeState1.NumberOfAssignments != nodeState2.NumberOfAssignments {
664 klog.ErrorS(nil, "Node states number of assignments are different", "assignment1", nodeState1.NumberOfAssignments, "assignment2", nodeState2.NumberOfAssignments)
665 return false
666 }
667
668 if !areGroupsEqual(nodeState1.Cells, nodeState2.Cells) {
669 klog.ErrorS(nil, "Node states groups are different", "stateNode1", nodeState1.Cells, "stateNode2", nodeState2.Cells)
670 return false
671 }
672
673 if len(nodeState1.MemoryMap) != len(nodeState2.MemoryMap) {
674 klog.ErrorS(nil, "Node states memory map have different lengths", "lengthNode1", len(nodeState1.MemoryMap), "lengthNode2", len(nodeState2.MemoryMap))
675 return false
676 }
677
678 for resourceName, memoryState1 := range nodeState1.MemoryMap {
679 memoryState2, ok := nodeState2.MemoryMap[resourceName]
680 if !ok {
681 klog.ErrorS(nil, "Memory state does not have resource", "resource", resourceName)
682 return false
683 }
684
685 if !reflect.DeepEqual(*memoryState1, *memoryState2) {
686 klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
687 return false
688 }
689 }
690 }
691 return true
692 }
693
694 func (p *staticPolicy) getDefaultMachineState() state.NUMANodeMap {
695 defaultMachineState := state.NUMANodeMap{}
696 nodeHugepages := map[int]uint64{}
697 for _, node := range p.machineInfo.Topology {
698 defaultMachineState[node.Id] = &state.NUMANodeState{
699 NumberOfAssignments: 0,
700 MemoryMap: map[v1.ResourceName]*state.MemoryTable{},
701 Cells: []int{node.Id},
702 }
703
704
705 for _, hugepage := range node.HugePages {
706 hugepageQuantity := resource.NewQuantity(int64(hugepage.PageSize)*1024, resource.BinarySI)
707 resourceName := corehelper.HugePageResourceName(*hugepageQuantity)
708 systemReserved := p.getResourceSystemReserved(node.Id, resourceName)
709 totalHugepagesSize := hugepage.NumPages * hugepage.PageSize * 1024
710 allocatable := totalHugepagesSize - systemReserved
711 defaultMachineState[node.Id].MemoryMap[resourceName] = &state.MemoryTable{
712 Allocatable: allocatable,
713 Free: allocatable,
714 Reserved: 0,
715 SystemReserved: systemReserved,
716 TotalMemSize: totalHugepagesSize,
717 }
718 if _, ok := nodeHugepages[node.Id]; !ok {
719 nodeHugepages[node.Id] = 0
720 }
721 nodeHugepages[node.Id] += totalHugepagesSize
722 }
723
724
725 systemReserved := p.getResourceSystemReserved(node.Id, v1.ResourceMemory)
726
727 allocatable := node.Memory - systemReserved
728
729 if allocatedByHugepages, ok := nodeHugepages[node.Id]; ok {
730 allocatable -= allocatedByHugepages
731 }
732 defaultMachineState[node.Id].MemoryMap[v1.ResourceMemory] = &state.MemoryTable{
733 Allocatable: allocatable,
734 Free: allocatable,
735 Reserved: 0,
736 SystemReserved: systemReserved,
737 TotalMemSize: node.Memory,
738 }
739 }
740 return defaultMachineState
741 }
742
743 func (p *staticPolicy) getResourceSystemReserved(nodeID int, resourceName v1.ResourceName) uint64 {
744 var systemReserved uint64
745 if nodeSystemReserved, ok := p.systemReserved[nodeID]; ok {
746 if nodeMemorySystemReserved, ok := nodeSystemReserved[resourceName]; ok {
747 systemReserved = nodeMemorySystemReserved
748 }
749 }
750 return systemReserved
751 }
752
753 func (p *staticPolicy) getDefaultHint(machineState state.NUMANodeMap, pod *v1.Pod, requestedResources map[v1.ResourceName]uint64) (*topologymanager.TopologyHint, error) {
754 hints := p.calculateHints(machineState, pod, requestedResources)
755 if len(hints) < 1 {
756 return nil, fmt.Errorf("[memorymanager] failed to get the default NUMA affinity, no NUMA nodes with enough memory is available")
757 }
758
759
760 return findBestHint(hints[string(v1.ResourceMemory)]), nil
761 }
762
763 func isAffinitySatisfyRequest(machineState state.NUMANodeMap, mask bitmask.BitMask, requestedResources map[v1.ResourceName]uint64) bool {
764 totalFreeSize := map[v1.ResourceName]uint64{}
765 for _, nodeID := range mask.GetBits() {
766 for resourceName := range requestedResources {
767 if _, ok := totalFreeSize[resourceName]; !ok {
768 totalFreeSize[resourceName] = 0
769 }
770 totalFreeSize[resourceName] += machineState[nodeID].MemoryMap[resourceName].Free
771 }
772 }
773
774
775 for resourceName, requestedSize := range requestedResources {
776 if totalFreeSize[resourceName] < requestedSize {
777 return false
778 }
779 }
780
781 return true
782 }
783
784
785
786
787
788 func (p *staticPolicy) extendTopologyManagerHint(machineState state.NUMANodeMap, pod *v1.Pod, requestedResources map[v1.ResourceName]uint64, mask bitmask.BitMask) (*topologymanager.TopologyHint, error) {
789 hints := p.calculateHints(machineState, pod, requestedResources)
790
791 var filteredHints []topologymanager.TopologyHint
792
793 for _, hint := range hints[string(v1.ResourceMemory)] {
794 affinityBits := hint.NUMANodeAffinity.GetBits()
795
796 if isHintInGroup(mask.GetBits(), affinityBits) {
797 filteredHints = append(filteredHints, hint)
798 }
799 }
800
801 if len(filteredHints) < 1 {
802 return nil, fmt.Errorf("[memorymanager] failed to find NUMA nodes to extend the current topology hint")
803 }
804
805
806 return findBestHint(filteredHints), nil
807 }
808
// isHintInGroup reports whether every element of hint is also present in
// group. An empty hint is contained in any group.
//
// The arguments are not modified: the check works on sorted copies instead of
// sorting the caller's slices in place.
func isHintInGroup(hint []int, group []int) bool {
	sortedHint := append([]int(nil), hint...)
	sortedGroup := append([]int(nil), group...)
	sort.Ints(sortedHint)
	sort.Ints(sortedGroup)

	// walk the sorted group once, advancing through the sorted hint every
	// time its current element is found
	hintIndex := 0
	for _, member := range sortedGroup {
		if hintIndex == len(sortedHint) {
			return true
		}

		if member == sortedHint[hintIndex] {
			hintIndex++
		}
	}

	return hintIndex == len(sortedHint)
}
827
828 func findBestHint(hints []topologymanager.TopologyHint) *topologymanager.TopologyHint {
829
830 bestHint := topologymanager.TopologyHint{}
831 for _, hint := range hints {
832 if bestHint.NUMANodeAffinity == nil {
833 bestHint = hint
834 continue
835 }
836
837
838 if hint.Preferred && !bestHint.Preferred {
839 bestHint = hint
840 continue
841 }
842
843
844 if hint.Preferred == bestHint.Preferred && hint.NUMANodeAffinity.IsNarrowerThan(bestHint.NUMANodeAffinity) {
845 bestHint = hint
846 }
847 }
848 return &bestHint
849 }
850
851
852 func (p *staticPolicy) GetAllocatableMemory(s state.State) []state.Block {
853 var allocatableMemory []state.Block
854 machineState := s.GetMachineState()
855 for numaNodeID, numaNodeState := range machineState {
856 for resourceName, memoryTable := range numaNodeState.MemoryMap {
857 if memoryTable.Allocatable == 0 {
858 continue
859 }
860
861 block := state.Block{
862 NUMAAffinity: []int{numaNodeID},
863 Type: resourceName,
864 Size: memoryTable.Allocatable,
865 }
866 allocatableMemory = append(allocatableMemory, block)
867 }
868 }
869 return allocatableMemory
870 }
871
872 func (p *staticPolicy) updatePodReusableMemory(pod *v1.Pod, container *v1.Container, memoryBlocks []state.Block) {
873 podUID := string(pod.UID)
874
875
876 for uid := range p.initContainersReusableMemory {
877 if podUID != uid {
878 delete(p.initContainersReusableMemory, uid)
879 }
880 }
881
882 if isRegularInitContainer(pod, container) {
883 if _, ok := p.initContainersReusableMemory[podUID]; !ok {
884 p.initContainersReusableMemory[podUID] = map[string]map[v1.ResourceName]uint64{}
885 }
886
887 for _, block := range memoryBlocks {
888 blockBitMask, _ := bitmask.NewBitMask(block.NUMAAffinity...)
889 blockBitMaskString := blockBitMask.String()
890
891 if _, ok := p.initContainersReusableMemory[podUID][blockBitMaskString]; !ok {
892 p.initContainersReusableMemory[podUID][blockBitMaskString] = map[v1.ResourceName]uint64{}
893 }
894
895 if blockReusableMemory := p.initContainersReusableMemory[podUID][blockBitMaskString][block.Type]; block.Size > blockReusableMemory {
896 p.initContainersReusableMemory[podUID][blockBitMaskString][block.Type] = block.Size
897 }
898 }
899
900 return
901 }
902
903
904 for _, block := range memoryBlocks {
905 blockBitMask, _ := bitmask.NewBitMask(block.NUMAAffinity...)
906 if podReusableMemory := p.getPodReusableMemory(pod, blockBitMask, block.Type); podReusableMemory != 0 {
907 if block.Size >= podReusableMemory {
908 p.initContainersReusableMemory[podUID][blockBitMask.String()][block.Type] = 0
909 } else {
910 p.initContainersReusableMemory[podUID][blockBitMask.String()][block.Type] -= block.Size
911 }
912 }
913 }
914 }
915
916 func (p *staticPolicy) updateInitContainersMemoryBlocks(s state.State, pod *v1.Pod, container *v1.Container, containerMemoryBlocks []state.Block) {
917 podUID := string(pod.UID)
918
919 for _, containerBlock := range containerMemoryBlocks {
920 blockSize := containerBlock.Size
921 for _, initContainer := range pod.Spec.InitContainers {
922
923 if initContainer.Name == container.Name {
924 break
925 }
926
927 if blockSize == 0 {
928 break
929 }
930
931 if types.IsRestartableInitContainer(&initContainer) {
932
933
934 continue
935 }
936
937 initContainerBlocks := s.GetMemoryBlocks(podUID, initContainer.Name)
938 if len(initContainerBlocks) == 0 {
939 continue
940 }
941
942 for i := range initContainerBlocks {
943 initContainerBlock := &initContainerBlocks[i]
944 if initContainerBlock.Size == 0 {
945 continue
946 }
947
948 if initContainerBlock.Type != containerBlock.Type {
949 continue
950 }
951
952 if !isNUMAAffinitiesEqual(initContainerBlock.NUMAAffinity, containerBlock.NUMAAffinity) {
953 continue
954 }
955
956 if initContainerBlock.Size > blockSize {
957 initContainerBlock.Size -= blockSize
958 blockSize = 0
959 } else {
960 blockSize -= initContainerBlock.Size
961 initContainerBlock.Size = 0
962 }
963 }
964
965 s.SetMemoryBlocks(podUID, initContainer.Name, initContainerBlocks)
966 }
967 }
968 }
969
970 func isRegularInitContainer(pod *v1.Pod, container *v1.Container) bool {
971 for _, initContainer := range pod.Spec.InitContainers {
972 if initContainer.Name == container.Name {
973 return !types.IsRestartableInitContainer(&initContainer)
974 }
975 }
976
977 return false
978 }
979
980 func isNUMAAffinitiesEqual(numaAffinity1, numaAffinity2 []int) bool {
981 bitMask1, err := bitmask.NewBitMask(numaAffinity1...)
982 if err != nil {
983 klog.ErrorS(err, "failed to create bit mask", "numaAffinity1", numaAffinity1)
984 return false
985 }
986
987 bitMask2, err := bitmask.NewBitMask(numaAffinity2...)
988 if err != nil {
989 klog.ErrorS(err, "failed to create bit mask", "numaAffinity2", numaAffinity2)
990 return false
991 }
992
993 return bitMask1.IsEqual(bitMask2)
994 }
995
View as plain text