/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cpumanager

import (
	"fmt"
	"math"
	"sort"

	"k8s.io/klog/v2"

	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
	"k8s.io/utils/cpuset"
)

// LoopControl controls the behavior of the cpu accumulator loop logic
type LoopControl int

// Possible loop control outcomes
const (
	Continue LoopControl = iota
	Break
)

type mapIntInt map[int]int

func (m mapIntInt) Clone() mapIntInt {
	cp := make(mapIntInt, len(m))
	for k, v := range m {
		cp[k] = v
	}
	return cp
}

func (m mapIntInt) Keys() []int {
	var keys []int
	for k := range m {
		keys = append(keys, k)
	}
	return keys
}

func (m mapIntInt) Values(keys ...int) []int {
	if keys == nil {
		keys = m.Keys()
	}
	var values []int
	for _, k := range keys {
		values = append(values, m[k])
	}
	return values
}

func sum(xs []int) int {
	var s int
	for _, x := range xs {
		s += x
	}
	return s
}

func mean(xs []int) float64 {
	var sum float64
	for _, x := range xs {
		sum += float64(x)
	}
	m := sum / float64(len(xs))
	return math.Round(m*1000) / 1000
}

func standardDeviation(xs []int) float64 {
	m := mean(xs)
	var sum float64
	for _, x := range xs {
		sum += (float64(x) - m) * (float64(x) - m)
	}
	s := math.Sqrt(sum / float64(len(xs)))
	return math.Round(s*1000) / 1000
}
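// Illustrative sketch (hypothetical helper, not used by the allocator): how the
// statistics helpers above feed the "balance score" used further below. Given
// three NUMA nodes with 4, 2, and 2 free CPUs, mean() returns 2.667 and
// standardDeviation() returns 0.943; a perfectly even spread would score 0, so
// lower scores mean a more balanced allocation.
func exampleBalanceScore() float64 {
	freeCPUsPerNUMANode := []int{4, 2, 2}
	return standardDeviation(freeCPUsPerNUMANode) // 0.943
}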
type numaOrSocketsFirstFuncs interface {
	takeFullFirstLevel()
	takeFullSecondLevel()
	sortAvailableNUMANodes() []int
	sortAvailableSockets() []int
	sortAvailableCores() []int
}

type numaFirst struct{ acc *cpuAccumulator }
type socketsFirst struct{ acc *cpuAccumulator }

var _ numaOrSocketsFirstFuncs = (*numaFirst)(nil)
var _ numaOrSocketsFirstFuncs = (*socketsFirst)(nil)

// If NUMA nodes are higher in the memory hierarchy than sockets, then we take
// from the set of NUMA Nodes as the first level.
func (n *numaFirst) takeFullFirstLevel() {
	n.acc.takeFullNUMANodes()
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then we take
// from the set of sockets as the second level.
func (n *numaFirst) takeFullSecondLevel() {
	n.acc.takeFullSockets()
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then just
// sort the NUMA nodes directly, and return them.
func (n *numaFirst) sortAvailableNUMANodes() []int {
	numas := n.acc.details.NUMANodes().UnsortedList()
	n.acc.sort(numas, n.acc.details.CPUsInNUMANodes)
	return numas
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then we need
// to pull the set of sockets out of each sorted NUMA node, and accumulate the
// partial order across them.
func (n *numaFirst) sortAvailableSockets() []int {
	var result []int
	for _, numa := range n.sortAvailableNUMANodes() {
		sockets := n.acc.details.SocketsInNUMANodes(numa).UnsortedList()
		n.acc.sort(sockets, n.acc.details.CPUsInSockets)
		result = append(result, sockets...)
	}
	return result
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then
// cores sit directly below sockets in the memory hierarchy.
func (n *numaFirst) sortAvailableCores() []int {
	var result []int
	for _, socket := range n.acc.sortAvailableSockets() {
		cores := n.acc.details.CoresInSockets(socket).UnsortedList()
		n.acc.sort(cores, n.acc.details.CPUsInCores)
		result = append(result, cores...)
	}
	return result
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we take
// from the set of sockets as the first level.
func (s *socketsFirst) takeFullFirstLevel() {
	s.acc.takeFullSockets()
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we take
// from the set of NUMA Nodes as the second level.
func (s *socketsFirst) takeFullSecondLevel() {
	s.acc.takeFullNUMANodes()
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we need
// to pull the set of NUMA nodes out of each sorted Socket, and accumulate the
// partial order across them.
func (s *socketsFirst) sortAvailableNUMANodes() []int {
	var result []int
	for _, socket := range s.sortAvailableSockets() {
		numas := s.acc.details.NUMANodesInSockets(socket).UnsortedList()
		s.acc.sort(numas, s.acc.details.CPUsInNUMANodes)
		result = append(result, numas...)
	}
	return result
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then just
// sort the sockets directly, and return them.
func (s *socketsFirst) sortAvailableSockets() []int {
	sockets := s.acc.details.Sockets().UnsortedList()
	s.acc.sort(sockets, s.acc.details.CPUsInSockets)
	return sockets
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then cores
// sit directly below NUMA Nodes in the memory hierarchy.
func (s *socketsFirst) sortAvailableCores() []int {
	var result []int
	for _, numa := range s.acc.sortAvailableNUMANodes() {
		cores := s.acc.details.CoresInNUMANodes(numa).UnsortedList()
		s.acc.sort(cores, s.acc.details.CPUsInCores)
		result = append(result, cores...)
	}
	return result
}

type cpuAccumulator struct {
	// `topo` describes the layout of CPUs (i.e. hyper-threads if hyperthreading is on) between
	// cores (i.e. physical CPUs if hyper-threading is on), NUMA nodes, and sockets on the K8s
	// cluster node. `topo` is never mutated, meaning that as the cpuAccumulator claims CPUs,
	// `topo` is not modified. Its primary purpose is to serve as a reference of the original
	// topology (i.e. as it was when the cpuAccumulator was created), to learn things such as
	// how many CPUs are on each socket, NUMA node, etc...
	topo *topology.CPUTopology

	// `details` is the set of free CPUs that the cpuAccumulator can claim to accumulate the
	// desired number of CPUs. When a CPU is claimed, it's removed from `details`.
	details topology.CPUDetails

	// `numCPUsNeeded` is the number of CPUs that the accumulator still needs to accumulate to
	// reach the desired number of CPUs. When the cpuAccumulator is created, `numCPUsNeeded` is
	// set to the total number of CPUs to accumulate. Every time a CPU is claimed,
	// `numCPUsNeeded` is decreased by 1, until it either reaches 0, meaning that all the needed
	// CPUs have been accumulated (success), or it is still bigger than 0 but no more CPUs are
	// available (failure).
	numCPUsNeeded int

	// `result` is the set of CPUs that have been accumulated so far. When a CPU is claimed, it's
	// added to `result`. The cpuAccumulator has completed its duty successfully when `result`
	// has cardinality equal to the total number of CPUs to accumulate.
	result cpuset.CPUSet

	numaOrSocketsFirst numaOrSocketsFirstFuncs
}

func newCPUAccumulator(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) *cpuAccumulator {
	acc := &cpuAccumulator{
		topo:          topo,
		details:       topo.CPUDetails.KeepOnly(availableCPUs),
		numCPUsNeeded: numCPUs,
		result:        cpuset.New(),
	}

	if topo.NumSockets >= topo.NumNUMANodes {
		acc.numaOrSocketsFirst = &numaFirst{acc}
	} else {
		acc.numaOrSocketsFirst = &socketsFirst{acc}
	}

	return acc
}
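// For concreteness (hypothetical layouts, illustrating the branch in
// newCPUAccumulator above): a node with 2 sockets that each contain 2 NUMA
// nodes has NumSockets (2) < NumNUMANodes (4), so sockets are the larger
// grouping and socketsFirst is selected; a node with 2 sockets and one NUMA
// node per socket has NumSockets (2) >= NumNUMANodes (2) and selects
// numaFirst, taking whole NUMA nodes before whole sockets.
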
// Returns true if the supplied NUMANode is fully available in `a.details`.
// "fully available" means that all the CPUs in it are free.
func (a *cpuAccumulator) isNUMANodeFree(numaID int) bool {
	return a.details.CPUsInNUMANodes(numaID).Size() == a.topo.CPUDetails.CPUsInNUMANodes(numaID).Size()
}

// Returns true if the supplied socket is fully available in `a.details`.
// "fully available" means that all the CPUs in it are free.
func (a *cpuAccumulator) isSocketFree(socketID int) bool {
	return a.details.CPUsInSockets(socketID).Size() == a.topo.CPUsPerSocket()
}

// Returns true if the supplied core is fully available in `a.details`.
// "fully available" means that all the CPUs in it are free.
func (a *cpuAccumulator) isCoreFree(coreID int) bool {
	return a.details.CPUsInCores(coreID).Size() == a.topo.CPUsPerCore()
}

// Returns free NUMA Node IDs as a slice sorted by sortAvailableNUMANodes().
func (a *cpuAccumulator) freeNUMANodes() []int {
	free := []int{}
	for _, numa := range a.sortAvailableNUMANodes() {
		if a.isNUMANodeFree(numa) {
			free = append(free, numa)
		}
	}
	return free
}

// Returns free socket IDs as a slice sorted by sortAvailableSockets().
func (a *cpuAccumulator) freeSockets() []int {
	free := []int{}
	for _, socket := range a.sortAvailableSockets() {
		if a.isSocketFree(socket) {
			free = append(free, socket)
		}
	}
	return free
}

// Returns free core IDs as a slice sorted by sortAvailableCores().
func (a *cpuAccumulator) freeCores() []int {
	free := []int{}
	for _, core := range a.sortAvailableCores() {
		if a.isCoreFree(core) {
			free = append(free, core)
		}
	}
	return free
}

// Returns free CPU IDs as a slice sorted by sortAvailableCPUs().
func (a *cpuAccumulator) freeCPUs() []int {
	return a.sortAvailableCPUs()
}

// Sorts the provided list of NUMA nodes/sockets/cores/cpus referenced in 'ids'
// by the number of available CPUs contained within them (smallest to largest).
// The 'getCPUs()' parameter defines the function that should be called to
// retrieve the list of available CPUs for the type being referenced. If two
// NUMA nodes/sockets/cores/cpus have the same number of available CPUs, they
// are sorted in ascending order by their id.
func (a *cpuAccumulator) sort(ids []int, getCPUs func(ids ...int) cpuset.CPUSet) {
	sort.Slice(ids,
		func(i, j int) bool {
			iCPUs := getCPUs(ids[i])
			jCPUs := getCPUs(ids[j])
			if iCPUs.Size() < jCPUs.Size() {
				return true
			}
			if iCPUs.Size() > jCPUs.Size() {
				return false
			}
			return ids[i] < ids[j]
		})
}
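// Illustrative sketch (hypothetical helper): the ordering produced by sort()
// above for three sockets whose free CPUs are {0,1,2,3}, {8,9} and {10,11}.
// Sockets 1 and 2 tie with two free CPUs each and are ordered by id, while
// socket 0 comes last because it has the most free CPUs, giving [1, 2, 0].
// The receiver is not used by sort(), so a zero-value accumulator suffices here.
func exampleSortOrder() []int {
	freeCPUs := map[int]cpuset.CPUSet{
		0: cpuset.New(0, 1, 2, 3),
		1: cpuset.New(8, 9),
		2: cpuset.New(10, 11),
	}
	ids := []int{0, 1, 2}
	(&cpuAccumulator{}).sort(ids, func(ids ...int) cpuset.CPUSet {
		result := cpuset.New()
		for _, id := range ids {
			result = result.Union(freeCPUs[id])
		}
		return result
	})
	return ids // [1, 2, 0]
}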
// Sort all NUMA nodes with at least one free CPU.
//
// If NUMA nodes are higher than sockets in the memory hierarchy, they are sorted by ascending number
// of free CPUs that they contain. "higher than sockets in the memory hierarchy" means that NUMA nodes
// contain a bigger number of CPUs (free and busy) than sockets, or equivalently that each NUMA node
// contains more than one socket.
//
// If instead NUMA nodes are lower in the memory hierarchy than sockets, they are sorted as follows.
// First, they are sorted by number of free CPUs in the sockets that contain them. Then, for each
// socket they are sorted by number of free CPUs that they contain. The order is always ascending.
// In other words, the relative order of two NUMA nodes is determined as follows:
//  1. If the two NUMA nodes belong to different sockets, the NUMA node in the socket with the
//     smaller amount of free CPUs appears first.
//  2. If the two NUMA nodes belong to the same socket, the NUMA node with the smaller amount of free
//     CPUs appears first.
func (a *cpuAccumulator) sortAvailableNUMANodes() []int {
	return a.numaOrSocketsFirst.sortAvailableNUMANodes()
}

// Sort all sockets with at least one free CPU.
//
// If sockets are higher than NUMA nodes in the memory hierarchy, they are sorted by ascending number
// of free CPUs that they contain. "higher than NUMA nodes in the memory hierarchy" means that
// sockets contain a bigger number of CPUs (free and busy) than NUMA nodes, or equivalently that each
// socket contains more than one NUMA node.
//
// If instead sockets are lower in the memory hierarchy than NUMA nodes, they are sorted as follows.
// First, they are sorted by number of free CPUs in the NUMA nodes that contain them. Then, for each
// NUMA node they are sorted by number of free CPUs that they contain. The order is always ascending.
// In other words, the relative order of two sockets is determined as follows:
//  1. If the two sockets belong to different NUMA nodes, the socket in the NUMA node with the
//     smaller amount of free CPUs appears first.
//  2. If the two sockets belong to the same NUMA node, the socket with the smaller amount of free
//     CPUs appears first.
func (a *cpuAccumulator) sortAvailableSockets() []int {
	return a.numaOrSocketsFirst.sortAvailableSockets()
}

// Sort all cores with at least one free CPU.
//
// If sockets are higher in the memory hierarchy than NUMA nodes, meaning that sockets contain a
// bigger number of CPUs (free and busy) than NUMA nodes, or equivalently that each socket contains
// more than one NUMA node, the cores are sorted as follows. First, they are sorted by number of
// free CPUs that their sockets contain. Then, for each socket, the cores in it are sorted by number
// of free CPUs that their NUMA nodes contain. Then, for each NUMA node, the cores in it are sorted
// by number of free CPUs that they contain. The order is always ascending. In other words, the
// relative order of two cores is determined as follows:
//  1. If the two cores belong to different sockets, the core in the socket with the smaller amount of
//     free CPUs appears first.
//  2. If the two cores belong to the same socket but different NUMA nodes, the core in the NUMA node
//     with the smaller amount of free CPUs appears first.
//  3. If the two cores belong to the same NUMA node and socket, the core with the smaller amount of
//     free CPUs appears first.
//
// If instead NUMA nodes are higher in the memory hierarchy than sockets, the sorting happens in the
// same way as described in the previous paragraph, except that the priority of NUMA nodes and
// sockets is inverted (e.g. first sort the cores by number of free CPUs in their NUMA nodes, then,
// for each NUMA node, sort the cores by number of free CPUs in their sockets, etc...).
func (a *cpuAccumulator) sortAvailableCores() []int {
	return a.numaOrSocketsFirst.sortAvailableCores()
}
// Sort all free CPUs.
//
// If sockets are higher in the memory hierarchy than NUMA nodes, meaning that sockets contain a
// bigger number of CPUs (free and busy) than NUMA nodes, or equivalently that each socket contains
// more than one NUMA node, the CPUs are sorted as follows. First, they are sorted by number of
// free CPUs that their sockets contain. Then, for each socket, the CPUs in it are sorted by number
// of free CPUs that their NUMA nodes contain. Then, for each NUMA node, the CPUs in it are sorted
// by number of free CPUs that their cores contain. Finally, for each core, the CPUs in it are
// sorted by numerical ID. The order is always ascending. In other words, the relative order of two
// CPUs is determined as follows:
//  1. If the two CPUs belong to different sockets, the CPU in the socket with the smaller amount of
//     free CPUs appears first.
//  2. If the two CPUs belong to the same socket but different NUMA nodes, the CPU in the NUMA node
//     with the smaller amount of free CPUs appears first.
//  3. If the two CPUs belong to the same socket and NUMA node but different cores, the CPU in the
//     core with the smaller amount of free CPUs appears first.
//  4. If the two CPUs belong to the same NUMA node, socket, and core, the CPU with the smaller ID
//     appears first.
//
// If instead NUMA nodes are higher in the memory hierarchy than sockets, the sorting happens in the
// same way as described in the previous paragraph, except that the priority of NUMA nodes and
// sockets is inverted (e.g. first sort the CPUs by number of free CPUs in their NUMA nodes, then,
// for each NUMA node, sort the CPUs by number of free CPUs in their sockets, etc...).
func (a *cpuAccumulator) sortAvailableCPUs() []int {
	var result []int
	for _, core := range a.sortAvailableCores() {
		cpus := a.details.CPUsInCores(core).UnsortedList()
		sort.Ints(cpus)
		result = append(result, cpus...)
	}
	return result
}

func (a *cpuAccumulator) take(cpus cpuset.CPUSet) {
	a.result = a.result.Union(cpus)
	a.details = a.details.KeepOnly(a.details.CPUs().Difference(a.result))
	a.numCPUsNeeded -= cpus.Size()
}
func (a *cpuAccumulator) takeFullNUMANodes() {
	for _, numa := range a.freeNUMANodes() {
		cpusInNUMANode := a.topo.CPUDetails.CPUsInNUMANodes(numa)
		if !a.needsAtLeast(cpusInNUMANode.Size()) {
			continue
		}
		klog.V(4).InfoS("takeFullNUMANodes: claiming NUMA node", "numa", numa)
		a.take(cpusInNUMANode)
	}
}

func (a *cpuAccumulator) takeFullSockets() {
	for _, socket := range a.freeSockets() {
		cpusInSocket := a.topo.CPUDetails.CPUsInSockets(socket)
		if !a.needsAtLeast(cpusInSocket.Size()) {
			continue
		}
		klog.V(4).InfoS("takeFullSockets: claiming socket", "socket", socket)
		a.take(cpusInSocket)
	}
}

func (a *cpuAccumulator) takeFullCores() {
	for _, core := range a.freeCores() {
		cpusInCore := a.topo.CPUDetails.CPUsInCores(core)
		if !a.needsAtLeast(cpusInCore.Size()) {
			continue
		}
		klog.V(4).InfoS("takeFullCores: claiming core", "core", core)
		a.take(cpusInCore)
	}
}

func (a *cpuAccumulator) takeRemainingCPUs() {
	for _, cpu := range a.sortAvailableCPUs() {
		klog.V(4).InfoS("takeRemainingCPUs: claiming CPU", "cpu", cpu)
		a.take(cpuset.New(cpu))
		if a.isSatisfied() {
			return
		}
	}
}

// rangeNUMANodesNeededToSatisfy returns the minimum and maximum (in this order) number of NUMA
// nodes needed to satisfy the cpuAccumulator's goal of accumulating `a.numCPUsNeeded` CPUs,
// assuming that CPU groups have size given by the `cpuGroupSize` argument.
func (a *cpuAccumulator) rangeNUMANodesNeededToSatisfy(cpuGroupSize int) (minNumNUMAs, maxNumNUMAs int) {
	// Get the total number of NUMA nodes in the system.
	numNUMANodes := a.topo.CPUDetails.NUMANodes().Size()

	// Get the total number of NUMA nodes that have CPUs available on them.
	numNUMANodesAvailable := a.details.NUMANodes().Size()

	// Get the total number of CPUs in the system.
	numCPUs := a.topo.CPUDetails.CPUs().Size()

	// Get the total number of 'cpuGroups' in the system (rounding up).
	numCPUGroups := (numCPUs-1)/cpuGroupSize + 1

	// Calculate the number of 'cpuGroups' per NUMA node in the system (rounding up).
	numCPUGroupsPerNUMANode := (numCPUGroups-1)/numNUMANodes + 1

	// Calculate the number of 'cpuGroups' that need to be allocated (rounding up).
	numCPUGroupsNeeded := (a.numCPUsNeeded-1)/cpuGroupSize + 1

	// Calculate the minimum number of NUMA nodes required to satisfy the
	// allocation (rounding up).
	minNumNUMAs = (numCPUGroupsNeeded-1)/numCPUGroupsPerNUMANode + 1

	// Calculate the maximum number of NUMA nodes required to satisfy the allocation.
	maxNumNUMAs = min(numCPUGroupsNeeded, numNUMANodesAvailable)

	return
}
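// Worked example for rangeNUMANodesNeededToSatisfy above (hypothetical
// numbers): on a 40-CPU, 4-NUMA-node system with cpuGroupSize=2 and 12 CPUs
// still needed, numCPUGroups = (40-1)/2+1 = 20, numCPUGroupsPerNUMANode =
// (20-1)/4+1 = 5, and numCPUGroupsNeeded = (12-1)/2+1 = 6, giving
// minNumNUMAs = (6-1)/5+1 = 2; if all 4 NUMA nodes still have CPUs available,
// maxNumNUMAs = min(6, 4) = 4.
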
// needsAtLeast returns true if and only if the accumulator needs at least `n` CPUs.
// This means that needsAtLeast returns true even if more than `n` CPUs are needed.
func (a *cpuAccumulator) needsAtLeast(n int) bool {
	return a.numCPUsNeeded >= n
}

// isSatisfied returns true if and only if the accumulator has all the CPUs it needs.
func (a *cpuAccumulator) isSatisfied() bool {
	return a.numCPUsNeeded < 1
}

// isFailed returns true if and only if there aren't enough available CPUs in the system
// (e.g. the accumulator needs 4 CPUs but only 3 are available).
func (a *cpuAccumulator) isFailed() bool {
	return a.numCPUsNeeded > a.details.CPUs().Size()
}

// iterateCombinations walks through all n-choose-k subsets of size k in n and
// calls function 'f()' on each subset. For example, if n={0,1,2} and k=2,
// then f() will be called on the subsets {0,1}, {0,2}, and {1,2}. If f() ever
// returns 'Break', we break early and exit the loop.
func (a *cpuAccumulator) iterateCombinations(n []int, k int, f func([]int) LoopControl) {
	if k < 1 {
		return
	}

	var helper func(n []int, k int, start int, accum []int, f func([]int) LoopControl) LoopControl
	helper = func(n []int, k int, start int, accum []int, f func([]int) LoopControl) LoopControl {
		if k == 0 {
			return f(accum)
		}
		for i := start; i <= len(n)-k; i++ {
			control := helper(n, k-1, i+1, append(accum, n[i]), f)
			if control == Break {
				return Break
			}
		}
		return Continue
	}

	helper(n, k, 0, []int{}, f)
}
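// Illustrative sketch (hypothetical helper): collecting the subsets visited by
// iterateCombinations for n={0,1,2} and k=2, i.e. {0,1}, {0,2} and {1,2}. The
// receiver is not used by iterateCombinations, so a zero-value accumulator is
// enough here; each combo is copied because the callback's slice may share
// backing storage between invocations.
func exampleIterateCombinations() [][]int {
	var subsets [][]int
	(&cpuAccumulator{}).iterateCombinations([]int{0, 1, 2}, 2, func(combo []int) LoopControl {
		subset := make([]int, len(combo))
		copy(subset, combo)
		subsets = append(subsets, subset)
		return Continue
	})
	return subsets
}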
// takeByTopologyNUMAPacked returns a CPUSet containing `numCPUs` CPUs taken from the CPUs in the
// set `availableCPUs`. `topo` describes how the CPUs are arranged between sockets, NUMA nodes
// and physical cores (if hyperthreading is on, a "CPU" is a thread rather than a full physical
// core).
//
// If sockets are higher than NUMA nodes in the memory hierarchy (i.e. a socket contains more than
// one NUMA node), the CPUs are selected as follows.
//
// If `numCPUs` is bigger than the total number of CPUs in a socket, and there are free (i.e. all
// CPUs in them are free) sockets, the function takes as many entire free sockets as possible.
// If there are no free sockets, or `numCPUs` is less than a whole socket, or the remaining number
// of CPUs to take after having taken some whole sockets is less than a whole socket, the function
// tries to take whole NUMA nodes.
//
// If the remaining number of CPUs to take is bigger than the total number of CPUs in a NUMA node,
// and there are free (i.e. all CPUs in them are free) NUMA nodes, the function takes as many entire
// free NUMA nodes as possible. The free NUMA nodes are taken from one socket at a time, and the
// sockets are considered by ascending order of free CPUs in them. If there are no free NUMA nodes,
// or the remaining number of CPUs to take after having taken full sockets and NUMA nodes is less
// than a whole NUMA node, the function tries to take whole physical cores (cores).
//
// If `numCPUs` is bigger than the total number of CPUs in a core, and there are
// free (i.e. all CPUs in them are free) cores, the function takes as many entire free cores as possible.
// The cores are taken from one socket at a time, and the sockets are considered by
// ascending order of free CPUs in them. For a given socket, the cores are taken one NUMA node at a time,
// and the NUMA nodes are considered by ascending order of free CPUs in them. If there are no free
// cores, or the remaining number of CPUs to take after having taken full sockets, NUMA nodes and
// cores is less than a whole core, the function tries to take individual CPUs.
//
// The individual CPUs are taken from one socket at a time, and the sockets are considered by
// ascending order of free CPUs in them. For a given socket, the CPUs are taken one NUMA node at a time,
// and the NUMA nodes are considered by ascending order of free CPUs in them. For a given NUMA node, the
// CPUs are taken one core at a time, and the cores are considered by ascending order of free CPUs in them.
//
// If NUMA nodes are higher than sockets in the memory hierarchy (i.e. a NUMA node contains more
// than one socket), the CPUs are selected as written above, with the only differences being that
// (1) the order with which full sockets and full NUMA nodes are acquired is swapped, and (2) the
// order with which lower-level topology elements are selected is also swapped accordingly. E.g.
// when selecting full cores, the cores are selected starting from the ones in the NUMA node with
// the least amount of free CPUs to the one with the highest amount of free CPUs (i.e. in ascending
// order of free CPUs). For any NUMA node, the cores are selected from the ones in the socket with
// the least amount of free CPUs to the one with the highest amount of free CPUs.
func takeByTopologyNUMAPacked(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
	acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
	if acc.isSatisfied() {
		return acc.result, nil
	}
	if acc.isFailed() {
		return cpuset.New(), fmt.Errorf("not enough cpus available to satisfy request: requested=%d, available=%d", numCPUs, availableCPUs.Size())
	}

	// Algorithm: topology-aware best-fit
	// 1. Acquire whole NUMA nodes and sockets, if available and the container
	//    requires at least a NUMA node or socket's-worth of CPUs. If NUMA
	//    Nodes map to 1 or more sockets, pull from NUMA nodes first.
	//    Otherwise pull from sockets first.
	acc.numaOrSocketsFirst.takeFullFirstLevel()
	if acc.isSatisfied() {
		return acc.result, nil
	}
	acc.numaOrSocketsFirst.takeFullSecondLevel()
	if acc.isSatisfied() {
		return acc.result, nil
	}

	// 2. Acquire whole cores, if available and the container requires at least
	//    a core's-worth of CPUs.
	acc.takeFullCores()
	if acc.isSatisfied() {
		return acc.result, nil
	}

	// 3. Acquire single threads, preferring to fill partially-allocated cores
	//    on the same sockets as the whole cores we have already taken in this
	//    allocation.
	acc.takeRemainingCPUs()
	if acc.isSatisfied() {
		return acc.result, nil
	}

	return cpuset.New(), fmt.Errorf("failed to allocate cpus")
}
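// Illustrative sketch of calling takeByTopologyNUMAPacked on a hypothetical
// single-socket, single-NUMA-node machine with two hyperthreaded cores
// (4 logical CPUs). Asking for 2 CPUs is expected to return both hyperthreads
// of one physical core (CPUs 0 and 1 here). The topology literal below is an
// assumption made for illustration only and fills in just the fields used by
// this file, not a real machine description.
func examplePackedAllocation() (cpuset.CPUSet, error) {
	topo := &topology.CPUTopology{
		NumCPUs:      4,
		NumCores:     2,
		NumSockets:   1,
		NumNUMANodes: 1,
		CPUDetails: topology.CPUDetails{
			0: {CoreID: 0, SocketID: 0, NUMANodeID: 0},
			1: {CoreID: 0, SocketID: 0, NUMANodeID: 0},
			2: {CoreID: 1, SocketID: 0, NUMANodeID: 0},
			3: {CoreID: 1, SocketID: 0, NUMANodeID: 0},
		},
	}
	return takeByTopologyNUMAPacked(topo, cpuset.New(0, 1, 2, 3), 2)
}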
// takeByTopologyNUMADistributed returns a CPUSet of size 'numCPUs'.
//
// It generates this CPUSet by allocating CPUs from 'availableCPUs' according
// to the algorithm outlined in KEP-2902:
//
// https://github.com/kubernetes/enhancements/tree/e7f51ffbe2ee398ffd1fba4a6d854f276bfad9fb/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option
//
// This algorithm evenly distributes CPUs across NUMA nodes in cases where more
// than one NUMA node is required to satisfy the allocation. This is in
// contrast to the takeByTopologyNUMAPacked algorithm, which attempts to 'pack'
// CPUs onto NUMA nodes and fill them up before moving on to the next one.
//
// At a high-level this algorithm can be summarized as:
//
// For each single NUMA node:
//   - If all requested CPUs can be allocated from this NUMA node;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in that NUMA node and return
//
// Otherwise, for each pair of NUMA nodes:
//   - If the set of requested CPUs (modulo 2) can be evenly split across
//     the 2 NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in both NUMA nodes and return
//
// Otherwise, for each 3-tuple of NUMA nodes:
//   - If the set of requested CPUs (modulo 3) can be evenly distributed
//     across the 3 NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in all three NUMA nodes and return
//
// ...
//
// Otherwise, for the set of all NUMA nodes:
//   - If the set of requested CPUs (modulo NUM_NUMA_NODES) can be evenly
//     distributed across all NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in all NUMA nodes and return
//
// If none of the above conditions can be met, then resort back to a
// best-effort fit of packing CPUs into NUMA nodes by calling
// takeByTopologyNUMAPacked() over all available CPUs.
//
// NOTE: A "balance score" will be calculated to help find the best subset of
// NUMA nodes to allocate any 'remainder' CPUs from (in cases where the total
// number of CPUs to allocate cannot be evenly distributed across the chosen
// set of NUMA nodes). This "balance score" is calculated as the standard
// deviation of how many CPUs will be available on each NUMA node after all
// evenly distributed and remainder CPUs are allocated. The subset with the
// lowest "balance score" will receive the CPUs in order to keep the overall
// allocation of CPUs as "balanced" as possible.
//
// NOTE: This algorithm has been generalized to take an additional
// 'cpuGroupSize' parameter to ensure that CPUs are always allocated in groups
// of size 'cpuGroupSize' according to the algorithm described above. This is
// important, for example, to ensure that all CPUs (i.e. all hyperthreads) from
// a single core are allocated together.
func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int, cpuGroupSize int) (cpuset.CPUSet, error) {
	// If the number of CPUs requested cannot be handed out in chunks of
	// 'cpuGroupSize', then we just fall back to the packing algorithm since
	// we can't distribute CPUs in this chunk size.
	if (numCPUs % cpuGroupSize) != 0 {
		return takeByTopologyNUMAPacked(topo, availableCPUs, numCPUs)
	}

	// Otherwise build an accumulator to start allocating CPUs from.
	acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
	if acc.isSatisfied() {
		return acc.result, nil
	}
	if acc.isFailed() {
		return cpuset.New(), fmt.Errorf("not enough cpus available to satisfy request: requested=%d, available=%d", numCPUs, availableCPUs.Size())
	}

	// Get the list of NUMA nodes represented by the set of CPUs in 'availableCPUs'.
	numas := acc.sortAvailableNUMANodes()

	// Calculate the minimum and maximum possible number of NUMA nodes that
	// could satisfy this request. This is used to optimize how many iterations
	// of the loop we need to go through below.
	minNUMAs, maxNUMAs := acc.rangeNUMANodesNeededToSatisfy(cpuGroupSize)

	// Try combinations of 1,2,3,... NUMA nodes until we find a combination
	// where we can evenly distribute CPUs across them. To optimize things, we
	// don't always start at 1 and end at len(numas). Instead, we use the
	// values of 'minNUMAs' and 'maxNUMAs' calculated above.
	for k := minNUMAs; k <= maxNUMAs; k++ {
		// Iterate through the various n-choose-k NUMA node combinations,
		// looking for the combination of NUMA nodes that can best have CPUs
		// distributed across them.
		var bestBalance float64 = math.MaxFloat64
		var bestRemainder []int = nil
		var bestCombo []int = nil
		acc.iterateCombinations(numas, k, func(combo []int) LoopControl {
			// If we've already found a combo with a balance of 0 in a
			// different iteration, then don't bother checking any others.
			if bestBalance == 0 {
				return Break
			}

			// Check that this combination of NUMA nodes has enough CPUs to
			// satisfy the allocation overall.
			cpus := acc.details.CPUsInNUMANodes(combo...)
			if cpus.Size() < numCPUs {
				return Continue
			}

			// Check that CPUs can be handed out in groups of size
			// 'cpuGroupSize' across the NUMA nodes in this combo.
			numCPUGroups := 0
			for _, numa := range combo {
				numCPUGroups += (acc.details.CPUsInNUMANodes(numa).Size() / cpuGroupSize)
			}
			if (numCPUGroups * cpuGroupSize) < numCPUs {
				return Continue
			}

			// Check that each NUMA node in this combination can allocate an
			// even distribution of CPUs in groups of size 'cpuGroupSize',
			// modulo some remainder.
			distribution := (numCPUs / len(combo) / cpuGroupSize) * cpuGroupSize
			for _, numa := range combo {
				cpus := acc.details.CPUsInNUMANodes(numa)
				if cpus.Size() < distribution {
					return Continue
				}
			}

			// Calculate how many CPUs will be available on each NUMA node in
			// the system after allocating an even distribution of CPU groups
			// of size 'cpuGroupSize' from each NUMA node in 'combo'. This will
			// be used in the "balance score" calculation to help decide if
			// this combo should ultimately be chosen.
			availableAfterAllocation := make(mapIntInt, len(numas))
			for _, numa := range numas {
				availableAfterAllocation[numa] = acc.details.CPUsInNUMANodes(numa).Size()
			}
			for _, numa := range combo {
				availableAfterAllocation[numa] -= distribution
			}

			// Check if there are any remaining CPUs to distribute across the
			// NUMA nodes once CPUs have been evenly distributed in groups of
			// size 'cpuGroupSize'.
			remainder := numCPUs - (distribution * len(combo))

			// Get a list of NUMA nodes to consider pulling the remainder CPUs
			// from. This list excludes NUMA nodes that don't have at least
			// 'cpuGroupSize' CPUs available after being allocated
			// 'distribution' number of CPUs.
			var remainderCombo []int
			for _, numa := range combo {
				if availableAfterAllocation[numa] >= cpuGroupSize {
					remainderCombo = append(remainderCombo, numa)
				}
			}

			// Declare a set of local variables to help track the "balance
			// scores" calculated when using different subsets of
			// 'remainderCombo' to allocate remainder CPUs from.
			var bestLocalBalance float64 = math.MaxFloat64
			var bestLocalRemainder []int = nil

			// If there aren't any remainder CPUs to allocate, then calculate
			// the "balance score" of this combo as the standard deviation of
			// the values contained in 'availableAfterAllocation'.
			if remainder == 0 {
				bestLocalBalance = standardDeviation(availableAfterAllocation.Values())
				bestLocalRemainder = nil
			}

			// Otherwise, find the best "balance score" when allocating the
			// remainder CPUs across different subsets of NUMA nodes in 'remainderCombo'.
			// These remainder CPUs are handed out in groups of size 'cpuGroupSize'.
			// We start from k=len(remainderCombo) and walk down to k=1 so that
			// we continue to distribute CPUs as much as possible across
			// multiple NUMA nodes.
			for k := len(remainderCombo); remainder > 0 && k >= 1; k-- {
				acc.iterateCombinations(remainderCombo, k, func(subset []int) LoopControl {
					// Make a local copy of 'remainder'.
					remainder := remainder

					// Make a local copy of 'availableAfterAllocation'.
					availableAfterAllocation := availableAfterAllocation.Clone()

					// If this subset is not capable of allocating all
					// remainder CPUs, continue to the next one.
					if sum(availableAfterAllocation.Values(subset...)) < remainder {
						return Continue
					}

					// For all NUMA nodes in 'subset', walk through them,
					// removing 'cpuGroupSize' number of CPUs from each
					// until all remainder CPUs have been accounted for.
					for remainder > 0 {
						for _, numa := range subset {
							if remainder == 0 {
								break
							}
							if availableAfterAllocation[numa] < cpuGroupSize {
								continue
							}
							availableAfterAllocation[numa] -= cpuGroupSize
							remainder -= cpuGroupSize
						}
					}

					// Calculate the "balance score" as the standard deviation
					// of the number of CPUs available on all NUMA nodes in the
					// system after the remainder CPUs have been allocated
					// across 'subset' in groups of size 'cpuGroupSize'.
					balance := standardDeviation(availableAfterAllocation.Values())
					if balance < bestLocalBalance {
						bestLocalBalance = balance
						bestLocalRemainder = subset
					}

					return Continue
				})
			}

			// If the best "balance score" for this combo is less than the
			// lowest "balance score" of all previous combos, then update this
			// combo (and remainder set) to be the best one found so far.
			if bestLocalBalance < bestBalance {
				bestBalance = bestLocalBalance
				bestRemainder = bestLocalRemainder
				bestCombo = combo
			}

			return Continue
		})

		// If we made it through all of the iterations above without finding a
		// combination of NUMA nodes that can properly balance CPU allocations,
		// then move on to the next larger set of NUMA node combinations.
		if bestCombo == nil {
			continue
		}

		// Otherwise, start allocating CPUs from the NUMA node combination
		// chosen. First allocate an even distribution of CPUs in groups of
		// size 'cpuGroupSize' from 'bestCombo'.
		distribution := (numCPUs / len(bestCombo) / cpuGroupSize) * cpuGroupSize
		for _, numa := range bestCombo {
			cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), distribution)
			acc.take(cpus)
		}

		// Then allocate any remaining CPUs in groups of size 'cpuGroupSize'
		// from each NUMA node in the remainder set.
		remainder := numCPUs - (distribution * len(bestCombo))
		for remainder > 0 {
			for _, numa := range bestRemainder {
				if remainder == 0 {
					break
				}
				if acc.details.CPUsInNUMANodes(numa).Size() < cpuGroupSize {
					continue
				}
				cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), cpuGroupSize)
				acc.take(cpus)
				remainder -= cpuGroupSize
			}
		}

		// If we haven't allocated all of our CPUs at this point, then something
		// went wrong in our accounting and we should error out.
		if acc.numCPUsNeeded > 0 {
			return cpuset.New(), fmt.Errorf("accounting error, not enough CPUs allocated, remaining: %v", acc.numCPUsNeeded)
		}

		// Likewise, if we have allocated too many CPUs at this point, then something
		// went wrong in our accounting and we should error out.
		if acc.numCPUsNeeded < 0 {
			return cpuset.New(), fmt.Errorf("accounting error, too many CPUs allocated, remaining: %v", acc.numCPUsNeeded)
		}

		// Otherwise, return the result.
		return acc.result, nil
	}

	// If we never found a combination of NUMA nodes that we could properly
	// distribute CPUs across, fall back to the packing algorithm.
	return takeByTopologyNUMAPacked(topo, availableCPUs, numCPUs)
}
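// Illustrative sketch (hypothetical topology and helper, for exposition only):
// a node with 2 sockets, one NUMA node per socket, and 3 hyperthreaded cores
// per NUMA node (12 logical CPUs). Requesting 8 CPUs with cpuGroupSize=2 from
// takeByTopologyNUMADistributed is expected to take 4 CPUs from each NUMA
// node, whereas takeByTopologyNUMAPacked would take 6 from the first NUMA
// node and only 2 from the second. The topology literal is an assumption made
// for illustration and fills in just the fields used by this file.
func exampleDistributedAllocation() (cpuset.CPUSet, error) {
	topo := &topology.CPUTopology{
		NumCPUs:      12,
		NumCores:     6,
		NumSockets:   2,
		NumNUMANodes: 2,
		CPUDetails: topology.CPUDetails{
			// Socket 0 / NUMA node 0: cores 0-2, CPUs 0-5.
			0: {CoreID: 0, SocketID: 0, NUMANodeID: 0},
			1: {CoreID: 0, SocketID: 0, NUMANodeID: 0},
			2: {CoreID: 1, SocketID: 0, NUMANodeID: 0},
			3: {CoreID: 1, SocketID: 0, NUMANodeID: 0},
			4: {CoreID: 2, SocketID: 0, NUMANodeID: 0},
			5: {CoreID: 2, SocketID: 0, NUMANodeID: 0},
			// Socket 1 / NUMA node 1: cores 3-5, CPUs 6-11.
			6:  {CoreID: 3, SocketID: 1, NUMANodeID: 1},
			7:  {CoreID: 3, SocketID: 1, NUMANodeID: 1},
			8:  {CoreID: 4, SocketID: 1, NUMANodeID: 1},
			9:  {CoreID: 4, SocketID: 1, NUMANodeID: 1},
			10: {CoreID: 5, SocketID: 1, NUMANodeID: 1},
			11: {CoreID: 5, SocketID: 1, NUMANodeID: 1},
		},
	}
	allCPUs := cpuset.New(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
	return takeByTopologyNUMADistributed(topo, allCPUs, 8, 2)
}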