/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cpumanager

import (
	"fmt"
	"math"
	"sort"

	"k8s.io/klog/v2"

	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
	"k8s.io/utils/cpuset"
)

// LoopControl controls the behavior of the cpu accumulator loop logic
type LoopControl int

// Possible loop control outcomes
const (
	Continue LoopControl = iota
	Break
)

type mapIntInt map[int]int

func (m mapIntInt) Clone() mapIntInt {
	cp := make(mapIntInt, len(m))
	for k, v := range m {
		cp[k] = v
	}
	return cp
}

func (m mapIntInt) Keys() []int {
	var keys []int
	for k := range m {
		keys = append(keys, k)
	}
	return keys
}

func (m mapIntInt) Values(keys ...int) []int {
	if keys == nil {
		keys = m.Keys()
	}
	var values []int
	for _, k := range keys {
		values = append(values, m[k])
	}
	return values
}

func sum(xs []int) int {
	var s int
	for _, x := range xs {
		s += x
	}
	return s
}

func mean(xs []int) float64 {
	var sum float64
	for _, x := range xs {
		sum += float64(x)
	}
	m := sum / float64(len(xs))
	return math.Round(m*1000) / 1000
}

func standardDeviation(xs []int) float64 {
	m := mean(xs)
	var sum float64
	for _, x := range xs {
		sum += (float64(x) - m) * (float64(x) - m)
	}
	s := math.Sqrt(sum / float64(len(xs)))
	return math.Round(s*1000) / 1000
}
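// Illustrative sketch (hypothetical helper, not used by the allocator): how the
// statistics helpers above feed the "balance score" used further below. Given
// three NUMA nodes with 4, 2, and 2 free CPUs, mean() returns 2.667 and
// standardDeviation() returns 0.943; a perfectly even spread would score 0, so
// lower scores mean a more balanced allocation.
func exampleBalanceScore() float64 {
	freeCPUsPerNUMANode := []int{4, 2, 2}
	return standardDeviation(freeCPUsPerNUMANode) // 0.943
}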
type numaOrSocketsFirstFuncs interface {
	takeFullFirstLevel()
	takeFullSecondLevel()
	sortAvailableNUMANodes() []int
	sortAvailableSockets() []int
	sortAvailableCores() []int
}

type numaFirst struct{ acc *cpuAccumulator }
type socketsFirst struct{ acc *cpuAccumulator }

var _ numaOrSocketsFirstFuncs = (*numaFirst)(nil)
var _ numaOrSocketsFirstFuncs = (*socketsFirst)(nil)

// If NUMA nodes are higher in the memory hierarchy than sockets, then we take
// from the set of NUMA Nodes as the first level.
func (n *numaFirst) takeFullFirstLevel() {
	n.acc.takeFullNUMANodes()
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then we take
// from the set of sockets as the second level.
func (n *numaFirst) takeFullSecondLevel() {
	n.acc.takeFullSockets()
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then just
// sort the NUMA nodes directly, and return them.
func (n *numaFirst) sortAvailableNUMANodes() []int {
	numas := n.acc.details.NUMANodes().UnsortedList()
	n.acc.sort(numas, n.acc.details.CPUsInNUMANodes)
	return numas
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then we need
// to pull the set of sockets out of each sorted NUMA node, and accumulate the
// partial order across them.
func (n *numaFirst) sortAvailableSockets() []int {
	var result []int
	for _, numa := range n.sortAvailableNUMANodes() {
		sockets := n.acc.details.SocketsInNUMANodes(numa).UnsortedList()
		n.acc.sort(sockets, n.acc.details.CPUsInSockets)
		result = append(result, sockets...)
	}
	return result
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then
// cores sit directly below sockets in the memory hierarchy.
func (n *numaFirst) sortAvailableCores() []int {
	var result []int
	for _, socket := range n.acc.sortAvailableSockets() {
		cores := n.acc.details.CoresInSockets(socket).UnsortedList()
		n.acc.sort(cores, n.acc.details.CPUsInCores)
		result = append(result, cores...)
	}
	return result
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we take
// from the set of sockets as the first level.
func (s *socketsFirst) takeFullFirstLevel() {
	s.acc.takeFullSockets()
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we take
// from the set of NUMA Nodes as the second level.
func (s *socketsFirst) takeFullSecondLevel() {
	s.acc.takeFullNUMANodes()
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we need
// to pull the set of NUMA nodes out of each sorted Socket, and accumulate the
// partial order across them.
func (s *socketsFirst) sortAvailableNUMANodes() []int {
	var result []int
	for _, socket := range s.sortAvailableSockets() {
		numas := s.acc.details.NUMANodesInSockets(socket).UnsortedList()
		s.acc.sort(numas, s.acc.details.CPUsInNUMANodes)
		result = append(result, numas...)
	}
	return result
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then just
// sort the sockets directly, and return them.
func (s *socketsFirst) sortAvailableSockets() []int {
	sockets := s.acc.details.Sockets().UnsortedList()
	s.acc.sort(sockets, s.acc.details.CPUsInSockets)
	return sockets
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then cores
// sit directly below NUMA Nodes in the memory hierarchy.
func (s *socketsFirst) sortAvailableCores() []int {
	var result []int
	for _, numa := range s.acc.sortAvailableNUMANodes() {
		cores := s.acc.details.CoresInNUMANodes(numa).UnsortedList()
		s.acc.sort(cores, s.acc.details.CPUsInCores)
		result = append(result, cores...)
	}
	return result
}

type cpuAccumulator struct {
	// `topo` describes the layout of CPUs (i.e. hyper-threads if hyperthreading is on) between
	// cores (i.e. physical CPUs if hyper-threading is on), NUMA nodes, and sockets on the K8s
	// cluster node. `topo` is never mutated, meaning that as the cpuAccumulator claims CPUs,
	// `topo` is not modified. Its primary purpose is to serve as a reference of the original
	// topology (i.e. as it was when the cpuAccumulator was created), to learn things such as
	// how many CPUs are on each socket, NUMA node, etc...
	topo *topology.CPUTopology

	// `details` is the set of free CPUs that the cpuAccumulator can claim to accumulate the
	// desired number of CPUs. When a CPU is claimed, it's removed from `details`.
	details topology.CPUDetails

	// `numCPUsNeeded` is the number of CPUs that the accumulator still needs to accumulate to
	// reach the desired number of CPUs. When the cpuAccumulator is created, `numCPUsNeeded` is
	// set to the total number of CPUs to accumulate. Every time a CPU is claimed,
	// `numCPUsNeeded` is decreased by 1, until it either reaches 0, meaning that all the needed
	// CPUs have been accumulated (success), or it is still bigger than 0 but no more CPUs are
	// available (failure).
	numCPUsNeeded int

	// `result` is the set of CPUs that have been accumulated so far. When a CPU is claimed, it's
	// added to `result`. The cpuAccumulator has completed its duty successfully when `result`
	// has cardinality equal to the total number of CPUs to accumulate.
	result cpuset.CPUSet

	numaOrSocketsFirst numaOrSocketsFirstFuncs
}

func newCPUAccumulator(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) *cpuAccumulator {
	acc := &cpuAccumulator{
		topo:          topo,
		details:       topo.CPUDetails.KeepOnly(availableCPUs),
		numCPUsNeeded: numCPUs,
		result:        cpuset.New(),
	}

	if topo.NumSockets >= topo.NumNUMANodes {
		acc.numaOrSocketsFirst = &numaFirst{acc}
	} else {
		acc.numaOrSocketsFirst = &socketsFirst{acc}
	}

	return acc
}
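// For concreteness (hypothetical layouts, illustrating the branch in
// newCPUAccumulator above): a node with 2 sockets that each contain 2 NUMA
// nodes has NumSockets (2) < NumNUMANodes (4), so sockets are the larger
// grouping and socketsFirst is selected; a node with 2 sockets and one NUMA
// node per socket has NumSockets (2) >= NumNUMANodes (2) and selects
// numaFirst, taking whole NUMA nodes before whole sockets.
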
// Returns true if the supplied NUMANode is fully available in `a.details`.
// "fully available" means that all the CPUs in it are free.
func (a *cpuAccumulator) isNUMANodeFree(numaID int) bool {
	return a.details.CPUsInNUMANodes(numaID).Size() == a.topo.CPUDetails.CPUsInNUMANodes(numaID).Size()
}

// Returns true if the supplied socket is fully available in `a.details`.
// "fully available" means that all the CPUs in it are free.
func (a *cpuAccumulator) isSocketFree(socketID int) bool {
	return a.details.CPUsInSockets(socketID).Size() == a.topo.CPUsPerSocket()
}

// Returns true if the supplied core is fully available in `a.details`.
// "fully available" means that all the CPUs in it are free.
func (a *cpuAccumulator) isCoreFree(coreID int) bool {
	return a.details.CPUsInCores(coreID).Size() == a.topo.CPUsPerCore()
}

// Returns free NUMA Node IDs as a slice sorted by sortAvailableNUMANodes().
func (a *cpuAccumulator) freeNUMANodes() []int {
	free := []int{}
	for _, numa := range a.sortAvailableNUMANodes() {
		if a.isNUMANodeFree(numa) {
			free = append(free, numa)
		}
	}
	return free
}

// Returns free socket IDs as a slice sorted by sortAvailableSockets().
func (a *cpuAccumulator) freeSockets() []int {
	free := []int{}
	for _, socket := range a.sortAvailableSockets() {
		if a.isSocketFree(socket) {
			free = append(free, socket)
		}
	}
	return free
}

// Returns free core IDs as a slice sorted by sortAvailableCores().
func (a *cpuAccumulator) freeCores() []int {
	free := []int{}
	for _, core := range a.sortAvailableCores() {
		if a.isCoreFree(core) {
			free = append(free, core)
		}
	}
	return free
}

// Returns free CPU IDs as a slice sorted by sortAvailableCPUs().
func (a *cpuAccumulator) freeCPUs() []int {
	return a.sortAvailableCPUs()
}

// Sorts the provided list of NUMA nodes/sockets/cores/cpus referenced in 'ids'
// by the number of available CPUs contained within them (smallest to largest).
// The 'getCPUs()' parameter defines the function that should be called to
// retrieve the list of available CPUs for the type being referenced. If two
// NUMA nodes/sockets/cores/cpus have the same number of available CPUs, they
// are sorted in ascending order by their id.
func (a *cpuAccumulator) sort(ids []int, getCPUs func(ids ...int) cpuset.CPUSet) {
	sort.Slice(ids,
		func(i, j int) bool {
			iCPUs := getCPUs(ids[i])
			jCPUs := getCPUs(ids[j])
			if iCPUs.Size() < jCPUs.Size() {
				return true
			}
			if iCPUs.Size() > jCPUs.Size() {
				return false
			}
			return ids[i] < ids[j]
		})
}
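// Illustrative sketch (hypothetical helper): the ordering produced by sort()
// above for three sockets whose free CPUs are {0,1,2,3}, {8,9} and {10,11}.
// Sockets 1 and 2 tie with two free CPUs each and are ordered by id, while
// socket 0 comes last because it has the most free CPUs, giving [1, 2, 0].
// The receiver is not used by sort(), so a zero-value accumulator suffices here.
func exampleSortOrder() []int {
	freeCPUs := map[int]cpuset.CPUSet{
		0: cpuset.New(0, 1, 2, 3),
		1: cpuset.New(8, 9),
		2: cpuset.New(10, 11),
	}
	ids := []int{0, 1, 2}
	(&cpuAccumulator{}).sort(ids, func(ids ...int) cpuset.CPUSet {
		result := cpuset.New()
		for _, id := range ids {
			result = result.Union(freeCPUs[id])
		}
		return result
	})
	return ids // [1, 2, 0]
}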
// Sort all NUMA nodes with at least one free CPU.
//
// If NUMA nodes are higher than sockets in the memory hierarchy, they are sorted by ascending number
// of free CPUs that they contain. "higher than sockets in the memory hierarchy" means that NUMA nodes
// contain a bigger number of CPUs (free and busy) than sockets, or equivalently that each NUMA node
// contains more than one socket.
//
// If instead NUMA nodes are lower in the memory hierarchy than sockets, they are sorted as follows.
// First, they are sorted by number of free CPUs in the sockets that contain them. Then, for each
// socket they are sorted by number of free CPUs that they contain. The order is always ascending.
// In other words, the relative order of two NUMA nodes is determined as follows:
//  1. If the two NUMA nodes belong to different sockets, the NUMA node in the socket with the
//     smaller amount of free CPUs appears first.
//  2. If the two NUMA nodes belong to the same socket, the NUMA node with the smaller amount of free
//     CPUs appears first.
func (a *cpuAccumulator) sortAvailableNUMANodes() []int {
	return a.numaOrSocketsFirst.sortAvailableNUMANodes()
}

// Sort all sockets with at least one free CPU.
//
// If sockets are higher than NUMA nodes in the memory hierarchy, they are sorted by ascending number
// of free CPUs that they contain. "higher than NUMA nodes in the memory hierarchy" means that
// sockets contain a bigger number of CPUs (free and busy) than NUMA nodes, or equivalently that each
// socket contains more than one NUMA node.
//
// If instead sockets are lower in the memory hierarchy than NUMA nodes, they are sorted as follows.
// First, they are sorted by number of free CPUs in the NUMA nodes that contain them. Then, for each
// NUMA node they are sorted by number of free CPUs that they contain. The order is always ascending.
// In other words, the relative order of two sockets is determined as follows:
//  1. If the two sockets belong to different NUMA nodes, the socket in the NUMA node with the
//     smaller amount of free CPUs appears first.
//  2. If the two sockets belong to the same NUMA node, the socket with the smaller amount of free
//     CPUs appears first.
func (a *cpuAccumulator) sortAvailableSockets() []int {
	return a.numaOrSocketsFirst.sortAvailableSockets()
}

// Sort all cores with at least one free CPU.
//
// If sockets are higher in the memory hierarchy than NUMA nodes, meaning that sockets contain a
// bigger number of CPUs (free and busy) than NUMA nodes, or equivalently that each socket contains
// more than one NUMA node, the cores are sorted as follows. First, they are sorted by number of
// free CPUs that their sockets contain. Then, for each socket, the cores in it are sorted by number
// of free CPUs that their NUMA nodes contain. Then, for each NUMA node, the cores in it are sorted
// by number of free CPUs that they contain. The order is always ascending. In other words, the
// relative order of two cores is determined as follows:
//  1. If the two cores belong to different sockets, the core in the socket with the smaller amount of
//     free CPUs appears first.
//  2. If the two cores belong to the same socket but different NUMA nodes, the core in the NUMA node
//     with the smaller amount of free CPUs appears first.
//  3. If the two cores belong to the same NUMA node and socket, the core with the smaller amount of
//     free CPUs appears first.
//
// If instead NUMA nodes are higher in the memory hierarchy than sockets, the sorting happens in the
// same way as described in the previous paragraph, except that the priority of NUMA nodes and
// sockets is inverted (e.g. first sort the cores by number of free CPUs in their NUMA nodes, then,
// for each NUMA node, sort the cores by number of free CPUs in their sockets, etc...).
func (a *cpuAccumulator) sortAvailableCores() []int {
	return a.numaOrSocketsFirst.sortAvailableCores()
}
// Sort all free CPUs.
//
// If sockets are higher in the memory hierarchy than NUMA nodes, meaning that sockets contain a
// bigger number of CPUs (free and busy) than NUMA nodes, or equivalently that each socket contains
// more than one NUMA node, the CPUs are sorted as follows. First, they are sorted by number of
// free CPUs that their sockets contain. Then, for each socket, the CPUs in it are sorted by number
// of free CPUs that their NUMA nodes contain. Then, for each NUMA node, the CPUs in it are sorted
// by number of free CPUs that their cores contain. Finally, for each core, the CPUs in it are
// sorted by numerical ID. The order is always ascending. In other words, the relative order of two
// CPUs is determined as follows:
//  1. If the two CPUs belong to different sockets, the CPU in the socket with the smaller amount of
//     free CPUs appears first.
//  2. If the two CPUs belong to the same socket but different NUMA nodes, the CPU in the NUMA node
//     with the smaller amount of free CPUs appears first.
//  3. If the two CPUs belong to the same socket and NUMA node but different cores, the CPU in the
//     core with the smaller amount of free CPUs appears first.
//  4. If the two CPUs belong to the same NUMA node, socket, and core, the CPU with the smaller ID
//     appears first.
//
// If instead NUMA nodes are higher in the memory hierarchy than sockets, the sorting happens in the
// same way as described in the previous paragraph, except that the priority of NUMA nodes and
// sockets is inverted (e.g. first sort the CPUs by number of free CPUs in their NUMA nodes, then,
// for each NUMA node, sort the CPUs by number of free CPUs in their sockets, etc...).
func (a *cpuAccumulator) sortAvailableCPUs() []int {
	var result []int
	for _, core := range a.sortAvailableCores() {
		cpus := a.details.CPUsInCores(core).UnsortedList()
		sort.Ints(cpus)
		result = append(result, cpus...)
	}
	return result
}

func (a *cpuAccumulator) take(cpus cpuset.CPUSet) {
	a.result = a.result.Union(cpus)
	a.details = a.details.KeepOnly(a.details.CPUs().Difference(a.result))
	a.numCPUsNeeded -= cpus.Size()
}
func (a *cpuAccumulator) takeFullNUMANodes() {
	for _, numa := range a.freeNUMANodes() {
		cpusInNUMANode := a.topo.CPUDetails.CPUsInNUMANodes(numa)
		if !a.needsAtLeast(cpusInNUMANode.Size()) {
			continue
		}
		klog.V(4).InfoS("takeFullNUMANodes: claiming NUMA node", "numa", numa)
		a.take(cpusInNUMANode)
	}
}

func (a *cpuAccumulator) takeFullSockets() {
	for _, socket := range a.freeSockets() {
		cpusInSocket := a.topo.CPUDetails.CPUsInSockets(socket)
		if !a.needsAtLeast(cpusInSocket.Size()) {
			continue
		}
		klog.V(4).InfoS("takeFullSockets: claiming socket", "socket", socket)
		a.take(cpusInSocket)
	}
}

func (a *cpuAccumulator) takeFullCores() {
	for _, core := range a.freeCores() {
		cpusInCore := a.topo.CPUDetails.CPUsInCores(core)
		if !a.needsAtLeast(cpusInCore.Size()) {
			continue
		}
		klog.V(4).InfoS("takeFullCores: claiming core", "core", core)
		a.take(cpusInCore)
	}
}

func (a *cpuAccumulator) takeRemainingCPUs() {
	for _, cpu := range a.sortAvailableCPUs() {
		klog.V(4).InfoS("takeRemainingCPUs: claiming CPU", "cpu", cpu)
		a.take(cpuset.New(cpu))
		if a.isSatisfied() {
			return
		}
	}
}

// rangeNUMANodesNeededToSatisfy returns the minimum and maximum (in this order) number of NUMA
// nodes needed to satisfy the cpuAccumulator's goal of accumulating `a.numCPUsNeeded` CPUs,
// assuming that CPU groups have size given by the `cpuGroupSize` argument.
func (a *cpuAccumulator) rangeNUMANodesNeededToSatisfy(cpuGroupSize int) (minNumNUMAs, maxNumNUMAs int) {
	// Get the total number of NUMA nodes in the system.
	numNUMANodes := a.topo.CPUDetails.NUMANodes().Size()

	// Get the total number of NUMA nodes that have CPUs available on them.
	numNUMANodesAvailable := a.details.NUMANodes().Size()

	// Get the total number of CPUs in the system.
	numCPUs := a.topo.CPUDetails.CPUs().Size()

	// Get the total number of 'cpuGroups' in the system (rounding up).
	numCPUGroups := (numCPUs-1)/cpuGroupSize + 1

	// Calculate the number of 'cpuGroups' per NUMA node in the system (rounding up).
	numCPUGroupsPerNUMANode := (numCPUGroups-1)/numNUMANodes + 1

	// Calculate the number of 'cpuGroups' that need to be allocated (rounding up).
	numCPUGroupsNeeded := (a.numCPUsNeeded-1)/cpuGroupSize + 1

	// Calculate the minimum number of NUMA nodes required to satisfy the
	// allocation (rounding up).
	minNumNUMAs = (numCPUGroupsNeeded-1)/numCPUGroupsPerNUMANode + 1

	// Calculate the maximum number of NUMA nodes required to satisfy the allocation.
	maxNumNUMAs = min(numCPUGroupsNeeded, numNUMANodesAvailable)

	return
}
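// Worked example for rangeNUMANodesNeededToSatisfy above (hypothetical
// numbers): on a 40-CPU, 4-NUMA-node system with cpuGroupSize=2 and 12 CPUs
// still needed, numCPUGroups = (40-1)/2+1 = 20, numCPUGroupsPerNUMANode =
// (20-1)/4+1 = 5, and numCPUGroupsNeeded = (12-1)/2+1 = 6, giving
// minNumNUMAs = (6-1)/5+1 = 2; if all 4 NUMA nodes still have CPUs available,
// maxNumNUMAs = min(6, 4) = 4.
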
// needsAtLeast returns true if and only if the accumulator needs at least `n` CPUs.
// This means that needsAtLeast returns true even if more than `n` CPUs are needed.
func (a *cpuAccumulator) needsAtLeast(n int) bool {
	return a.numCPUsNeeded >= n
}

// isSatisfied returns true if and only if the accumulator has all the CPUs it needs.
func (a *cpuAccumulator) isSatisfied() bool {
	return a.numCPUsNeeded < 1
}

// isFailed returns true if and only if there aren't enough available CPUs in the system
// (e.g. the accumulator needs 4 CPUs but only 3 are available).
func (a *cpuAccumulator) isFailed() bool {
	return a.numCPUsNeeded > a.details.CPUs().Size()
}

// iterateCombinations walks through all n-choose-k subsets of size k in n and
// calls function 'f()' on each subset. For example, if n={0,1,2} and k=2,
// then f() will be called on the subsets {0,1}, {0,2}, and {1,2}. If f() ever
// returns 'Break', we break early and exit the loop.
func (a *cpuAccumulator) iterateCombinations(n []int, k int, f func([]int) LoopControl) {
	if k < 1 {
		return
	}

	var helper func(n []int, k int, start int, accum []int, f func([]int) LoopControl) LoopControl
	helper = func(n []int, k int, start int, accum []int, f func([]int) LoopControl) LoopControl {
		if k == 0 {
			return f(accum)
		}
		for i := start; i <= len(n)-k; i++ {
			control := helper(n, k-1, i+1, append(accum, n[i]), f)
			if control == Break {
				return Break
			}
		}
		return Continue
	}

	helper(n, k, 0, []int{}, f)
}
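// Illustrative sketch (hypothetical helper): collecting the subsets visited by
// iterateCombinations for n={0,1,2} and k=2, i.e. {0,1}, {0,2} and {1,2}. The
// receiver is not used by iterateCombinations, so a zero-value accumulator is
// enough here; each combo is copied because the callback's slice may share
// backing storage between invocations.
func exampleIterateCombinations() [][]int {
	var subsets [][]int
	(&cpuAccumulator{}).iterateCombinations([]int{0, 1, 2}, 2, func(combo []int) LoopControl {
		subset := make([]int, len(combo))
		copy(subset, combo)
		subsets = append(subsets, subset)
		return Continue
	})
	return subsets
}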
// takeByTopologyNUMAPacked returns a CPUSet containing `numCPUs` CPUs taken from the CPUs in the
// set `availableCPUs`. `topo` describes how the CPUs are arranged between sockets, NUMA nodes
// and physical cores (if hyperthreading is on, a "CPU" is a thread rather than a full physical
// core).
//
// If sockets are higher than NUMA nodes in the memory hierarchy (i.e. a socket contains more than
// one NUMA node), the CPUs are selected as follows.
//
// If `numCPUs` is bigger than the total number of CPUs in a socket, and there are free (i.e. all
// CPUs in them are free) sockets, the function takes as many entire free sockets as possible.
// If there are no free sockets, or `numCPUs` is less than a whole socket, or the remaining number
// of CPUs to take after having taken some whole sockets is less than a whole socket, the function
// tries to take whole NUMA nodes.
//
// If the remaining number of CPUs to take is bigger than the total number of CPUs in a NUMA node,
// and there are free (i.e. all CPUs in them are free) NUMA nodes, the function takes as many entire
// free NUMA nodes as possible. The free NUMA nodes are taken from one socket at a time, and the
// sockets are considered by ascending order of free CPUs in them. If there are no free NUMA nodes,
// or the remaining number of CPUs to take after having taken full sockets and NUMA nodes is less
// than a whole NUMA node, the function tries to take whole physical cores (cores).
//
// If `numCPUs` is bigger than the total number of CPUs in a core, and there are
// free (i.e. all CPUs in them are free) cores, the function takes as many entire free cores as possible.
// The cores are taken from one socket at a time, and the sockets are considered by
// ascending order of free CPUs in them. For a given socket, the cores are taken one NUMA node at a time,
// and the NUMA nodes are considered by ascending order of free CPUs in them. If there are no free
// cores, or the remaining number of CPUs to take after having taken full sockets, NUMA nodes and
// cores is less than a whole core, the function tries to take individual CPUs.
//
// The individual CPUs are taken from one socket at a time, and the sockets are considered by
// ascending order of free CPUs in them. For a given socket, the CPUs are taken one NUMA node at a time,
// and the NUMA nodes are considered by ascending order of free CPUs in them. For a given NUMA node, the
// CPUs are taken one core at a time, and the cores are considered by ascending order of free CPUs in them.
//
// If NUMA nodes are higher than sockets in the memory hierarchy (i.e. a NUMA node contains more
// than one socket), the CPUs are selected as written above, with the only differences being that
// (1) the order with which full sockets and full NUMA nodes are acquired is swapped, and (2) the
// order with which lower-level topology elements are selected is also swapped accordingly. E.g.
// when selecting full cores, the cores are selected starting from the ones in the NUMA node with
// the least amount of free CPUs to the one with the highest amount of free CPUs (i.e. in ascending
// order of free CPUs). For any NUMA node, the cores are selected from the ones in the socket with
// the least amount of free CPUs to the one with the highest amount of free CPUs.
func takeByTopologyNUMAPacked(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
	acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
	if acc.isSatisfied() {
		return acc.result, nil
	}
	if acc.isFailed() {
		return cpuset.New(), fmt.Errorf("not enough cpus available to satisfy request: requested=%d, available=%d", numCPUs, availableCPUs.Size())
	}

	// Algorithm: topology-aware best-fit
	// 1. Acquire whole NUMA nodes and sockets, if available and the container
	//    requires at least a NUMA node or socket's-worth of CPUs. If NUMA
	//    Nodes map to 1 or more sockets, pull from NUMA nodes first.
	//    Otherwise pull from sockets first.
	acc.numaOrSocketsFirst.takeFullFirstLevel()
	if acc.isSatisfied() {
		return acc.result, nil
	}
	acc.numaOrSocketsFirst.takeFullSecondLevel()
	if acc.isSatisfied() {
		return acc.result, nil
	}

	// 2. Acquire whole cores, if available and the container requires at least
	//    a core's-worth of CPUs.
	acc.takeFullCores()
	if acc.isSatisfied() {
		return acc.result, nil
	}

	// 3. Acquire single threads, preferring to fill partially-allocated cores
	//    on the same sockets as the whole cores we have already taken in this
	//    allocation.
	acc.takeRemainingCPUs()
	if acc.isSatisfied() {
		return acc.result, nil
	}

	return cpuset.New(), fmt.Errorf("failed to allocate cpus")
}
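// Illustrative sketch of calling takeByTopologyNUMAPacked on a hypothetical
// single-socket, single-NUMA-node machine with two hyperthreaded cores
// (4 logical CPUs). Asking for 2 CPUs is expected to return both hyperthreads
// of one physical core (CPUs 0 and 1 here). The topology literal below is an
// assumption made for illustration only and fills in just the fields used by
// this file, not a real machine description.
func examplePackedAllocation() (cpuset.CPUSet, error) {
	topo := &topology.CPUTopology{
		NumCPUs:      4,
		NumCores:     2,
		NumSockets:   1,
		NumNUMANodes: 1,
		CPUDetails: topology.CPUDetails{
			0: {CoreID: 0, SocketID: 0, NUMANodeID: 0},
			1: {CoreID: 0, SocketID: 0, NUMANodeID: 0},
			2: {CoreID: 1, SocketID: 0, NUMANodeID: 0},
			3: {CoreID: 1, SocketID: 0, NUMANodeID: 0},
		},
	}
	return takeByTopologyNUMAPacked(topo, cpuset.New(0, 1, 2, 3), 2)
}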
// takeByTopologyNUMADistributed returns a CPUSet of size 'numCPUs'.
//
// It generates this CPUSet by allocating CPUs from 'availableCPUs' according
// to the algorithm outlined in KEP-2902:
//
// https://github.com/kubernetes/enhancements/tree/e7f51ffbe2ee398ffd1fba4a6d854f276bfad9fb/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option
//
// This algorithm evenly distributes CPUs across NUMA nodes in cases where more
// than one NUMA node is required to satisfy the allocation. This is in
// contrast to the takeByTopologyNUMAPacked algorithm, which attempts to 'pack'
// CPUs onto NUMA nodes and fill them up before moving on to the next one.
//
// At a high-level this algorithm can be summarized as:
//
// For each single NUMA node:
//   - If all requested CPUs can be allocated from this NUMA node;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in that NUMA node and return
//
// Otherwise, for each pair of NUMA nodes:
//   - If the set of requested CPUs (modulo 2) can be evenly split across
//     the 2 NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in both NUMA nodes and return
//
// Otherwise, for each 3-tuple of NUMA nodes:
//   - If the set of requested CPUs (modulo 3) can be evenly distributed
//     across the 3 NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in all three NUMA nodes and return
//
// ...
//
// Otherwise, for the set of all NUMA nodes:
//   - If the set of requested CPUs (modulo NUM_NUMA_NODES) can be evenly
//     distributed across all NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in all NUMA nodes and return
//
// If none of the above conditions can be met, then resort back to a
// best-effort fit of packing CPUs into NUMA nodes by calling
// takeByTopologyNUMAPacked() over all available CPUs.
//
// NOTE: A "balance score" will be calculated to help find the best subset of
// NUMA nodes to allocate any 'remainder' CPUs from (in cases where the total
// number of CPUs to allocate cannot be evenly distributed across the chosen
// set of NUMA nodes). This "balance score" is calculated as the standard
// deviation of how many CPUs will be available on each NUMA node after all
// evenly distributed and remainder CPUs are allocated. The subset with the
// lowest "balance score" will receive the CPUs in order to keep the overall
// allocation of CPUs as "balanced" as possible.
//
// NOTE: This algorithm has been generalized to take an additional
// 'cpuGroupSize' parameter to ensure that CPUs are always allocated in groups
// of size 'cpuGroupSize' according to the algorithm described above. This is
// important, for example, to ensure that all CPUs (i.e. all hyperthreads) from
// a single core are allocated together.
func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int, cpuGroupSize int) (cpuset.CPUSet, error) {
	// If the number of CPUs requested cannot be handed out in chunks of
	// 'cpuGroupSize', then we just fall back to the packing algorithm since
	// we can't distribute CPUs in this chunk size.
	if (numCPUs % cpuGroupSize) != 0 {
		return takeByTopologyNUMAPacked(topo, availableCPUs, numCPUs)
	}

	// Otherwise build an accumulator to start allocating CPUs from.
	acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
	if acc.isSatisfied() {
		return acc.result, nil
	}
	if acc.isFailed() {
		return cpuset.New(), fmt.Errorf("not enough cpus available to satisfy request: requested=%d, available=%d", numCPUs, availableCPUs.Size())
	}

	// Get the list of NUMA nodes represented by the set of CPUs in 'availableCPUs'.
	numas := acc.sortAvailableNUMANodes()

	// Calculate the minimum and maximum possible number of NUMA nodes that
	// could satisfy this request. This is used to optimize how many iterations
	// of the loop we need to go through below.
	minNUMAs, maxNUMAs := acc.rangeNUMANodesNeededToSatisfy(cpuGroupSize)

	// Try combinations of 1,2,3,... NUMA nodes until we find a combination
	// where we can evenly distribute CPUs across them. To optimize things, we
	// don't always start at 1 and end at len(numas). Instead, we use the
	// values of 'minNUMAs' and 'maxNUMAs' calculated above.
	for k := minNUMAs; k <= maxNUMAs; k++ {
		// Iterate through the various n-choose-k NUMA node combinations,
		// looking for the combination of NUMA nodes that can best have CPUs
		// distributed across them.
		var bestBalance float64 = math.MaxFloat64
		var bestRemainder []int = nil
		var bestCombo []int = nil
		acc.iterateCombinations(numas, k, func(combo []int) LoopControl {
			// If we've already found a combo with a balance of 0 in a
			// different iteration, then don't bother checking any others.
			if bestBalance == 0 {
				return Break
			}

			// Check that this combination of NUMA nodes has enough CPUs to
			// satisfy the allocation overall.
			cpus := acc.details.CPUsInNUMANodes(combo...)
			if cpus.Size() < numCPUs {
				return Continue
			}

			// Check that CPUs can be handed out in groups of size
			// 'cpuGroupSize' across the NUMA nodes in this combo.
			numCPUGroups := 0
			for _, numa := range combo {
				numCPUGroups += (acc.details.CPUsInNUMANodes(numa).Size() / cpuGroupSize)
			}
			if (numCPUGroups * cpuGroupSize) < numCPUs {
				return Continue
			}

			// Check that each NUMA node in this combination can allocate an
			// even distribution of CPUs in groups of size 'cpuGroupSize',
			// modulo some remainder.
			distribution := (numCPUs / len(combo) / cpuGroupSize) * cpuGroupSize
			for _, numa := range combo {
				cpus := acc.details.CPUsInNUMANodes(numa)
				if cpus.Size() < distribution {
					return Continue
				}
			}

			// Calculate how many CPUs will be available on each NUMA node in
			// the system after allocating an even distribution of CPU groups
			// of size 'cpuGroupSize' from each NUMA node in 'combo'. This will
			// be used in the "balance score" calculation to help decide if
			// this combo should ultimately be chosen.
			availableAfterAllocation := make(mapIntInt, len(numas))
			for _, numa := range numas {
				availableAfterAllocation[numa] = acc.details.CPUsInNUMANodes(numa).Size()
			}
			for _, numa := range combo {
				availableAfterAllocation[numa] -= distribution
			}

			// Check if there are any remaining CPUs to distribute across the
			// NUMA nodes once CPUs have been evenly distributed in groups of
			// size 'cpuGroupSize'.
			remainder := numCPUs - (distribution * len(combo))

			// Get a list of NUMA nodes to consider pulling the remainder CPUs
			// from. This list excludes NUMA nodes that don't have at least
			// 'cpuGroupSize' CPUs available after being allocated
			// 'distribution' number of CPUs.
			var remainderCombo []int
			for _, numa := range combo {
				if availableAfterAllocation[numa] >= cpuGroupSize {
					remainderCombo = append(remainderCombo, numa)
				}
			}

			// Declare a set of local variables to help track the "balance
			// scores" calculated when using different subsets of
			// 'remainderCombo' to allocate remainder CPUs from.
			var bestLocalBalance float64 = math.MaxFloat64
			var bestLocalRemainder []int = nil

			// If there aren't any remainder CPUs to allocate, then calculate
			// the "balance score" of this combo as the standard deviation of
			// the values contained in 'availableAfterAllocation'.
			if remainder == 0 {
				bestLocalBalance = standardDeviation(availableAfterAllocation.Values())
				bestLocalRemainder = nil
			}

			// Otherwise, find the best "balance score" when allocating the
			// remainder CPUs across different subsets of NUMA nodes in 'remainderCombo'.
			// These remainder CPUs are handed out in groups of size 'cpuGroupSize'.
			// We start from k=len(remainderCombo) and walk down to k=1 so that
			// we continue to distribute CPUs as much as possible across
			// multiple NUMA nodes.
			for k := len(remainderCombo); remainder > 0 && k >= 1; k-- {
				acc.iterateCombinations(remainderCombo, k, func(subset []int) LoopControl {
					// Make a local copy of 'remainder'.
					remainder := remainder

					// Make a local copy of 'availableAfterAllocation'.
					availableAfterAllocation := availableAfterAllocation.Clone()

					// If this subset is not capable of allocating all
					// remainder CPUs, continue to the next one.
					if sum(availableAfterAllocation.Values(subset...)) < remainder {
						return Continue
					}

					// For all NUMA nodes in 'subset', walk through them,
					// removing 'cpuGroupSize' number of CPUs from each
					// until all remainder CPUs have been accounted for.
					for remainder > 0 {
						for _, numa := range subset {
							if remainder == 0 {
								break
							}
							if availableAfterAllocation[numa] < cpuGroupSize {
								continue
							}
							availableAfterAllocation[numa] -= cpuGroupSize
							remainder -= cpuGroupSize
						}
					}

					// Calculate the "balance score" as the standard deviation
					// of the number of CPUs available on all NUMA nodes in the
					// system after the remainder CPUs have been allocated
					// across 'subset' in groups of size 'cpuGroupSize'.
					balance := standardDeviation(availableAfterAllocation.Values())
					if balance < bestLocalBalance {
						bestLocalBalance = balance
						bestLocalRemainder = subset
					}

					return Continue
				})
			}

			// If the best "balance score" for this combo is less than the
			// lowest "balance score" of all previous combos, then update this
			// combo (and remainder set) to be the best one found so far.
			if bestLocalBalance < bestBalance {
				bestBalance = bestLocalBalance
				bestRemainder = bestLocalRemainder
				bestCombo = combo
			}

			return Continue
		})

		// If we made it through all of the iterations above without finding a
		// combination of NUMA nodes that can properly balance CPU allocations,
		// then move on to the next larger set of NUMA node combinations.
		if bestCombo == nil {
			continue
		}

		// Otherwise, start allocating CPUs from the NUMA node combination
		// chosen. First allocate an even distribution of CPUs in groups of
		// size 'cpuGroupSize' from 'bestCombo'.
		distribution := (numCPUs / len(bestCombo) / cpuGroupSize) * cpuGroupSize
		for _, numa := range bestCombo {
			cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), distribution)
			acc.take(cpus)
		}

		// Then allocate any remaining CPUs in groups of size 'cpuGroupSize'
		// from each NUMA node in the remainder set.
		remainder := numCPUs - (distribution * len(bestCombo))
		for remainder > 0 {
			for _, numa := range bestRemainder {
				if remainder == 0 {
					break
				}
				if acc.details.CPUsInNUMANodes(numa).Size() < cpuGroupSize {
					continue
				}
				cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), cpuGroupSize)
				acc.take(cpus)
				remainder -= cpuGroupSize
			}
		}

		// If we haven't allocated all of our CPUs at this point, then something
		// went wrong in our accounting and we should error out.
		if acc.numCPUsNeeded > 0 {
			return cpuset.New(), fmt.Errorf("accounting error, not enough CPUs allocated, remaining: %v", acc.numCPUsNeeded)
		}

		// Likewise, if we have allocated too many CPUs at this point, then something
		// went wrong in our accounting and we should error out.
		if acc.numCPUsNeeded < 0 {
			return cpuset.New(), fmt.Errorf("accounting error, too many CPUs allocated, remaining: %v", acc.numCPUsNeeded)
		}

		// Otherwise, return the result.
		return acc.result, nil
	}

	// If we never found a combination of NUMA nodes that we could properly
	// distribute CPUs across, fall back to the packing algorithm.
	return takeByTopologyNUMAPacked(topo, availableCPUs, numCPUs)
}
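// Illustrative sketch (hypothetical topology and helper, for exposition only):
// a node with 2 sockets, one NUMA node per socket, and 3 hyperthreaded cores
// per NUMA node (12 logical CPUs). Requesting 8 CPUs with cpuGroupSize=2 from
// takeByTopologyNUMADistributed is expected to take 4 CPUs from each NUMA
// node, whereas takeByTopologyNUMAPacked would take 6 from the first NUMA
// node and only 2 from the second. The topology literal is an assumption made
// for illustration and fills in just the fields used by this file.
func exampleDistributedAllocation() (cpuset.CPUSet, error) {
	topo := &topology.CPUTopology{
		NumCPUs:      12,
		NumCores:     6,
		NumSockets:   2,
		NumNUMANodes: 2,
		CPUDetails: topology.CPUDetails{
			// Socket 0 / NUMA node 0: cores 0-2, CPUs 0-5.
			0: {CoreID: 0, SocketID: 0, NUMANodeID: 0},
			1: {CoreID: 0, SocketID: 0, NUMANodeID: 0},
			2: {CoreID: 1, SocketID: 0, NUMANodeID: 0},
			3: {CoreID: 1, SocketID: 0, NUMANodeID: 0},
			4: {CoreID: 2, SocketID: 0, NUMANodeID: 0},
			5: {CoreID: 2, SocketID: 0, NUMANodeID: 0},
			// Socket 1 / NUMA node 1: cores 3-5, CPUs 6-11.
			6:  {CoreID: 3, SocketID: 1, NUMANodeID: 1},
			7:  {CoreID: 3, SocketID: 1, NUMANodeID: 1},
			8:  {CoreID: 4, SocketID: 1, NUMANodeID: 1},
			9:  {CoreID: 4, SocketID: 1, NUMANodeID: 1},
			10: {CoreID: 5, SocketID: 1, NUMANodeID: 1},
			11: {CoreID: 5, SocketID: 1, NUMANodeID: 1},
		},
	}
	allCPUs := cpuset.New(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
	return takeByTopologyNUMADistributed(topo, allCPUs, 8, 2)
}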