1
2
3
4
19
20 package kuberuntime
21
22 import (
23 "errors"
24 "fmt"
25 "math"
26 "os"
27 "path/filepath"
28 "strconv"
29 "sync"
30 "time"
31
32 "github.com/containerd/cgroups"
33 cadvisorv1 "github.com/google/cadvisor/info/v1"
34 libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
35
36 v1 "k8s.io/api/core/v1"
37 "k8s.io/apimachinery/pkg/api/resource"
38 utilfeature "k8s.io/apiserver/pkg/util/feature"
39 runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
40 "k8s.io/klog/v2"
41 v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
42 kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
43 kubefeatures "k8s.io/kubernetes/pkg/features"
44 "k8s.io/kubernetes/pkg/kubelet/cm"
45 kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
46 "k8s.io/kubernetes/pkg/kubelet/qos"
47 kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
48 )
49
// defaultPageSize is the page size reported by os.Getpagesize(), stored as an
// int64. In this file it is used to round computed memory.high values down to
// a multiple of the page size.
var defaultPageSize = int64(os.Getpagesize())
51
52
53 func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error {
54 enforceMemoryQoS := false
55
56 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
57 isCgroup2UnifiedMode() {
58 enforceMemoryQoS = true
59 }
60 cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
61 if err != nil {
62 return err
63 }
64 config.Linux = cl
65
66 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.UserNamespacesSupport) {
67 if cl.SecurityContext.NamespaceOptions.UsernsOptions != nil {
68 for _, mount := range config.Mounts {
69 mount.UidMappings = cl.SecurityContext.NamespaceOptions.UsernsOptions.Uids
70 mount.GidMappings = cl.SecurityContext.NamespaceOptions.UsernsOptions.Gids
71 }
72 }
73 }
74 return nil
75 }
76
77
78 func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) (*runtimeapi.LinuxContainerConfig, error) {
79 sc, err := m.determineEffectiveSecurityContext(pod, container, uid, username)
80 if err != nil {
81 return nil, err
82 }
83 lc := &runtimeapi.LinuxContainerConfig{
84 Resources: m.generateLinuxContainerResources(pod, container, enforceMemoryQoS),
85 SecurityContext: sc,
86 }
87
88 if nsTarget != nil && lc.SecurityContext.NamespaceOptions.Pid == runtimeapi.NamespaceMode_CONTAINER {
89 lc.SecurityContext.NamespaceOptions.Pid = runtimeapi.NamespaceMode_TARGET
90 lc.SecurityContext.NamespaceOptions.TargetId = nsTarget.ID
91 }
92
93 return lc, nil
94 }
95
96
97 func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, container *v1.Container, enforceMemoryQoS bool) *runtimeapi.LinuxContainerResources {
98
99 var cpuRequest *resource.Quantity
100 if _, cpuRequestExists := container.Resources.Requests[v1.ResourceCPU]; cpuRequestExists {
101 cpuRequest = container.Resources.Requests.Cpu()
102 }
103 lcr := m.calculateLinuxResources(cpuRequest, container.Resources.Limits.Cpu(), container.Resources.Limits.Memory())
104
105 lcr.OomScoreAdj = int64(qos.GetContainerOOMScoreAdjust(pod, container,
106 int64(m.machineInfo.MemoryCapacity)))
107
108 lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)
109
110
111 m.configureContainerSwapResources(lcr, pod, container)
112
113
114 if enforceMemoryQoS {
115 unified := map[string]string{}
116 memoryRequest := container.Resources.Requests.Memory().Value()
117 memoryLimit := container.Resources.Limits.Memory().Value()
118 if memoryRequest != 0 {
119 unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
120 }
121
122
123
124 if memoryRequest != memoryLimit {
125
126
127
128
129
130 memoryHigh := int64(0)
131 if memoryLimit != 0 {
132 memoryHigh = int64(math.Floor(
133 float64(memoryRequest)+
134 (float64(memoryLimit)-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
135 } else {
136 allocatable := m.getNodeAllocatable()
137 allocatableMemory, ok := allocatable[v1.ResourceMemory]
138 if ok && allocatableMemory.Value() > 0 {
139 memoryHigh = int64(math.Floor(
140 float64(memoryRequest)+
141 (float64(allocatableMemory.Value())-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
142 }
143 }
144 if memoryHigh != 0 && memoryHigh > memoryRequest {
145 unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
146 }
147 }
148 if len(unified) > 0 {
149 if lcr.Unified == nil {
150 lcr.Unified = unified
151 } else {
152 for k, v := range unified {
153 lcr.Unified[k] = v
154 }
155 }
156 klog.V(4).InfoS("MemoryQoS config for container", "pod", klog.KObj(pod), "containerName", container.Name, "unified", unified)
157 }
158 }
159
160 return lcr
161 }
162
163
164
165 func (m *kubeGenericRuntimeManager) configureContainerSwapResources(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
166 if !swapControllerAvailable() {
167 klog.InfoS("No swap cgroup controller present", "swapBehavior", m.memorySwapBehavior, "pod", klog.KObj(pod), "containerName", container.Name)
168 return
169 }
170 swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo)
171 if m.memorySwapBehavior == kubelettypes.LimitedSwap {
172 if !isCgroup2UnifiedMode() {
173 swapConfigurationHelper.ConfigureNoSwap(lcr)
174 return
175 }
176 }
177
178 if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
179 swapConfigurationHelper.ConfigureNoSwap(lcr)
180 return
181 }
182
183
184
185 switch m.memorySwapBehavior {
186 case kubelettypes.NoSwap:
187 swapConfigurationHelper.ConfigureNoSwap(lcr)
188 case kubelettypes.LimitedSwap:
189 swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
190 default:
191 swapConfigurationHelper.ConfigureNoSwap(lcr)
192 }
193 }
194
195
196 func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, container *v1.Container) *runtimeapi.ContainerResources {
197 enforceMemoryQoS := false
198
199 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
200 isCgroup2UnifiedMode() {
201 enforceMemoryQoS = true
202 }
203 return &runtimeapi.ContainerResources{
204 Linux: m.generateLinuxContainerResources(pod, container, enforceMemoryQoS),
205 }
206 }
207
208
209 func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit, memoryLimit *resource.Quantity) *runtimeapi.LinuxContainerResources {
210 resources := runtimeapi.LinuxContainerResources{}
211 var cpuShares int64
212
213 memLimit := memoryLimit.Value()
214
215
216
217
218 if cpuRequest == nil && cpuLimit != nil {
219 cpuShares = int64(cm.MilliCPUToShares(cpuLimit.MilliValue()))
220 } else {
221
222
223 cpuShares = int64(cm.MilliCPUToShares(cpuRequest.MilliValue()))
224 }
225 resources.CpuShares = cpuShares
226 if memLimit != 0 {
227 resources.MemoryLimitInBytes = memLimit
228 }
229
230 if m.cpuCFSQuota {
231
232
233 cpuPeriod := int64(quotaPeriod)
234 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
235
236
237 cpuPeriod = int64(m.cpuCFSQuotaPeriod.Duration / time.Microsecond)
238 }
239 cpuQuota := milliCPUToQuota(cpuLimit.MilliValue(), cpuPeriod)
240 resources.CpuQuota = cpuQuota
241 resources.CpuPeriod = cpuPeriod
242 }
243
244
245 if isCgroup2UnifiedMode() {
246 resources.Unified = map[string]string{
247
248
249
250 "memory.oom.group": "1",
251 }
252 }
253 return &resources
254 }
255
256
257 func GetHugepageLimitsFromResources(resources v1.ResourceRequirements) []*runtimeapi.HugepageLimit {
258 var hugepageLimits []*runtimeapi.HugepageLimit
259
260
261 for _, pageSize := range libcontainercgroups.HugePageSizes() {
262 hugepageLimits = append(hugepageLimits, &runtimeapi.HugepageLimit{
263 PageSize: pageSize,
264 Limit: uint64(0),
265 })
266 }
267
268 requiredHugepageLimits := map[string]uint64{}
269 for resourceObj, amountObj := range resources.Limits {
270 if !v1helper.IsHugePageResourceName(resourceObj) {
271 continue
272 }
273
274 pageSize, err := v1helper.HugePageSizeFromResourceName(resourceObj)
275 if err != nil {
276 klog.InfoS("Failed to get hugepage size from resource", "object", resourceObj, "err", err)
277 continue
278 }
279
280 sizeString, err := v1helper.HugePageUnitSizeFromByteSize(pageSize.Value())
281 if err != nil {
282 klog.InfoS("Size is invalid", "object", resourceObj, "err", err)
283 continue
284 }
285 requiredHugepageLimits[sizeString] = uint64(amountObj.Value())
286 }
287
288 for _, hugepageLimit := range hugepageLimits {
289 if limit, exists := requiredHugepageLimits[hugepageLimit.PageSize]; exists {
290 hugepageLimit.Limit = limit
291 }
292 }
293
294 return hugepageLimits
295 }
296
297 func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *kubecontainer.ContainerResources {
298 var cStatusResources *kubecontainer.ContainerResources
299 runtimeStatusResources := statusResources.GetLinux()
300 if runtimeStatusResources != nil {
301 var cpuLimit, memLimit, cpuRequest *resource.Quantity
302 if runtimeStatusResources.CpuPeriod > 0 {
303 milliCPU := quotaToMilliCPU(runtimeStatusResources.CpuQuota, runtimeStatusResources.CpuPeriod)
304 if milliCPU > 0 {
305 cpuLimit = resource.NewMilliQuantity(milliCPU, resource.DecimalSI)
306 }
307 }
308 if runtimeStatusResources.CpuShares > 0 {
309 milliCPU := sharesToMilliCPU(runtimeStatusResources.CpuShares)
310 if milliCPU > 0 {
311 cpuRequest = resource.NewMilliQuantity(milliCPU, resource.DecimalSI)
312 }
313 }
314 if runtimeStatusResources.MemoryLimitInBytes > 0 {
315 memLimit = resource.NewQuantity(runtimeStatusResources.MemoryLimitInBytes, resource.BinarySI)
316 }
317 if cpuLimit != nil || memLimit != nil || cpuRequest != nil {
318 cStatusResources = &kubecontainer.ContainerResources{
319 CPULimit: cpuLimit,
320 CPURequest: cpuRequest,
321 MemoryLimit: memLimit,
322 }
323 }
324 }
325 return cStatusResources
326 }
327
328
329
330
331 var isCgroup2UnifiedMode = func() bool {
332 return libcontainercgroups.IsCgroup2UnifiedMode()
333 }
334
335 var (
336 swapControllerAvailability bool
337 swapControllerAvailabilityOnce sync.Once
338 )
339
340
341
342
343 var swapControllerAvailable = func() bool {
344
345 swapControllerAvailabilityOnce.Do(func() {
346 const warn = "Failed to detect the availability of the swap controller, assuming not available"
347 p := "/sys/fs/cgroup/memory/memory.memsw.limit_in_bytes"
348 if isCgroup2UnifiedMode() {
349
350 _, unified, err := cgroups.ParseCgroupFileUnified("/proc/self/cgroup")
351 if err != nil {
352 klog.V(5).ErrorS(fmt.Errorf("failed to parse /proc/self/cgroup: %w", err), warn)
353 return
354 }
355 p = filepath.Join("/sys/fs/cgroup", unified, "memory.swap.max")
356 }
357 if _, err := os.Stat(p); err != nil {
358 if !errors.Is(err, os.ErrNotExist) {
359 klog.V(5).ErrorS(err, warn)
360 }
361 return
362 }
363 swapControllerAvailability = true
364 })
365 return swapControllerAvailability
366 }
367
368 type swapConfigurationHelper struct {
369 machineInfo cadvisorv1.MachineInfo
370 }
371
372 func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper {
373 return &swapConfigurationHelper{machineInfo: machineInfo}
374 }
375
376 func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
377 podQos := kubeapiqos.GetPodQOS(pod)
378 containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero()
379 memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0
380
381 if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit {
382 m.ConfigureNoSwap(lcr)
383 return
384 }
385
386 containerMemoryRequest := container.Resources.Requests.Memory()
387 swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity))
388
389 if err != nil {
390 klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap")
391 m.ConfigureNoSwap(lcr)
392 return
393 }
394
395 m.configureSwap(lcr, swapLimit)
396 }
397
398 func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) {
399 if !isCgroup2UnifiedMode() {
400 if swapControllerAvailable() {
401
402
403
404 lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
405 }
406 return
407 }
408
409 m.configureSwap(lcr, 0)
410 }
411
412 func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) {
413 if !isCgroup2UnifiedMode() {
414 klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected")
415 return
416 }
417
418 if lcr.Unified == nil {
419 lcr.Unified = map[string]string{}
420 }
421
422 lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory)
423 }
424
425
426
// calcSwapForBurstablePods returns the share of totalPodsSwapAvailable that
// corresponds to the fraction containerMemoryRequest / nodeTotalMemory,
// truncated to an integer.
//
// It returns an error when nodeTotalMemory is not positive or when
// containerMemoryRequest exceeds nodeTotalMemory.
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
	switch {
	case nodeTotalMemory <= 0:
		return 0, fmt.Errorf("total node memory is 0")
	case containerMemoryRequest > nodeTotalMemory:
		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
	}

	memoryShare := float64(containerMemoryRequest) / float64(nodeTotalMemory)
	return int64(memoryShare * float64(totalPodsSwapAvailable)), nil
}
440
View as plain text