1
16
17 package cm
18
19 import (
20 "fmt"
21 "strconv"
22 "strings"
23 "sync"
24 "time"
25
26 v1 "k8s.io/api/core/v1"
27 "k8s.io/klog/v2"
28
29 "k8s.io/apimachinery/pkg/util/wait"
30
31 units "github.com/docker/go-units"
32 libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
33 utilfeature "k8s.io/apiserver/pkg/util/feature"
34
35 "k8s.io/kubernetes/pkg/api/v1/resource"
36 v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
37 kubefeatures "k8s.io/kubernetes/pkg/features"
38 )
39
// periodicQOSCgroupUpdateInterval is the period of the background loop started
// by (*qosContainerManagerImpl).Start, which re-runs UpdateCgroups.
const periodicQOSCgroupUpdateInterval = time.Minute
45
46 type QOSContainerManager interface {
47 Start(func() v1.ResourceList, ActivePodsFunc) error
48 GetQOSContainersInfo() QOSContainersInfo
49 UpdateCgroups() error
50 }
51
52 type qosContainerManagerImpl struct {
53 sync.Mutex
54 qosContainersInfo QOSContainersInfo
55 subsystems *CgroupSubsystems
56 cgroupManager CgroupManager
57 activePods ActivePodsFunc
58 getNodeAllocatable func() v1.ResourceList
59 cgroupRoot CgroupName
60 qosReserved map[v1.ResourceName]int64
61 }
62
63 func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
64 if !nodeConfig.CgroupsPerQOS {
65 return &qosContainerManagerNoop{
66 cgroupRoot: cgroupRoot,
67 }, nil
68 }
69
70 return &qosContainerManagerImpl{
71 subsystems: subsystems,
72 cgroupManager: cgroupManager,
73 cgroupRoot: cgroupRoot,
74 qosReserved: nodeConfig.QOSReserved,
75 }, nil
76 }
77
78 func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
79 return m.qosContainersInfo
80 }
81
82 func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
83 cm := m.cgroupManager
84 rootContainer := m.cgroupRoot
85 if !cm.Exists(rootContainer) {
86 return fmt.Errorf("root container %v doesn't exist", rootContainer)
87 }
88
89
90
91 qosClasses := map[v1.PodQOSClass]CgroupName{
92 v1.PodQOSBurstable: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
93 v1.PodQOSBestEffort: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
94 }
95
96
97 for qosClass, containerName := range qosClasses {
98 resourceParameters := &ResourceConfig{}
99
100 if qosClass == v1.PodQOSBestEffort {
101 minShares := uint64(MinShares)
102 resourceParameters.CPUShares = &minShares
103 }
104
105
106 containerConfig := &CgroupConfig{
107 Name: containerName,
108 ResourceParameters: resourceParameters,
109 }
110
111
112 m.setHugePagesUnbounded(containerConfig)
113
114
115 if !cm.Exists(containerName) {
116 if err := cm.Create(containerConfig); err != nil {
117 return fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
118 }
119 } else {
120
121 if err := cm.Update(containerConfig); err != nil {
122 return fmt.Errorf("failed to update top level %v QOS cgroup : %v", qosClass, err)
123 }
124 }
125 }
126
127 m.qosContainersInfo = QOSContainersInfo{
128 Guaranteed: rootContainer,
129 Burstable: qosClasses[v1.PodQOSBurstable],
130 BestEffort: qosClasses[v1.PodQOSBestEffort],
131 }
132 m.getNodeAllocatable = getNodeAllocatable
133 m.activePods = activePods
134
135
136
137 go wait.Until(func() {
138 err := m.UpdateCgroups()
139 if err != nil {
140 klog.InfoS("Failed to reserve QoS requests", "err", err)
141 }
142 }, periodicQOSCgroupUpdateInterval, wait.NeverStop)
143
144 return nil
145 }
146
147
148 func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
149 hugePageLimit := map[int64]int64{}
150 for _, pageSize := range libcontainercgroups.HugePageSizes() {
151 pageSizeBytes, err := units.RAMInBytes(pageSize)
152 if err != nil {
153 return err
154 }
155 hugePageLimit[pageSizeBytes] = int64(1 << 62)
156 }
157 cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
158 return nil
159 }
160
161 func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
162 for _, v := range configs {
163 if err := m.setHugePagesUnbounded(v); err != nil {
164 return err
165 }
166 }
167 return nil
168 }
169
170 func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
171 pods := m.activePods()
172 burstablePodCPURequest := int64(0)
173 reuseReqs := make(v1.ResourceList, 4)
174 for i := range pods {
175 pod := pods[i]
176 qosClass := v1qos.GetPodQOS(pod)
177 if qosClass != v1.PodQOSBurstable {
178
179 continue
180 }
181 req := resource.PodRequests(pod, resource.PodResourcesOptions{Reuse: reuseReqs})
182 if request, found := req[v1.ResourceCPU]; found {
183 burstablePodCPURequest += request.MilliValue()
184 }
185 }
186
187
188 bestEffortCPUShares := uint64(MinShares)
189 configs[v1.PodQOSBestEffort].ResourceParameters.CPUShares = &bestEffortCPUShares
190
191
192 burstableCPUShares := MilliCPUToShares(burstablePodCPURequest)
193 configs[v1.PodQOSBurstable].ResourceParameters.CPUShares = &burstableCPUShares
194 return nil
195 }
196
197
198
199 func (m *qosContainerManagerImpl) getQoSMemoryRequests() map[v1.PodQOSClass]int64 {
200 qosMemoryRequests := map[v1.PodQOSClass]int64{
201 v1.PodQOSGuaranteed: 0,
202 v1.PodQOSBurstable: 0,
203 }
204
205
206 pods := m.activePods()
207 reuseReqs := make(v1.ResourceList, 4)
208 for _, pod := range pods {
209 podMemoryRequest := int64(0)
210 qosClass := v1qos.GetPodQOS(pod)
211 if qosClass == v1.PodQOSBestEffort {
212
213 continue
214 }
215 req := resource.PodRequests(pod, resource.PodResourcesOptions{Reuse: reuseReqs})
216 if request, found := req[v1.ResourceMemory]; found {
217 podMemoryRequest += request.Value()
218 }
219 qosMemoryRequests[qosClass] += podMemoryRequest
220 }
221
222 return qosMemoryRequests
223 }
224
225
226
227
228 func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
229 qosMemoryRequests := m.getQoSMemoryRequests()
230
231 resources := m.getNodeAllocatable()
232 allocatableResource, ok := resources[v1.ResourceMemory]
233 if !ok {
234 klog.V(2).InfoS("Allocatable memory value could not be determined, not setting QoS memory limits")
235 return
236 }
237 allocatable := allocatableResource.Value()
238 if allocatable == 0 {
239 klog.V(2).InfoS("Allocatable memory reported as 0, might be in standalone mode, not setting QoS memory limits")
240 return
241 }
242
243 for qos, limits := range qosMemoryRequests {
244 klog.V(2).InfoS("QoS pod memory limit", "qos", qos, "limits", limits, "percentReserve", percentReserve)
245 }
246
247
248 burstableLimit := allocatable - (qosMemoryRequests[v1.PodQOSGuaranteed] * percentReserve / 100)
249 bestEffortLimit := burstableLimit - (qosMemoryRequests[v1.PodQOSBurstable] * percentReserve / 100)
250 configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit
251 configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit
252 }
253
254
255
256
257 func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
258
259
260
261 for qos, config := range configs {
262 usage, err := m.cgroupManager.MemoryUsage(config.Name)
263 if err != nil {
264 klog.V(2).InfoS("Failed to get resource stats", "err", err)
265 return
266 }
267
268
269
270
271
272
273
274 if configs[qos].ResourceParameters.Memory != nil && usage > *configs[qos].ResourceParameters.Memory {
275 configs[qos].ResourceParameters.Memory = &usage
276 }
277 }
278 }
279
280
281
282 func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*CgroupConfig) {
283 qosMemoryRequests := m.getQoSMemoryRequests()
284
285
286
287
288 burstableMin := qosMemoryRequests[v1.PodQOSBurstable]
289 guaranteedMin := qosMemoryRequests[v1.PodQOSGuaranteed] + burstableMin
290
291 if burstableMin > 0 {
292 if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
293 configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
294 }
295 configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
296 klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
297 }
298
299 if guaranteedMin > 0 {
300 if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
301 configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
302 }
303 configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
304 klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
305 }
306 }
307
308 func (m *qosContainerManagerImpl) UpdateCgroups() error {
309 m.Lock()
310 defer m.Unlock()
311
312 qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
313 v1.PodQOSGuaranteed: {
314 Name: m.qosContainersInfo.Guaranteed,
315 ResourceParameters: &ResourceConfig{},
316 },
317 v1.PodQOSBurstable: {
318 Name: m.qosContainersInfo.Burstable,
319 ResourceParameters: &ResourceConfig{},
320 },
321 v1.PodQOSBestEffort: {
322 Name: m.qosContainersInfo.BestEffort,
323 ResourceParameters: &ResourceConfig{},
324 },
325 }
326
327
328 if err := m.setCPUCgroupConfig(qosConfigs); err != nil {
329 return err
330 }
331
332
333 if err := m.setHugePagesConfig(qosConfigs); err != nil {
334 return err
335 }
336
337
338 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
339 libcontainercgroups.IsCgroup2UnifiedMode() {
340 m.setMemoryQoS(qosConfigs)
341 }
342
343 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
344 for resource, percentReserve := range m.qosReserved {
345 switch resource {
346 case v1.ResourceMemory:
347 m.setMemoryReserve(qosConfigs, percentReserve)
348 }
349 }
350
351 updateSuccess := true
352 for _, config := range qosConfigs {
353 err := m.cgroupManager.Update(config)
354 if err != nil {
355 updateSuccess = false
356 }
357 }
358 if updateSuccess {
359 klog.V(4).InfoS("Updated QoS cgroup configuration")
360 return nil
361 }
362
363
364
365
366 for resource, percentReserve := range m.qosReserved {
367 switch resource {
368 case v1.ResourceMemory:
369 m.retrySetMemoryReserve(qosConfigs, percentReserve)
370 }
371 }
372 }
373
374 for _, config := range qosConfigs {
375 err := m.cgroupManager.Update(config)
376 if err != nil {
377 klog.ErrorS(err, "Failed to update QoS cgroup configuration")
378 return err
379 }
380 }
381
382 klog.V(4).InfoS("Updated QoS cgroup configuration")
383 return nil
384 }
385
386 type qosContainerManagerNoop struct {
387 cgroupRoot CgroupName
388 }
389
390 var _ QOSContainerManager = &qosContainerManagerNoop{}
391
392 func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
393 return QOSContainersInfo{}
394 }
395
396 func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error {
397 return nil
398 }
399
400 func (m *qosContainerManagerNoop) UpdateCgroups() error {
401 return nil
402 }
403
View as plain text