1
16
17 package metrics
18
19 import (
20 "sync"
21
22 "k8s.io/component-base/metrics"
23 "k8s.io/component-base/metrics/legacyregistry"
24 )
25
26
// JobControllerSubsystem is the subsystem name passed as the Subsystem of
// every metric declared in this file.
const JobControllerSubsystem = "job_controller"
28
29 var (
30
31
32
33
34
35 JobSyncDurationSeconds = metrics.NewHistogramVec(
36 &metrics.HistogramOpts{
37 Subsystem: JobControllerSubsystem,
38 Name: "job_sync_duration_seconds",
39 Help: "The time it took to sync a job",
40 StabilityLevel: metrics.STABLE,
41 Buckets: metrics.ExponentialBuckets(0.004, 2, 15),
42 },
43 []string{"completion_mode", "result", "action"},
44 )
45
46
47
48
49 JobSyncNum = metrics.NewCounterVec(
50 &metrics.CounterOpts{
51 Subsystem: JobControllerSubsystem,
52 Name: "job_syncs_total",
53 Help: "The number of job syncs",
54 StabilityLevel: metrics.STABLE,
55 },
56 []string{"completion_mode", "result", "action"},
57 )
58
59
60
61
62
63
64 JobFinishedNum = metrics.NewCounterVec(
65 &metrics.CounterOpts{
66 Subsystem: JobControllerSubsystem,
67 Name: "jobs_finished_total",
68 Help: "The number of finished jobs",
69 StabilityLevel: metrics.STABLE,
70 },
71 []string{"completion_mode", "result", "reason"},
72 )
73
74
75
76
77
78 JobByExternalControllerTotal = metrics.NewCounterVec(
79 &metrics.CounterOpts{
80 Subsystem: JobControllerSubsystem,
81 Name: "jobs_by_external_controller_total",
82 Help: "The number of Jobs managed by an external controller",
83 StabilityLevel: metrics.ALPHA,
84 },
85 []string{"controller_name"},
86 )
87
88
89
90
91
92
93
94
95 JobPodsFinished = metrics.NewCounterVec(
96 &metrics.CounterOpts{
97 Subsystem: JobControllerSubsystem,
98 Name: "job_pods_finished_total",
99 Help: "The number of finished Pods that are fully tracked",
100 StabilityLevel: metrics.STABLE,
101 },
102 []string{"completion_mode", "result"})
103
104
105
106
107
108 PodFailuresHandledByFailurePolicy = metrics.NewCounterVec(
109 &metrics.CounterOpts{
110 Subsystem: JobControllerSubsystem,
111 Name: "pod_failures_handled_by_failure_policy_total",
112 Help: `The number of failed Pods handled by failure policy with
113 respect to the failure policy action applied based on the matched
114 rule. Possible values of the action label correspond to the
115 possible values for the failure policy rule action, which are:
116 "FailJob", "Ignore" and "Count".`,
117 },
118 []string{"action"})
119
120
121
122
123 TerminatedPodsTrackingFinalizerTotal = metrics.NewCounterVec(
124 &metrics.CounterOpts{
125 Subsystem: JobControllerSubsystem,
126 Name: "terminated_pods_tracking_finalizer_total",
127 Help: `The number of terminated pods (phase=Failed|Succeeded)
128 that have the finalizer batch.kubernetes.io/job-tracking
129 The event label can be "add" or "delete".`,
130 }, []string{"event"})
131
132
133 JobFinishedIndexesTotal = metrics.NewCounterVec(
134 &metrics.CounterOpts{
135 Subsystem: JobControllerSubsystem,
136 Name: "job_finished_indexes_total",
137 Help: `The number of finished indexes. Possible values for the
138 status label are: "succeeded", "failed". Possible values for the
139 backoffLimit label are: "perIndex" and "global"`,
140 },
141 []string{"status", "backoffLimit"})
142
143
144
145
146
147
148
149 JobPodsCreationTotal = metrics.NewCounterVec(
150 &metrics.CounterOpts{
151 Subsystem: JobControllerSubsystem,
152 Name: "job_pods_creation_total",
153 Help: `The number of Pods created by the Job controller labelled with a reason for the Pod creation.
154 This metric also distinguishes between Pods created using different PodReplacementPolicy settings.
155 Possible values of the "reason" label are:
156 "new", "recreate_terminating_or_failed", "recreate_failed".
157 Possible values of the "status" label are:
158 "succeeded", "failed".`,
159 }, []string{"reason", "status"})
160 )
161
// Label values used with the Job controller metrics.
const (
	// JobSyncActionReconciling is the "reconciling" sync action.
	// NOTE(review): the JobSyncAction* constants are presumably values for
	// the "action" label of JobSyncDurationSeconds and JobSyncNum; their
	// call sites are outside this file, so confirm against the callers.
	JobSyncActionReconciling = "reconciling"

	// JobSyncActionTracking is the "tracking" sync action.
	JobSyncActionTracking = "tracking"

	// JobSyncActionPodsCreated is the "pods_created" sync action.
	JobSyncActionPodsCreated = "pods_created"

	// JobSyncActionPodsDeleted is the "pods_deleted" sync action.
	JobSyncActionPodsDeleted = "pods_deleted"

	// Succeeded and Failed match the "status" label values listed in the
	// Help text of JobFinishedIndexesTotal and JobPodsCreationTotal.
	Succeeded = "succeeded"
	Failed    = "failed"

	// Add and Delete match the "event" label values listed in the Help text
	// of TerminatedPodsTrackingFinalizerTotal.
	Add    = "add"
	Delete = "delete"

	// PodCreateNew, PodRecreateTerminatingOrFailed and PodRecreateFailed
	// match the "reason" label values listed in the Help text of
	// JobPodsCreationTotal.
	PodCreateNew                   = "new"
	PodRecreateTerminatingOrFailed = "recreate_terminating_or_failed"
	PodRecreateFailed              = "recreate_failed"
)
198
// registerMetrics guards the body of Register so that it runs only once.
var registerMetrics sync.Once
200
201
202 func Register() {
203 registerMetrics.Do(func() {
204 legacyregistry.MustRegister(JobSyncDurationSeconds)
205 legacyregistry.MustRegister(JobSyncNum)
206 legacyregistry.MustRegister(JobFinishedNum)
207 legacyregistry.MustRegister(JobPodsFinished)
208 legacyregistry.MustRegister(PodFailuresHandledByFailurePolicy)
209 legacyregistry.MustRegister(TerminatedPodsTrackingFinalizerTotal)
210 legacyregistry.MustRegister(JobFinishedIndexesTotal)
211 legacyregistry.MustRegister(JobPodsCreationTotal)
212 legacyregistry.MustRegister(JobByExternalControllerTotal)
213 })
214 }
215
View as plain text