1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package etcdserver
16
17 import (
18 goruntime "runtime"
19 "time"
20
21 "go.etcd.io/etcd/api/v3/version"
22 "go.etcd.io/etcd/pkg/v3/runtime"
23
24 "github.com/prometheus/client_golang/prometheus"
25 "go.uber.org/zap"
26 )
27
28 var (
29 hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{
30 Namespace: "etcd",
31 Subsystem: "server",
32 Name: "has_leader",
33 Help: "Whether or not a leader exists. 1 is existence, 0 is not.",
34 })
35 isLeader = prometheus.NewGauge(prometheus.GaugeOpts{
36 Namespace: "etcd",
37 Subsystem: "server",
38 Name: "is_leader",
39 Help: "Whether or not this member is a leader. 1 if is, 0 otherwise.",
40 })
41 leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
42 Namespace: "etcd",
43 Subsystem: "server",
44 Name: "leader_changes_seen_total",
45 Help: "The number of leader changes seen.",
46 })
47 isLearner = prometheus.NewGauge(prometheus.GaugeOpts{
48 Namespace: "etcd",
49 Subsystem: "server",
50 Name: "is_learner",
51 Help: "Whether or not this member is a learner. 1 if is, 0 otherwise.",
52 })
53 learnerPromoteFailed = prometheus.NewCounterVec(prometheus.CounterOpts{
54 Namespace: "etcd",
55 Subsystem: "server",
56 Name: "learner_promote_failures",
57 Help: "The total number of failed learner promotions (likely learner not ready) while this member is leader.",
58 },
59 []string{"Reason"},
60 )
61 learnerPromoteSucceed = prometheus.NewCounter(prometheus.CounterOpts{
62 Namespace: "etcd",
63 Subsystem: "server",
64 Name: "learner_promote_successes",
65 Help: "The total number of successful learner promotions while this member is leader.",
66 })
67 heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
68 Namespace: "etcd",
69 Subsystem: "server",
70 Name: "heartbeat_send_failures_total",
71 Help: "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
72 })
73 slowApplies = prometheus.NewCounter(prometheus.CounterOpts{
74 Namespace: "etcd",
75 Subsystem: "server",
76 Name: "slow_apply_total",
77 Help: "The total number of slow apply requests (likely overloaded from slow disk).",
78 })
79 applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{
80 Namespace: "etcd",
81 Subsystem: "server",
82 Name: "snapshot_apply_in_progress_total",
83 Help: "1 if the server is applying the incoming snapshot. 0 if none.",
84 })
85 proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
86 Namespace: "etcd",
87 Subsystem: "server",
88 Name: "proposals_committed_total",
89 Help: "The total number of consensus proposals committed.",
90 })
91 proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{
92 Namespace: "etcd",
93 Subsystem: "server",
94 Name: "proposals_applied_total",
95 Help: "The total number of consensus proposals applied.",
96 })
97 proposalsPending = prometheus.NewGauge(prometheus.GaugeOpts{
98 Namespace: "etcd",
99 Subsystem: "server",
100 Name: "proposals_pending",
101 Help: "The current number of pending proposals to commit.",
102 })
103 proposalsFailed = prometheus.NewCounter(prometheus.CounterOpts{
104 Namespace: "etcd",
105 Subsystem: "server",
106 Name: "proposals_failed_total",
107 Help: "The total number of failed proposals seen.",
108 })
109 slowReadIndex = prometheus.NewCounter(prometheus.CounterOpts{
110 Namespace: "etcd",
111 Subsystem: "server",
112 Name: "slow_read_indexes_total",
113 Help: "The total number of pending read indexes not in sync with leader's or timed out read index requests.",
114 })
115 readIndexFailed = prometheus.NewCounter(prometheus.CounterOpts{
116 Namespace: "etcd",
117 Subsystem: "server",
118 Name: "read_indexes_failed_total",
119 Help: "The total number of failed read indexes seen.",
120 })
121 leaseExpired = prometheus.NewCounter(prometheus.CounterOpts{
122 Namespace: "etcd_debugging",
123 Subsystem: "server",
124 Name: "lease_expired_total",
125 Help: "The total number of expired leases.",
126 })
127 quotaBackendBytes = prometheus.NewGauge(prometheus.GaugeOpts{
128 Namespace: "etcd",
129 Subsystem: "server",
130 Name: "quota_backend_bytes",
131 Help: "Current backend storage quota size in bytes.",
132 })
133 currentVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
134 Namespace: "etcd",
135 Subsystem: "server",
136 Name: "version",
137 Help: "Which version is running. 1 for 'server_version' label with current version.",
138 },
139 []string{"server_version"})
140 currentGoVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
141 Namespace: "etcd",
142 Subsystem: "server",
143 Name: "go_version",
144 Help: "Which Go version server is running with. 1 for 'server_go_version' label with current version.",
145 },
146 []string{"server_go_version"})
147 serverID = prometheus.NewGaugeVec(prometheus.GaugeOpts{
148 Namespace: "etcd",
149 Subsystem: "server",
150 Name: "id",
151 Help: "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.",
152 },
153 []string{"server_id"})
154
155 fdUsed = prometheus.NewGauge(prometheus.GaugeOpts{
156 Namespace: "os",
157 Subsystem: "fd",
158 Name: "used",
159 Help: "The number of used file descriptors.",
160 })
161 fdLimit = prometheus.NewGauge(prometheus.GaugeOpts{
162 Namespace: "os",
163 Subsystem: "fd",
164 Name: "limit",
165 Help: "The file descriptor limit.",
166 })
167 applySec = prometheus.NewHistogramVec(prometheus.HistogramOpts{
168 Namespace: "etcd",
169 Subsystem: "server",
170 Name: "apply_duration_seconds",
171 Help: "The latency distributions of v2 apply called by backend.",
172
173
174
175 Buckets: prometheus.ExponentialBuckets(0.0001, 2, 20),
176 },
177 []string{"version", "op", "success"})
178 )
179
180 func init() {
181 prometheus.MustRegister(hasLeader)
182 prometheus.MustRegister(isLeader)
183 prometheus.MustRegister(leaderChanges)
184 prometheus.MustRegister(heartbeatSendFailures)
185 prometheus.MustRegister(slowApplies)
186 prometheus.MustRegister(applySnapshotInProgress)
187 prometheus.MustRegister(proposalsCommitted)
188 prometheus.MustRegister(proposalsApplied)
189 prometheus.MustRegister(proposalsPending)
190 prometheus.MustRegister(proposalsFailed)
191 prometheus.MustRegister(slowReadIndex)
192 prometheus.MustRegister(readIndexFailed)
193 prometheus.MustRegister(leaseExpired)
194 prometheus.MustRegister(quotaBackendBytes)
195 prometheus.MustRegister(currentVersion)
196 prometheus.MustRegister(currentGoVersion)
197 prometheus.MustRegister(serverID)
198 prometheus.MustRegister(isLearner)
199 prometheus.MustRegister(learnerPromoteSucceed)
200 prometheus.MustRegister(learnerPromoteFailed)
201 prometheus.MustRegister(fdUsed)
202 prometheus.MustRegister(fdLimit)
203 prometheus.MustRegister(applySec)
204
205 currentVersion.With(prometheus.Labels{
206 "server_version": version.Version,
207 }).Set(1)
208 currentGoVersion.With(prometheus.Labels{
209 "server_go_version": goruntime.Version(),
210 }).Set(1)
211 }
212
213 func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) {
214
215
216
217
218
219 ticker := time.NewTicker(10 * time.Minute)
220 defer ticker.Stop()
221 for {
222 used, err := runtime.FDUsage()
223 if err != nil {
224 lg.Warn("failed to get file descriptor usage", zap.Error(err))
225 return
226 }
227 fdUsed.Set(float64(used))
228 limit, err := runtime.FDLimit()
229 if err != nil {
230 lg.Warn("failed to get file descriptor limit", zap.Error(err))
231 return
232 }
233 fdLimit.Set(float64(limit))
234 if used >= limit/5*4 {
235 lg.Warn("80% of file descriptors are used", zap.Uint64("used", used), zap.Uint64("limit", limit))
236 }
237 select {
238 case <-ticker.C:
239 case <-done:
240 return
241 }
242 }
243 }
244
View as plain text