...

Source file src/go.etcd.io/etcd/server/v3/etcdserver/metrics.go

Documentation: go.etcd.io/etcd/server/v3/etcdserver

     1  // Copyright 2015 The etcd Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package etcdserver
    16  
    17  import (
    18  	goruntime "runtime"
    19  	"time"
    20  
    21  	"go.etcd.io/etcd/api/v3/version"
    22  	"go.etcd.io/etcd/pkg/v3/runtime"
    23  
    24  	"github.com/prometheus/client_golang/prometheus"
    25  	"go.uber.org/zap"
    26  )
    27  
// Prometheus collectors exported by the etcd server package. Every collector
// declared here is registered with the default Prometheus registry in init.
var (
	// Leadership and membership state of this member.
	hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "has_leader",
		Help:      "Whether or not a leader exists. 1 is existence, 0 is not.",
	})
	isLeader = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "is_leader",
		Help:      "Whether or not this member is a leader. 1 if is, 0 otherwise.",
	})
	leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "leader_changes_seen_total",
		Help:      "The number of leader changes seen.",
	})
	isLearner = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "is_learner",
		Help:      "Whether or not this member is a learner. 1 if is, 0 otherwise.",
	})
	// learnerPromoteFailed is partitioned by a single label. Note that the
	// label name "Reason" is capitalized, unlike the other labels in this
	// file; it is part of the exported metric and must not be renamed casually.
	learnerPromoteFailed = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "learner_promote_failures",
		Help:      "The total number of failed learner promotions (likely learner not ready) while this member is leader.",
	},
		[]string{"Reason"},
	)
	learnerPromoteSucceed = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "learner_promote_successes",
		Help:      "The total number of successful learner promotions while this member is leader.",
	})

	// Indicators of an overloaded or slow member.
	heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "heartbeat_send_failures_total",
		Help:      "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
	})
	slowApplies = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "slow_apply_total",
		Help:      "The total number of slow apply requests (likely overloaded from slow disk).",
	})
	// applySnapshotInProgress is a 0/1 gauge even though its exported name
	// carries a "_total" suffix.
	applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "snapshot_apply_in_progress_total",
		Help:      "1 if the server is applying the incoming snapshot. 0 if none.",
	})

	// Consensus proposal progress. proposalsCommitted and proposalsApplied are
	// declared as gauges although their exported names end in "_total".
	proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_committed_total",
		Help:      "The total number of consensus proposals committed.",
	})
	proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_applied_total",
		Help:      "The total number of consensus proposals applied.",
	})
	proposalsPending = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_pending",
		Help:      "The current number of pending proposals to commit.",
	})
	proposalsFailed = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_failed_total",
		Help:      "The total number of failed proposals seen.",
	})

	// Read-index health.
	slowReadIndex = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "slow_read_indexes_total",
		Help:      "The total number of pending read indexes not in sync with leader's or timed out read index requests.",
	})
	readIndexFailed = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "read_indexes_failed_total",
		Help:      "The total number of failed read indexes seen.",
	})
	// leaseExpired is the only collector in this file published under the
	// "etcd_debugging" namespace instead of "etcd".
	leaseExpired = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd_debugging",
		Subsystem: "server",
		Name:      "lease_expired_total",
		Help:      "The total number of expired leases.",
	})
	quotaBackendBytes = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "quota_backend_bytes",
		Help:      "Current backend storage quota size in bytes.",
	})

	// Info-style gauges: the interesting value is carried in the label and the
	// sample itself is set to 1. currentVersion and currentGoVersion are
	// populated in init; serverID is not set anywhere in this file.
	currentVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "version",
		Help:      "Which version is running. 1 for 'server_version' label with current version.",
	},
		[]string{"server_version"})
	currentGoVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "go_version",
		Help:      "Which Go version server is running with. 1 for 'server_go_version' label with current version.",
	},
		[]string{"server_go_version"})
	serverID = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "id",
		Help:      "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.",
	},
		[]string{"server_id"})

	// File descriptor gauges, published under the "os" namespace and refreshed
	// by monitorFileDescriptor.
	fdUsed = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "os",
		Subsystem: "fd",
		Name:      "used",
		Help:      "The number of used file descriptors.",
	})
	fdLimit = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "os",
		Subsystem: "fd",
		Name:      "limit",
		Help:      "The file descriptor limit.",
	})
	// applySec is a latency histogram partitioned by the "version", "op" and
	// "success" labels.
	applySec = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "apply_duration_seconds",
		Help:      "The latency distributions of v2 apply called by backend.",

		// 20 exponential buckets with factor 2: the lowest bucket has an upper
		// bound of 0.0001 sec (0.1 ms), and the highest has an upper bound of
		// 0.0001 sec * 2^19 == 52.4288 sec.
		Buckets: prometheus.ExponentialBuckets(0.0001, 2, 20),
	},
		[]string{"version", "op", "success"})
)
   179  
   180  func init() {
   181  	prometheus.MustRegister(hasLeader)
   182  	prometheus.MustRegister(isLeader)
   183  	prometheus.MustRegister(leaderChanges)
   184  	prometheus.MustRegister(heartbeatSendFailures)
   185  	prometheus.MustRegister(slowApplies)
   186  	prometheus.MustRegister(applySnapshotInProgress)
   187  	prometheus.MustRegister(proposalsCommitted)
   188  	prometheus.MustRegister(proposalsApplied)
   189  	prometheus.MustRegister(proposalsPending)
   190  	prometheus.MustRegister(proposalsFailed)
   191  	prometheus.MustRegister(slowReadIndex)
   192  	prometheus.MustRegister(readIndexFailed)
   193  	prometheus.MustRegister(leaseExpired)
   194  	prometheus.MustRegister(quotaBackendBytes)
   195  	prometheus.MustRegister(currentVersion)
   196  	prometheus.MustRegister(currentGoVersion)
   197  	prometheus.MustRegister(serverID)
   198  	prometheus.MustRegister(isLearner)
   199  	prometheus.MustRegister(learnerPromoteSucceed)
   200  	prometheus.MustRegister(learnerPromoteFailed)
   201  	prometheus.MustRegister(fdUsed)
   202  	prometheus.MustRegister(fdLimit)
   203  	prometheus.MustRegister(applySec)
   204  
   205  	currentVersion.With(prometheus.Labels{
   206  		"server_version": version.Version,
   207  	}).Set(1)
   208  	currentGoVersion.With(prometheus.Labels{
   209  		"server_go_version": goruntime.Version(),
   210  	}).Set(1)
   211  }
   212  
   213  func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) {
   214  	// This ticker will check File Descriptor Requirements ,and count all fds in used.
   215  	// And recorded some logs when in used >= limit/5*4. Just recorded message.
   216  	// If fds was more than 10K,It's low performance due to FDUsage() works.
   217  	// So need to increase it.
   218  	// See https://github.com/etcd-io/etcd/issues/11969 for more detail.
   219  	ticker := time.NewTicker(10 * time.Minute)
   220  	defer ticker.Stop()
   221  	for {
   222  		used, err := runtime.FDUsage()
   223  		if err != nil {
   224  			lg.Warn("failed to get file descriptor usage", zap.Error(err))
   225  			return
   226  		}
   227  		fdUsed.Set(float64(used))
   228  		limit, err := runtime.FDLimit()
   229  		if err != nil {
   230  			lg.Warn("failed to get file descriptor limit", zap.Error(err))
   231  			return
   232  		}
   233  		fdLimit.Set(float64(limit))
   234  		if used >= limit/5*4 {
   235  			lg.Warn("80% of file descriptors are used", zap.Uint64("used", used), zap.Uint64("limit", limit))
   236  		}
   237  		select {
   238  		case <-ticker.C:
   239  		case <-done:
   240  			return
   241  		}
   242  	}
   243  }
   244  

View as plain text