...

Source file src/edge-infra.dev/pkg/edge/chariot/metrics.go

Documentation: edge-infra.dev/pkg/edge/chariot

     1  package chariot
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/prometheus/client_golang/prometheus"
    10  	"github.com/prometheus/client_golang/prometheus/promauto"
    11  	"google.golang.org/api/googleapi"
    12  )
    13  
// Names of the Prometheus counter vectors exposed by chariot. Every name is a
// key in both MetricsCounterVecHelp (help text) and MetricsLabels (label names).
const (
	MetricNameDeadlineExceededTotal  = "chariot2_deadline_exceeded_total"
	MetricNameErrorsTotal            = "chariot2_errors_total"
	MetricNameGoogleAPIErrorsTotal   = "chariot2_google_api_errors_total"
	MetricNameRequestsTotal          = "chariot2_requests_total"
	MetricNameStorageOperationsTotal = "chariot2_storage_operations_total"
)
    21  
// MetricsCounterVecHelp maps each metric name to the help text used when its
// counter vector is created. NewMetrics creates one counter vector per entry in
// this map.
var MetricsCounterVecHelp = map[string]string{
	MetricNameDeadlineExceededTotal:  "tracks how many times context.DeadlineExceeded errors result in a failed request",
	MetricNameErrorsTotal:            "tracks the total amount of errors that caused a request to fail",
	MetricNameGoogleAPIErrorsTotal:   "tracks every google api error",
	MetricNameRequestsTotal:          "Total amount of requests, their operation, and whether they succeeded",
	MetricNameStorageOperationsTotal: "Total amount of storage operations chariot performs",
}
    29  
// MetricsLabels maps each metric name to the label names of its counter vector.
// NewMetrics looks the labels up by metric name, and the Inc* methods on Metrics
// supply a value for every label listed here.
var MetricsLabels = map[string][]string{
	MetricNameDeadlineExceededTotal:  {"timeout"},
	MetricNameErrorsTotal:            {"operation"},
	MetricNameGoogleAPIErrorsTotal:   {"status_code", "reason"},
	MetricNameRequestsTotal:          {"operation", "successful"},
	MetricNameStorageOperationsTotal: {"operation", "dir", "scope", "result"},
}
    37  
// Metrics bundles the counter vectors that chariot reports. Its fields are
// populated by NewMetrics; each field is updated through the matching Inc*
// method.
type Metrics struct {
	deadlineExceededTotal *prometheus.CounterVec // updated by IncDeadlineExceededTotal
	errorsTotal           *prometheus.CounterVec // updated by IncErrorsTotal
	googleAPIErrorsTotal  *prometheus.CounterVec // updated by IncGoogleAPIErrorsTotal
	requestsTotal         *prometheus.CounterVec // updated by IncRequestsTotal
	storageOpsTotal       *prometheus.CounterVec // updated by IncStorageOperationsTotal
}
    45  
// metricsOnce guards the one-time creation of the package's Metrics value and
// caches the result in m.
//
// Because the counters are only created on the first NewMetrics call, merely
// importing chariot does not register the chariot2 metrics. Creating them only
// once also prevents a promauto panic (presumably the one raised when the same
// collector is registered twice).
var metricsOnce struct {
	sync.Once
	m *Metrics // assigned inside the Once by NewMetrics and returned on every call
}
    52  
    53  func NewMetrics() *Metrics {
    54  	metricsOnce.Do(func() {
    55  		var counters = make(map[string]*prometheus.CounterVec)
    56  		for name, help := range MetricsCounterVecHelp {
    57  			counters[name] = promauto.NewCounterVec(prometheus.CounterOpts{
    58  				Name: name,
    59  				Help: help,
    60  			}, MetricsLabels[name])
    61  		}
    62  		metricsOnce.m = &Metrics{
    63  			deadlineExceededTotal: counters[MetricNameDeadlineExceededTotal],
    64  			errorsTotal:           counters[MetricNameErrorsTotal],
    65  			googleAPIErrorsTotal:  counters[MetricNameGoogleAPIErrorsTotal],
    66  			requestsTotal:         counters[MetricNameRequestsTotal],
    67  			storageOpsTotal:       counters[MetricNameStorageOperationsTotal],
    68  		}
    69  	})
    70  	return metricsOnce.m
    71  }
    72  
    73  func (m *Metrics) IncErrorsTotal(req Request) {
    74  	var l = prometheus.Labels{"operation": req.Operation}
    75  	m.errorsTotal.With(l).Inc()
    76  }
    77  
    78  func (m *Metrics) IncGoogleAPIErrorsTotal(err error) {
    79  	var gerr *googleapi.Error
    80  	if !errors.As(err, &gerr) {
    81  		return // return without doing anything
    82  	}
    83  	var l = prometheus.Labels{
    84  		"status_code": fmt.Sprint(gerr.Code),
    85  	}
    86  
    87  	// The googleapi.Error might provide an Errors field which contains more detailed reasons for the failure.
    88  	if len(gerr.Errors) == 0 {
    89  		l["reason"] = "unknown"
    90  		m.googleAPIErrorsTotal.With(l).Inc()
    91  	} else {
    92  		// Not sure if we'll ever see multiple errors, but loop through them anyway. Increase the count for each
    93  		// ErrorItem in Errors.
    94  		for _, ei := range gerr.Errors {
    95  			l["reason"] = ei.Reason // Reason is a typed error code, and therefore has bound orthogonality.
    96  			m.googleAPIErrorsTotal.With(l).Inc()
    97  		}
    98  	}
    99  }
   100  
   101  func (m *Metrics) IncDeadlineExceededTotal(timeout time.Duration) {
   102  	var l = prometheus.Labels{
   103  		"timeout": fmt.Sprint(timeout),
   104  	}
   105  	m.deadlineExceededTotal.With(l).Inc()
   106  }
   107  
   108  func (m *Metrics) IncRequestsTotal(req Request, successful bool) {
   109  	var op = req.Operation
   110  	if op == "" {
   111  		op = "unknown" // this will be set whenever requests can't be unmarshaled.
   112  	}
   113  	var l = prometheus.Labels{
   114  		"operation":  op,
   115  		"successful": fmt.Sprint(successful),
   116  	}
   117  	m.requestsTotal.With(l).Inc()
   118  }
   119  
   120  func (m *Metrics) IncStorageOperationsTotal(req Request, si StorageInfo) {
   121  	var op = req.Operation
   122  	var dir = req.Dir    // orthogonality is not a concern since the dirs are (should be) limited to a small whitelisted set. Good to know when it is used.
   123  	var scope = "banner" // Scope is whether it's a banner-wide or cluster-wide storage operation.
   124  	if req.Cluster != "" {
   125  		scope = "cluster"
   126  	}
   127  
   128  	var mets = map[string]int{
   129  		OperationCreate: len(si.ObjectsPut),
   130  		OperationDelete: len(si.ObjectsDeleted),
   131  		"DNE":           len(si.ObjectsDoNotExist),
   132  		"ERROR":         len(si.Errors),
   133  	}
   134  
   135  	for result, count := range mets {
   136  		if count != 0 {
   137  			var l = prometheus.Labels{
   138  				"operation": op,
   139  				"dir":       dir,
   140  				"scope":     scope,
   141  				"result":    result,
   142  			}
   143  			m.storageOpsTotal.With(l).Add(float64(count))
   144  		}
   145  	}
   146  }
   147  

View as plain text