...

Text file src/github.com/prometheus/alertmanager/doc/alertmanager-mixin/alerts.libsonnet

Documentation: github.com/prometheus/alertmanager/doc/alertmanager-mixin

     1{
     2  prometheusAlerts+:: {
     3    groups+: [
     4      {
     5        name: 'alertmanager.rules',
     6        rules: [
     7          {
     8            alert: 'AlertmanagerFailedReload',  // Matches when the 5m maximum of alertmanager_config_last_reload_successful equals 0.
     9            expr: |||
    10              # Without max_over_time, failed scrapes could create false negatives, see
    11              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
    12              max_over_time(alertmanager_config_last_reload_successful{%(alertmanagerSelector)s}[5m]) == 0
    13            ||| % $._config,  // The %(alertmanagerSelector)s placeholder is filled in from $._config.
    14            'for': '10m',  // The expression has to keep matching for 10m before the alert fires.
    15            labels: {
    16              severity: 'critical',
    17            },
    18            annotations: {
    19              summary: 'Reloading an Alertmanager configuration has failed.',
    20              description: 'Configuration has failed to load for %(alertmanagerName)s.' % $._config,  // %(alertmanagerName)s also comes from $._config.
    21            },
    22          },
    23          {
    24            alert: 'AlertmanagerMembersInconsistent',  // Matches series whose 5m-max alertmanager_cluster_members is below the number of such series sharing the same cluster labels.
    25            expr: |||
    26              # Without max_over_time, failed scrapes could create false negatives, see
    27              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
    28                max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m])
    29              < on (%(alertmanagerClusterLabels)s) group_left
    30                count by (%(alertmanagerClusterLabels)s) (max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m]))
    31            ||| % $._config,  // Fills in %(alertmanagerSelector)s and %(alertmanagerClusterLabels)s from $._config.
    32            'for': '15m',  // The expression has to keep matching for 15m before the alert fires.
    33            labels: {
    34              severity: 'critical',
    35            },
    36            annotations: {
    37              summary: 'A member of an Alertmanager cluster has not found all other cluster members.',
    38              description: 'Alertmanager %(alertmanagerName)s has only found {{ $value }} members of the %(alertmanagerClusterName)s cluster.' % $._config,  // {{ $value }} is left for Prometheus templating; only the %(...)s placeholders are formatted here.
    39            },
    40          },
    41          {
    42            alert: 'AlertmanagerFailedToSendAlerts',  // Matches when failed notifications exceed 1% of all notifications (5m rates), evaluated per series.
    43            expr: |||
    44              (
    45                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
    46              /
    47                rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
    48              )
    49              > 0.01
    50            ||| % $._config,  // The %(alertmanagerSelector)s placeholder is filled in from $._config.
    51            'for': '5m',  // The expression has to keep matching for 5m before the alert fires.
    52            labels: {
    53              severity: 'warning',  // Warning only; the cluster-wide variants of this check are separate rules in this group.
    54            },
    55            annotations: {
    56              summary: 'An Alertmanager instance failed to send notifications.',
    57              description: 'Alertmanager %(alertmanagerName)s failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.' % $._config,  // $value is the failure ratio computed by expr.
    58            },
    59          },
    60          {
    61            alert: 'AlertmanagerClusterFailedToSendAlerts',  // Critical variant: only integrations matching (=~) alertmanagerCriticalIntegrationsRegEx are selected.
    62            expr: |||
    63              min by (%(alertmanagerClusterLabels)s, integration) (
    64                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
    65              /
    66                rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
    67              )
    68              > 0.01
    69            ||| % $._config,  // Taking the minimum per cluster and integration means the lowest failure ratio in the group must exceed 1%.
    70            'for': '5m',  // The expression has to keep matching for 5m before the alert fires.
    71            labels: {
    72              severity: 'critical',
    73            },
    74            annotations: {
    75              summary: 'All Alertmanager instances in a cluster failed to send notifications to a critical integration.',
    76              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,  // The integration label survives aggregation because it is listed in the "min by" clause.
    77            },
    78          },
    79          {
    80            alert: 'AlertmanagerClusterFailedToSendAlerts',  // Warning variant: same alert name, but selects integrations NOT matching (!~) alertmanagerCriticalIntegrationsRegEx.
    81            expr: |||
    82              min by (%(alertmanagerClusterLabels)s, integration) (
    83                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
    84              /
    85                rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
    86              )
    87              > 0.01
    88            ||| % $._config,  // Taking the minimum per cluster and integration means the lowest failure ratio in the group must exceed 1%.
    89            'for': '5m',  // The expression has to keep matching for 5m before the alert fires.
    90            labels: {
    91              severity: 'warning',
    92            },
    93            annotations: {
    94              summary: 'All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.',
    95              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,  // The integration label survives aggregation because it is listed in the "min by" clause.
    96            },
    97          },
    98          {
    99            alert: 'AlertmanagerConfigInconsistent',  // Matches when a cluster exposes a number of distinct alertmanager_config_hash values other than 1.
   100            expr: |||
   101              count by (%(alertmanagerClusterLabels)s) (
   102                count_values by (%(alertmanagerClusterLabels)s) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})
   103              )
   104              != 1
   105            ||| % $._config,  // count_values yields one series per distinct hash value; the outer count then counts those series per cluster.
   106            'for': '20m',  // A config change across an Alertmanager cluster can take its time. But it's really bad if it persists for too long.
   107            labels: {
   108              severity: 'critical',
   109            },
   110            annotations: {
   111              summary: 'Alertmanager instances within the same cluster have different configurations.',
   112              description: 'Alertmanager instances within the %(alertmanagerClusterName)s cluster have different configurations.' % $._config,  // %(alertmanagerClusterName)s is filled in from $._config.
   113            },
   114          },
   115          // Both the following critical alerts, AlertmanagerClusterDown and
   116          // AlertmanagerClusterCrashlooping, fire if a whole cluster is
   117          // unhealthy. It is implied that a generic warning alert is in place
   118          // for individual instances being down or crashlooping.
   119          {
   120            alert: 'AlertmanagerClusterDown',  // Matches when at least half of a cluster's "up" series averaged below 0.5 over the last 5m.
   121            expr: |||
   122              (
   123                count by (%(alertmanagerClusterLabels)s) (
   124                  avg_over_time(up{%(alertmanagerSelector)s}[5m]) < 0.5
   125                )
   126              /
   127                count by (%(alertmanagerClusterLabels)s) (
   128                  up{%(alertmanagerSelector)s}
   129                )
   130              )
   131              >= 0.5
   132            ||| % $._config,  // Numerator: series below the 0.5 average; denominator: all matching "up" series in the same cluster.
   133            'for': '5m',  // The expression has to keep matching for 5m before the alert fires.
   134            labels: {
   135              severity: 'critical',
   136            },
   137            annotations: {
   138              summary: 'Half or more of the Alertmanager instances within the same cluster are down.',
   139              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have been up for less than half of the last 5m.' % $._config,  // $value is the fraction computed by expr.
   140            },
   141          },
   142          {
   143            alert: 'AlertmanagerClusterCrashlooping',  // Matches when at least half of a cluster's instances changed process_start_time_seconds more than 4 times in 10m.
   144            expr: |||
   145              (
   146                count by (%(alertmanagerClusterLabels)s) (
   147                  changes(process_start_time_seconds{%(alertmanagerSelector)s}[10m]) > 4
   148                )
   149              /
   150                count by (%(alertmanagerClusterLabels)s) (
   151                  up{%(alertmanagerSelector)s}
   152                )
   153              )
   154              >= 0.5
   155            ||| % $._config,  // Numerator: series with more than 4 start-time changes; denominator: all matching "up" series in the same cluster.
   156            'for': '5m',  // The expression has to keep matching for 5m before the alert fires.
   157            labels: {
   158              severity: 'critical',
   159            },
   160            annotations: {
   161              summary: 'Half or more of the Alertmanager instances within the same cluster are crashlooping.',
   162              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have restarted at least 5 times in the last 10m.' % $._config,  // "at least 5 times" corresponds to the "> 4" threshold in expr.
   163            },
   164          },
   165        ],
   166      },
   167    ],
   168  },
   169}

View as plain text