// Alerting rules about Alertmanager itself, appended to the mixin's
// `prometheusAlerts` as the rule group 'alertmanager.rules'.
//
// Every expression and description below is a format string that is filled
// in from `$._config`. The keys referenced in this file are:
//   alertmanagerSelector, alertmanagerClusterLabels, alertmanagerName,
//   alertmanagerClusterName, alertmanagerCriticalIntegrationsRegEx.
{
  // Alias for the configuration object that every template below is
  // formatted with.
  local cfg = $._config,

  prometheusAlerts+:: {
    groups+: [
      {
        name: 'alertmanager.rules',
        rules: [
          // The reload-success gauge stayed at 0 for the whole 5m window.
          {
            alert: 'AlertmanagerFailedReload',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_config_last_reload_successful{%(alertmanagerSelector)s}[5m]) == 0
            ||| % cfg,
            'for': '10m',
            labels: { severity: 'critical' },
            annotations: {
              summary: 'Reloading an Alertmanager configuration has failed.',
              description: 'Configuration has failed to load for %(alertmanagerName)s.' % cfg,
            },
          },

          // An instance reports fewer cluster members than the number of
          // instances exposing the metric under the same cluster labels.
          {
            alert: 'AlertmanagerMembersInconsistent',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
                max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m])
              < on (%(alertmanagerClusterLabels)s) group_left
                count by (%(alertmanagerClusterLabels)s) (max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m]))
            ||| % cfg,
            'for': '15m',
            labels: { severity: 'critical' },
            annotations: {
              summary: 'A member of an Alertmanager cluster has not found all other cluster members.',
              description: 'Alertmanager %(alertmanagerName)s has only found {{ $value }} members of the %(alertmanagerClusterName)s cluster.' % cfg,
            },
          },

          // Per series: the ratio of failed to total notification rate over
          // 5m is above 1%.
          {
            alert: 'AlertmanagerFailedToSendAlerts',
            expr: |||
              (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
              /
                rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
              )
              > 0.01
            ||| % cfg,
            'for': '5m',
            labels: { severity: 'warning' },
            annotations: {
              summary: 'An Alertmanager instance failed to send notifications.',
              description: 'Alertmanager %(alertmanagerName)s failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.' % cfg,
            },
          },

          // Cluster-wide variant for integrations matching the critical
          // regex: the minimum failure ratio across the cluster is above 1%.
          {
            alert: 'AlertmanagerClusterFailedToSendAlerts',
            expr: |||
              min by (%(alertmanagerClusterLabels)s, integration) (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              /
                rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              )
              > 0.01
            ||| % cfg,
            'for': '5m',
            labels: { severity: 'critical' },
            annotations: {
              summary: 'All Alertmanager instances in a cluster failed to send notifications to a critical integration.',
              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % cfg,
            },
          },

          // Same alert name and expression shape as the rule above, but for
          // integrations NOT matching the critical regex, at warning severity.
          {
            alert: 'AlertmanagerClusterFailedToSendAlerts',
            expr: |||
              min by (%(alertmanagerClusterLabels)s, integration) (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              /
                rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              )
              > 0.01
            ||| % cfg,
            'for': '5m',
            labels: { severity: 'warning' },
            annotations: {
              summary: 'All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.',
              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % cfg,
            },
          },

          // The number of distinct config hash values seen within a cluster
          // is not exactly one.
          {
            alert: 'AlertmanagerConfigInconsistent',
            expr: |||
              count by (%(alertmanagerClusterLabels)s) (
                count_values by (%(alertmanagerClusterLabels)s) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})
              )
              != 1
            ||| % cfg,
            // Rolling a config change out across a cluster may take a while,
            // hence the long hold time; a mismatch that persists beyond it is
            // treated as critical.
            'for': '20m',
            labels: { severity: 'critical' },
            annotations: {
              summary: 'Alertmanager instances within the same cluster have different configurations.',
              description: 'Alertmanager instances within the %(alertmanagerClusterName)s cluster have different configurations.' % cfg,
            },
          },

          // The two critical alerts below, AlertmanagerClusterDown and
          // AlertmanagerClusterCrashlooping, only fire when a whole cluster
          // is unhealthy. A generic warning alert for individual instances
          // being down or crashlooping is assumed to exist elsewhere.

          // At least half of the cluster's instances had an average `up`
          // below 0.5 over the last 5m.
          {
            alert: 'AlertmanagerClusterDown',
            expr: |||
              (
                count by (%(alertmanagerClusterLabels)s) (
                  avg_over_time(up{%(alertmanagerSelector)s}[5m]) < 0.5
                )
              /
                count by (%(alertmanagerClusterLabels)s) (
                  up{%(alertmanagerSelector)s}
                )
              )
              >= 0.5
            ||| % cfg,
            'for': '5m',
            labels: { severity: 'critical' },
            annotations: {
              summary: 'Half or more of the Alertmanager instances within the same cluster are down.',
              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have been up for less than half of the last 5m.' % cfg,
            },
          },

          // At least half of the cluster's instances changed their process
          // start time more than 4 times within 10m.
          {
            alert: 'AlertmanagerClusterCrashlooping',
            expr: |||
              (
                count by (%(alertmanagerClusterLabels)s) (
                  changes(process_start_time_seconds{%(alertmanagerSelector)s}[10m]) > 4
                )
              /
                count by (%(alertmanagerClusterLabels)s) (
                  up{%(alertmanagerSelector)s}
                )
              )
              >= 0.5
            ||| % cfg,
            'for': '5m',
            labels: { severity: 'critical' },
            annotations: {
              summary: 'Half or more of the Alertmanager instances within the same cluster are crashlooping.',
              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have restarted at least 5 times in the last 10m.' % cfg,
            },
          },
        ],
      },
    ],
  },
}
View as plain text