-
Notifications
You must be signed in to change notification settings - Fork 54
/
Copy pathavailability_metric_integration_service.yaml
55 lines (48 loc) · 2.81 KB
/
availability_metric_integration_service.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# promtool rule unit tests for the IntegrationServiceAvailabilitySLOViolation alert.

# Interval at which the rules under test are evaluated during the simulation.
evaluation_interval: 1m

# Rule file(s) that define the alert exercised by the tests below.
rule_files:
- prometheus.availability_metric_integration_service_alerts.yaml

# Each test supplies synthetic input series and the alerts expected at a given
# evaluation time.
tests:
# Test 1: cluster01 is unavailable for the final hour of the 24-hour window
# while cluster02 stays up, so the alert is expected for cluster01 only.
- interval: 1m
  input_series:
  # promtool expanding notation: '1x1380' is the value 1 followed by 1380
  # repeats, one sample per `interval`.
  - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster01"}'
    values: '1x1380 0x60'  # Roughly 23 hours up, then 1 hour down.
  - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster02"}'
    values: '1x1440'  # cluster02 is up for the whole 24-hour period.
  alert_rule_test:
  - eval_time: 1440m  # Evaluate at the end of the 24-hour window.
    alertname: IntegrationServiceAvailabilitySLOViolation
    exp_alerts:
    # Labels and annotations must match the firing alert exactly.
    - exp_labels:
        severity: high
        slo: "false"
        source_cluster: cluster01
      exp_annotations:
        summary: Integration Service Availability SLO Violation
        # Folded scalar with its trailing newline kept (`>`, not `>-`).
        description: >
          Integration Service GitHub App availability has dropped below 99% for more than 10 minutes on cluster cluster01.
        alert_team_handle: <!subteam^S05M4AG8CJH>
        team: integration
        runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/integration-service/sre/integration_service_availability.md
# Test 2: both clusters report the GitHub App as available for the entire
# 24-hour window, so no alert is expected.
- interval: 1m
  input_series:
  - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster01"}'
    values: '1x1440'  # cluster01: value 1 for every sample.
  - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster02"}'
    values: '1x1440'  # cluster02: value 1 for every sample.
  alert_rule_test:
  - eval_time: 1440m  # Evaluate at the end of the 24-hour window.
    alertname: IntegrationServiceAvailabilitySLOViolation
    exp_alerts: []  # Empty list: no alert may be firing with 100% uptime.
# Test 3: cluster01 is down only for the last 15 minutes of the 24-hour window
# and no alert is expected at the end of that window.
# NOTE(review): 15 of 1440 minutes is ~1.04% downtime, so the 24-hour
# availability ends marginally *below* 99% (~98.96%), not above it. Presumably
# nothing fires because the shortfall has not yet persisted for the
# "more than 10 minutes" the alert requires - confirm against the rule file.
- interval: 1m
  input_series:
  - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster01"}'
    values: '1x1425 0x15'  # Roughly 23h45m up, then 15 minutes down.
  - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster02"}'
    values: '1x1440'  # cluster02 is up for the whole period.
  alert_rule_test:
  - eval_time: 1440m  # Evaluate at the end of the 24-hour window.
    alertname: IntegrationServiceAvailabilitySLOViolation
    exp_alerts: []  # No firing alert expected at this evaluation time.