# test/promql/tests/data_plane/availability_metric_integration_service.yaml

# Settings for this promtool rule unit-test file.
# Rules are evaluated once per minute of simulated time.
evaluation_interval: 1m

# Rule file(s) that define the alert exercised by the tests in this file.
rule_files:
  - prometheus.availability_metric_integration_service_alerts.yaml

tests:
  # Test 1: cluster01 goes down for the final hour of a 24-hour window while
  # cluster02 stays up. Exactly one alert, labelled for cluster01, is expected.
  # Input samples are spaced one minute apart (interval: 1m).
  - interval: 1m
    input_series:
      - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster01"}'
        values: '1x1380 0x60'  # ~23 hours reporting 1 (up), then ~1 hour reporting 0 (down).
      - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster02"}'
        values: '1x1440'  # Cluster02 reports 1 (up) for the entire 24-hour period.

    alert_rule_test:
      - eval_time: 1440m  # Evaluate at the 24-hour mark.
        alertname: IntegrationServiceAvailabilitySLOViolation
        exp_alerts:
          # Only cluster01 is expected here; cluster02 never reports downtime.
          # NOTE(review): labels and annotations presumably have to match the
          # rule's rendered output exactly - keep them in sync with the rule file.
          - exp_labels:
              severity: high
              slo: "false"
              source_cluster: cluster01
            exp_annotations:
              summary: Integration Service Availability SLO Violation
              description: >
                Integration Service GitHub App availability has dropped below 99% for more than 10 minutes on cluster cluster01.
              alert_team_handle: <!subteam^S05M4AG8CJH>
              team: integration
              runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/integration-service/sre/integration_service_availability.md

  # Test 2: Baseline - both clusters report 1 (up) for the whole 24-hour window,
  # so no alert is expected for either cluster.
  # Input samples are spaced one minute apart (interval: 1m).
  - interval: 1m
    input_series:
      - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster01"}'
        values: '1x1440'  # Cluster01 reports 1 (up) for the entire period.
      - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster02"}'
        values: '1x1440'  # Cluster02 reports 1 (up) for the entire period.

    alert_rule_test:
      - eval_time: 1440m  # Evaluate at the 24-hour mark.
        alertname: IntegrationServiceAvailabilitySLOViolation
        exp_alerts: []  # Empty list: no alert may be active, since neither cluster reports any downtime.

  # Test 3: Cluster01 has a brief (~15-minute) outage at the very end of the
  # 24-hour window; no alert is expected at the 24-hour mark.
  # NOTE(review): 15 of 1440 minutes is ~1.04% downtime, which is not clearly
  # above the 99% target. The empty expectation presumably relies on the alert's
  # "for more than 10 minutes" hold period not having elapsed yet, rather than
  # on availability staying above 99% - confirm against the rule file.
  - interval: 1m
    input_series:
      - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster01"}'
        values: '1x1425 0x15'  # ~23 hours 45 minutes reporting 1 (up), then ~15 minutes reporting 0 (down).
      - series: 'redhat_appstudio_integrationservice_global_github_app_available{source_cluster="cluster02"}'
        values: '1x1440'  # Cluster02 reports 1 (up) for the entire period.

    alert_rule_test:
      - eval_time: 1440m  # Evaluate at the 24-hour mark.
        alertname: IntegrationServiceAvailabilitySLOViolation
        exp_alerts: []  # Empty list: this short outage must not produce a firing alert (see note above).