From e5a85084bff093b7539f202bae7282afaf52c85c Mon Sep 17 00:00:00 2001 From: Ruben Vargas Date: Mon, 4 Nov 2024 10:37:44 -0600 Subject: [PATCH] Asserting when reconciliation loop exceeds certain threshold (#1078) * Asserting when reconciliation loop exceeds certain threshold Signed-off-by: Ruben Vargas * add metrics assertion to github action Signed-off-by: Ruben Vargas * remove skip delete from metrics tasks Signed-off-by: Ruben Vargas * test to make it fail Signed-off-by: Ruben Vargas * add tail log when assert fail Signed-off-by: Ruben Vargas * fix pod selector for logs Signed-off-by: Ruben Vargas * good thresholds Signed-off-by: Ruben Vargas * good thresholds Signed-off-by: Ruben Vargas * high threshold Signed-off-by: Ruben Vargas --------- Signed-off-by: Ruben Vargas --- .github/workflows/e2e.yaml | 3 +- Makefile | 4 ++ .../operator-metrics/max-loops/00-assert.yaml | 10 +++ .../max-loops/00-metrics-service.yaml | 18 ++++++ .../max-loops/01-assert-job.yaml | 9 +++ .../max-loops/01-verify-metrics.yaml | 62 +++++++++++++++++++ .../max-loops/chainsaw-test.yaml | 27 ++++++++ 7 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 tests/operator-metrics/max-loops/00-assert.yaml create mode 100644 tests/operator-metrics/max-loops/00-metrics-service.yaml create mode 100644 tests/operator-metrics/max-loops/01-assert-job.yaml create mode 100644 tests/operator-metrics/max-loops/01-verify-metrics.yaml create mode 100755 tests/operator-metrics/max-loops/chainsaw-test.yaml diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 07918cfba..c28e32dbc 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -42,7 +42,8 @@ jobs: - name: "run tests" env: KUBE_VERSION: ${{ matrix.kube-version }} - run: make prepare-e2e e2e KUBE_VERSION=$KUBE_VERSION + run: make prepare-e2e e2e test-operator-metrics KUBE_VERSION=$KUBE_VERSION + upgrade-tests: name: Upgrade tests ${{ matrix.name }} diff --git a/Makefile b/Makefile index 
4cf6707d0..bbdea9450 100644 --- a/Makefile +++ b/Makefile @@ -387,6 +387,10 @@ prepare-e2e: chainsaw start-kind cert-manager set-test-image-vars build docker-b e2e: $(CHAINSAW) test --test-dir ./tests/e2e +.PHONY: test-operator-metrics +test-operator-metrics: + $(CHAINSAW) test --test-dir ./tests/operator-metrics + # OpenShift end-to-tests .PHONY: e2e-openshift e2e-openshift: diff --git a/tests/operator-metrics/max-loops/00-assert.yaml b/tests/operator-metrics/max-loops/00-assert.yaml new file mode 100644 index 000000000..36cb7a1f4 --- /dev/null +++ b/tests/operator-metrics/max-loops/00-assert.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: sa-assert-metrics + namespace: tempo-operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: sa-assert-metrics-role-binding diff --git a/tests/operator-metrics/max-loops/00-metrics-service.yaml b/tests/operator-metrics/max-loops/00-metrics-service.yaml new file mode 100644 index 000000000..c927dccfd --- /dev/null +++ b/tests/operator-metrics/max-loops/00-metrics-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: sa-assert-metrics + namespace: tempo-operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: sa-assert-metrics-role-binding +subjects: + - kind: ServiceAccount + name: sa-assert-metrics + namespace: tempo-operator-system +roleRef: + kind: ClusterRole + name: tempo-operator-metrics-reader + apiGroup: rbac.authorization.k8s.io diff --git a/tests/operator-metrics/max-loops/01-assert-job.yaml b/tests/operator-metrics/max-loops/01-assert-job.yaml new file mode 100644 index 000000000..be9cd95da --- /dev/null +++ b/tests/operator-metrics/max-loops/01-assert-job.yaml @@ -0,0 +1,9 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: verify-metrics + namespace: tempo-operator-system +status: + conditions: + - status: "True" + type: Complete diff --git 
a/tests/operator-metrics/max-loops/01-verify-metrics.yaml b/tests/operator-metrics/max-loops/01-verify-metrics.yaml
new file mode 100644
index 000000000..08fea9a3d
--- /dev/null
+++ b/tests/operator-metrics/max-loops/01-verify-metrics.yaml
@@ -0,0 +1,79 @@
+# Job that scrapes the operator's /metrics endpoint and fails when the number of
+# successful reconciliations reported for a controller exceeds its threshold.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: verify-metrics
+  namespace: tempo-operator-system
+spec:
+  template:
+    spec:
+      containers:
+        - name: verify-metrics
+          image: ghcr.io/grafana/tempo-operator/test-utils:main
+          env:
+            # Upper bounds compared against controller_runtime_reconcile_total{result="success"}.
+            - name: TEMPOMONOLITHIC_THRESHOLD
+              value: "1000"
+            - name: TEMPOSTACK_THRESHOLD
+              value: "1000"
+          command:
+            - /bin/bash
+            - -eux
+            - -c
+          args:
+            - |
+              # Scrape the operator metrics with this pod's ServiceAccount token.
+              # --fail makes curl exit non-zero on HTTP errors (>= 400); without it an
+              # error body would be written to /tmp/tempo.out and parsed as if it were metrics.
+              TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+              curl -k --fail -H "Authorization: Bearer $TOKEN" -G https://tempo-operator-controller-manager-metrics-service:8443/metrics -o /tmp/tempo.out
+
+              # Maximum number of successful reconciliations tolerated per controller.
+              declare -A SUCCESS_THRESHOLDS=(
+                ["tempomonolithic"]=${TEMPOMONOLITHIC_THRESHOLD:-0}
+                ["tempostack"]=${TEMPOSTACK_THRESHOLD:-0}
+              )
+
+              # Initialize counters
+              declare -A success_counts=()
+              samples_seen=0
+              # Labels are matched by name, so their order in the exposition does not matter.
+              controller_re='controller="([^"]*)"'
+              result_re='result="([^"]*)"'
+
+              while IFS= read -r line; do
+                [[ $line =~ ^controller_runtime_reconcile_total ]] || continue
+                printf '%s\n' "$line"
+                samples_seen=$((samples_seen + 1))
+                [[ $line =~ $controller_re ]] || continue
+                controller=${BASH_REMATCH[1]}
+                [[ $line =~ $result_re ]] || continue
+                [[ ${BASH_REMATCH[1]} == "success" ]] || continue
+                # Sample values are floats and may be printed as e.g. 1e+06; bash
+                # arithmetic only accepts integers, so normalize before storing.
+                success_counts["$controller"]=$(awk '{printf "%d", $NF}' <<<"$line")
+              done < /tmp/tempo.out
+
+              # A scrape without any reconcile samples must not pass vacuously.
+              if (( samples_seen == 0 )); then
+                echo "Error: no controller_runtime_reconcile_total samples found in scraped metrics."
+                exit 1
+              fi
+
+              # Validate counts against thresholds
+              for controller in "${!SUCCESS_THRESHOLDS[@]}"; do
+                success_count=${success_counts["$controller"]:-0}
+                threshold=${SUCCESS_THRESHOLDS["$controller"]}
+                if (( success_count > threshold )); then
+                  echo "Alert: Success count for $controller ($success_count) exceeds threshold ($threshold)."
+                  exit 1
+                fi
+              done
+
+              # Print all success counts at the end
+              echo "All metrics are within acceptable limits."
+              echo "Success counts:"
+              declare -p success_counts
+      restartPolicy: Never
+      serviceAccountName: sa-assert-metrics
diff --git a/tests/operator-metrics/max-loops/chainsaw-test.yaml b/tests/operator-metrics/max-loops/chainsaw-test.yaml
new file mode 100755
index 000000000..73c2d2c45
--- /dev/null
+++ b/tests/operator-metrics/max-loops/chainsaw-test.yaml
@@ -0,0 +1,30 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+  creationTimestamp: null
+  name: operator-metrics
+spec:
+  steps:
+  # Create the ServiceAccount and ClusterRoleBinding used by the verify-metrics Job.
+  - name: step-00
+    try:
+    - apply:
+        file: 00-metrics-service.yaml
+    - assert:
+        file: 00-assert.yaml
+
+  # Run the verify-metrics Job and wait for its Complete condition.
+  - name: step-01
+    try:
+    - apply:
+        file: 01-verify-metrics.yaml
+    - assert:
+        file: 01-assert-job.yaml
+    # On failure, collect events and the last lines of the Job pod logs.
+    catch:
+    - events: {}
+    - podLogs:
+        selector: job-name=verify-metrics
+        namespace: tempo-operator-system
+        tail: 10