Skip to content

Commit

Permalink
Asserting when reconciliation loop exceeds certain threshold (#1078)
Browse files Browse the repository at this point in the history
* Asserting when reconciliation loop exceeds certain threshold

Signed-off-by: Ruben Vargas <[email protected]>

* add metrics assertion to github action

Signed-off-by: Ruben Vargas <[email protected]>

* remove skip delete from metrics tasks

Signed-off-by: Ruben Vargas <[email protected]>

* test to make it fail

Signed-off-by: Ruben Vargas <[email protected]>

* add tail log when assert fails

Signed-off-by: Ruben Vargas <[email protected]>

* fix pod selector for logs

Signed-off-by: Ruben Vargas <[email protected]>

* good thresholds

Signed-off-by: Ruben Vargas <[email protected]>

* good thresholds

Signed-off-by: Ruben Vargas <[email protected]>

* high threshold

Signed-off-by: Ruben Vargas <[email protected]>

---------

Signed-off-by: Ruben Vargas <[email protected]>
  • Loading branch information
rubenvp8510 authored Nov 4, 2024
1 parent c43f232 commit e5a8508
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ jobs:
- name: "run tests"
env:
KUBE_VERSION: ${{ matrix.kube-version }}
run: make prepare-e2e e2e KUBE_VERSION=$KUBE_VERSION
run: make prepare-e2e e2e test-operator-metrics KUBE_VERSION=$KUBE_VERSION


upgrade-tests:
name: Upgrade tests ${{ matrix.name }}
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,10 @@ prepare-e2e: chainsaw start-kind cert-manager set-test-image-vars build docker-b
e2e:
$(CHAINSAW) test --test-dir ./tests/e2e

# Run the chainsaw test suite located under ./tests/operator-metrics.
.PHONY: test-operator-metrics
test-operator-metrics:
	$(CHAINSAW) test --test-dir ./tests/operator-metrics

# OpenShift end-to-tests
.PHONY: e2e-openshift
e2e-openshift:
Expand Down
10 changes: 10 additions & 0 deletions tests/operator-metrics/max-loops/00-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
# Expected state for step-00 of the chainsaw test: the ServiceAccount exists
# in the operator namespace ...
apiVersion: v1
kind: ServiceAccount
metadata:
  name: sa-assert-metrics
  namespace: tempo-operator-system
---
# ... and the cluster-scoped binding for it exists as well (matched by name
# only; subjects and roleRef are not asserted here).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: sa-assert-metrics-role-binding
18 changes: 18 additions & 0 deletions tests/operator-metrics/max-loops/00-metrics-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
# ServiceAccount that the verify-metrics Job runs as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: sa-assert-metrics
  namespace: tempo-operator-system
---
# Binds that ServiceAccount to the tempo-operator-metrics-reader ClusterRole.
# NOTE(review): the ClusterRole is defined elsewhere; presumably it grants
# read access to the operator's /metrics endpoint - confirm.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: sa-assert-metrics-role-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: tempo-operator-metrics-reader
subjects:
  - kind: ServiceAccount
    name: sa-assert-metrics
    namespace: tempo-operator-system
9 changes: 9 additions & 0 deletions tests/operator-metrics/max-loops/01-assert-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# Expected end state for step-01: the verify-metrics Job carries a
# Complete condition whose status is "True".
apiVersion: batch/v1
kind: Job
metadata:
  name: verify-metrics
  namespace: tempo-operator-system
status:
  conditions:
    - type: Complete
      status: "True"
62 changes: 62 additions & 0 deletions tests/operator-metrics/max-loops/01-verify-metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
---
# Job that scrapes the operator's metrics endpoint and exits non-zero when the
# number of successful reconciliations reported for a controller is greater
# than that controller's threshold.
apiVersion: batch/v1
kind: Job
metadata:
  name: verify-metrics
  namespace: tempo-operator-system
spec:
  template:
    spec:
      containers:
        - name: verify-metrics
          image: ghcr.io/grafana/tempo-operator/test-utils:main
          env:
            # Upper bounds for controller_runtime_reconcile_total with
            # result="success"; the script falls back to 0 when unset.
            - name: TEMPOMONOLITHIC_THRESHOLD
              value: "1000"
            - name: TEMPOSTACK_THRESHOLD
              value: "1000"
          command:
            - /bin/bash
            - -eux
            - -c
          args:
            - |
              # xtrace (-x) would print the bearer token into the pod logs, so it
              # is switched off while the token is being read and used.
              set +x
              TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
              # --fail makes an HTTP error (401/403/5xx) abort the job; without it
              # the error body would be saved as "metrics", no sample would match
              # and every count would default to 0, passing the check vacuously.
              # -k skips TLS verification of the metrics endpoint (test-only).
              curl -k --fail --silent --show-error -H "Authorization: Bearer $TOKEN" -G https://tempo-operator-controller-manager-metrics-service:8443/metrics -o /tmp/tempo.out
              set -x
              # Define thresholds
              declare -A SUCCESS_THRESHOLDS=(
                ["tempomonolithic"]=${TEMPOMONOLITHIC_THRESHOLD:-0}
                ["tempostack"]=${TEMPOSTACK_THRESHOLD:-0}
              )
              # Initialize counters
              declare -A success_counts=()
              while IFS= read -r line; do
                if [[ $line =~ ^controller_runtime_reconcile_total ]]; then
                  echo "$line"
                  # The parsing takes the value of the first label as the controller
                  # and the value of the second label as the result, i.e. it assumes
                  # samples shaped like name{controller="c",result="r"} value.
                  controller=$(echo "$line" | awk -F'[{}]' '{split($2, a, ","); split(a[1], b, "="); gsub(/"/, "", b[2]); print b[2]}')
                  result=$(echo "$line" | awk -F'[{}]' '{split($2, a, ","); split(a[2], b, "="); gsub(/"/, "", b[2]); print b[2]}')
                  # The sample value is the last whitespace-separated field.
                  value=$(echo "$line" | awk '{print $NF}')
                  if [[ $result == "success" ]]; then
                    success_counts["$controller"]=$value
                  fi
                fi
              done < /tmp/tempo.out
              # Validate counts against thresholds
              for controller in "${!SUCCESS_THRESHOLDS[@]}"; do
                success_count=${success_counts["$controller"]:-0}
                threshold=${SUCCESS_THRESHOLDS["$controller"]}
                # Compare in awk rather than with (( )): sample values are floats
                # and may be rendered in exponent form (e.g. 1e+06), which bash
                # integer arithmetic cannot evaluate - inside "if" that error
                # would count as "not exceeded" and hide a real violation.
                if awk -v count="$success_count" -v limit="$threshold" 'BEGIN { exit !(count + 0 > limit + 0) }'; then
                  echo "Alert: Success count for $controller ($success_count) exceeds threshold ($threshold)."
                  exit 1
                fi
              done
              # Print all success counts at the end
              echo "All metrics are within acceptable limits."
              echo "Success counts:"
              declare -p success_counts
      restartPolicy: Never
      serviceAccountName: sa-assert-metrics
27 changes: 27 additions & 0 deletions tests/operator-metrics/max-loops/chainsaw-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
# Chainsaw test that runs the verify-metrics Job against the operator's
# metrics endpoint and expects it to complete.
apiVersion: chainsaw.kyverno.io/v1alpha1
kind: Test
metadata:
  creationTimestamp: null
  name: operator-metrics
spec:
  steps:
    # Create the ServiceAccount and ClusterRoleBinding the Job runs with,
    # then check that both exist.
    - name: step-00
      try:
        - apply:
            file: 00-metrics-service.yaml
        - assert:
            file: 00-assert.yaml

    # Launch the Job and wait for its Complete condition.
    - name: step-01
      try:
        - apply:
            file: 01-verify-metrics.yaml
        - assert:
            file: 01-assert-job.yaml
      catch:
        # Every resource of this test lives in tempo-operator-system, so the
        # failure diagnostics are collected from that namespace; an events
        # collector without a namespace would presumably look at the test's
        # own namespace instead (NOTE(review): confirm against chainsaw docs).
        - events:
            namespace: tempo-operator-system
        - podLogs:
            selector: job-name=verify-metrics
            namespace: tempo-operator-system
            tail: 10

0 comments on commit e5a8508

Please sign in to comment.