From 78fedc0147e31b83b10815f88d68338c1a53a5e3 Mon Sep 17 00:00:00 2001 From: Sergio Maria Matone Date: Thu, 21 Nov 2024 15:38:57 +0100 Subject: [PATCH] chore(otel): Open Telemetry metrics fixed and provided with demo example (#3038) * Redefine and cleanup Open Telemetry metrics types and usage * Overhauled demo example adding a minimal sample of RPC + Validator Node --- gno.land/pkg/sdk/vm/handler.go | 30 -- misc/telemetry/README.md | 35 ++- misc/telemetry/docker-compose.yml | 91 ++++-- misc/telemetry/gnoland/Dockerfile | 13 - misc/telemetry/gnoland/setup.sh | 19 -- .../dashboards}/dashboards.yaml | 2 +- .../dashboards/gno-otel-dashboards.json} | 275 +++++++++--------- .../datasources}/datasources.yaml | 0 misc/telemetry/supernova.Dockerfile | 12 - tm2/pkg/telemetry/config/config.go | 4 +- tm2/pkg/telemetry/metrics/metrics.go | 62 ++-- 11 files changed, 262 insertions(+), 281 deletions(-) delete mode 100644 misc/telemetry/gnoland/Dockerfile delete mode 100644 misc/telemetry/gnoland/setup.sh rename misc/telemetry/grafana/{ => provisioning/dashboards}/dashboards.yaml (66%) rename misc/telemetry/grafana/{gno-dashboards.json => provisioning/dashboards/gno-otel-dashboards.json} (81%) rename misc/telemetry/grafana/{ => provisioning/datasources}/datasources.yaml (100%) delete mode 100644 misc/telemetry/supernova.Dockerfile diff --git a/gno.land/pkg/sdk/vm/handler.go b/gno.land/pkg/sdk/vm/handler.go index 7b26265f35d..c484e07e887 100644 --- a/gno.land/pkg/sdk/vm/handler.go +++ b/gno.land/pkg/sdk/vm/handler.go @@ -1,17 +1,12 @@ package vm import ( - "context" "fmt" "strings" abci "github.com/gnolang/gno/tm2/pkg/bft/abci/types" "github.com/gnolang/gno/tm2/pkg/sdk" "github.com/gnolang/gno/tm2/pkg/std" - "github.com/gnolang/gno/tm2/pkg/telemetry" - "github.com/gnolang/gno/tm2/pkg/telemetry/metrics" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/metric" ) type vmHandler struct { @@ -107,34 +102,9 @@ func (vh vmHandler) Query(ctx sdk.Context, req 
abci.RequestQuery) abci.ResponseQ secondPart(req.Path), req.Path))) } - // Log the telemetry - logQueryTelemetry(path, res.IsErr()) - return res } -// logQueryTelemetry logs the relevant VM query telemetry -func logQueryTelemetry(path string, isErr bool) { - if !telemetry.MetricsEnabled() { - return - } - - metrics.VMQueryCalls.Add( - context.Background(), - 1, - metric.WithAttributes( - attribute.KeyValue{ - Key: "path", - Value: attribute.StringValue(path), - }, - ), - ) - - if isErr { - metrics.VMQueryErrors.Add(context.Background(), 1) - } -} - // queryPackage fetch a package's files. func (vh vmHandler) queryPackage(ctx sdk.Context, req abci.RequestQuery) (res abci.ResponseQuery) { res.Data = []byte(fmt.Sprintf("TODO: parse parts get or make fileset...")) diff --git a/misc/telemetry/README.md b/misc/telemetry/README.md index 41628cc5f51..e762bd1d630 100644 --- a/misc/telemetry/README.md +++ b/misc/telemetry/README.md @@ -1,4 +1,4 @@ -## Overview +# Open Telemetry overview The purpose of this Telemetry documentation is to showcase the different node metrics exposed by the Gno node through OpenTelemetry, without having to do extraneous setup. @@ -8,9 +8,21 @@ The containerized setup is the following: - Grafana dashboard - Prometheus - OpenTelemetry collector (separate service that needs to run) -- Single Gnoland node, with 1s block times and configured telemetry (enabled) +- 1 RPC Gnoland node, with 1s block times and configured telemetry (enabled) +- 1 Validator Gnoland node, with 1s block times and configured telemetry (enabled) - Supernova process that simulates load periodically (generates network traffic) +## Metrics type + +Metrics collected are defined within codebase at `tm2/pkg/telemetry/metrics/metrics.go`. +They are collected by the OTEL collector who forwards them to Prometheus. 
+ +They are of three different types which can be used in Grafana by adding different types of suffixes to the metric name: + +- Histogram ("_sum", "_count", "_bucket"): Collect variations of values over time +- Gauge: Measure a single value at the time it is read +- Counter ("_total"): A value that accumulates over time + ## Starting the containers ### Step 1: Spinning up Docker @@ -18,7 +30,7 @@ The containerized setup is the following: Make sure you have Docker installed and running on your system. After that, within the `misc/telemetry` folder run the following command: -```shell +```bash make up ``` @@ -26,21 +38,14 @@ This will build out the required Docker images for this simulation, and start th ### Step 2: Open Grafana -When you've verified that the `telemetry` containers are up and running, head on over to http://localhost:3000 to open +When you've verified that the `telemetry` containers are up and running, head on over to <http://localhost:3000> to open the Grafana dashboard. -Default login details: - -``` -username: admin -password: admin -``` - -After you've logged in (you can skip setting a new password), on the left hand side, click on -`Dashboards -> Gno -> Gno Node Metrics`: +After you've logged in, on the left hand side, click on +`Dashboards -> Gno -> Gno Open Telemetry Metrics`: ![Grafana](assets/grafana-1.jpeg) -This will open up the predefined Gno Metrics dashboards (added for ease of use) : +This will open up the predefined Gno Metrics dashboards (added for ease of use): ![Metrics Dashboard](assets/grafana-2.jpeg) Periodically, these metrics will be updated as the `supernova` process is simulating network traffic. @@ -53,4 +58,4 @@ To stop the cluster, you can run: make down ``` -which will stop the Docker containers. Additionally, you can delete the Docker volumes with `make clean`. \ No newline at end of file +which will stop the Docker containers. Additionally, you can delete the Docker volumes with `make clean`. 
diff --git a/misc/telemetry/docker-compose.yml b/misc/telemetry/docker-compose.yml index 91c2ea3471d..89c7a924e08 100644 --- a/misc/telemetry/docker-compose.yml +++ b/misc/telemetry/docker-compose.yml @@ -9,6 +9,7 @@ services: - ./collector/collector.yaml:/etc/otelcol-contrib/config.yaml networks: - gnoland-net + prometheus: image: prom/prometheus:latest command: @@ -21,34 +22,90 @@ services: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml networks: - gnoland-net + grafana: - image: grafana/grafana-enterprise + image: grafana/grafana + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin volumes: - grafana_data:/var/lib/grafana - - ./grafana/datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml - - ./grafana/dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml - - ./grafana/gno-dashboards.json:/var/lib/grafana/dashboards/gno-dashboards.json + - ./grafana/provisioning:/etc/grafana/provisioning ports: - "3000:3000" networks: - gnoland-net - gnoland: - build: - context: ./gnoland - dockerfile: Dockerfile - ports: - - "26657:26657" + + gnoland-val: + image: ghcr.io/gnolang/gno/gnoland:master networks: - gnoland-net + volumes: + # Shared Volume + - gnoland-shared:/gnoroot/shared-data + entrypoint: + - sh + - -c + # Recreate gno genesis from git :( + - | + gnoland secrets init + rm -f /gnoroot/shared-data/node_p2p.id + apk add git make go linux-headers + git clone https://github.com/gnolang/gno.git --single-branch gnoland-src + GOPATH='/usr/' make -C gnoland-src/contribs/gnogenesis/ + gnogenesis generate + gnogenesis validator add -name val000 -address $(gnoland secrets get validator_key.address -raw) -pub-key $(gnoland secrets get validator_key.pub_key -raw) + gnogenesis balances add -balance-sheet /gnoroot/gno.land/genesis/genesis_balances.txt + gnogenesis txs add packages /gnoroot/examples/gno.land + gnoland config init + gnoland config set consensus.timeout_commit 1s + gnoland config set 
moniker val000 + gnoland config set telemetry.enabled true + gnoland config set telemetry.exporter_endpoint collector:4317 + gnoland config set telemetry.service_instance_id val0 + gnoland secrets get node_id.id -raw > /gnoroot/shared-data/node_p2p.id + cp /gnoroot/genesis.json /gnoroot/shared-data/genesis.json + gnoland start + healthcheck: + test: ["CMD-SHELL", "test -f /gnoroot/shared-data/node_p2p.id || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 60s + + gnoland-rpc: + image: ghcr.io/gnolang/gno/gnoland:master + networks: + - gnoland-net + volumes: + # Shared Volume + - gnoland-shared:/gnoroot/shared-data + entrypoint: + - sh + - -c + - | + gnoland secrets init + gnoland config init + gnoland config set consensus.timeout_commit 1s + gnoland config set moniker rpc0 + gnoland config set rpc.laddr tcp://0.0.0.0:26657 + gnoland config set telemetry.enabled true + gnoland config set telemetry.service_instance_id rpc000 + gnoland config set telemetry.exporter_endpoint collector:4317 + gnoland config set p2p.persistent_peers "$(cat /gnoroot/shared-data/node_p2p.id)@gnoland-val:26656" + gnoland start -genesis /gnoroot/shared-data/genesis.json + depends_on: + gnoland-val: + condition: service_healthy + restart: true + supernova: - build: - dockerfile: supernova.Dockerfile - args: - supernova_version: v1.2.1 + image: ghcr.io/gnolang/supernova:1.3.1 command: > - -sub-accounts 10 -transactions 200 -url http://gnoland:26657 + -sub-accounts 10 -transactions 100 -url http://gnoland-rpc:26657 -mnemonic "source bonus chronic canvas draft south burst lottery vacant surface solve popular case indicate oppose farm nothing bullet exhibit title speed wink action roast" - restart: always + -mode PACKAGE_DEPLOYMENT + restart: unless-stopped networks: - gnoland-net @@ -61,5 +118,5 @@ volumes: driver: local grafana_data: driver: local - gnoland: + gnoland-shared: driver: local diff --git a/misc/telemetry/gnoland/Dockerfile b/misc/telemetry/gnoland/Dockerfile 
deleted file mode 100644 index c8a89e1a634..00000000000 --- a/misc/telemetry/gnoland/Dockerfile +++ /dev/null @@ -1,13 +0,0 @@ -# Use the existing gno image as the base image -FROM ghcr.io/gnolang/gno/gnoland:master AS base - -# Copy the setup script into the container -COPY ./setup.sh . - -# Make the script executable -RUN chmod +x ./setup.sh - -# Run the setup -ENTRYPOINT ["sh"] - -CMD ["./setup.sh"] \ No newline at end of file diff --git a/misc/telemetry/gnoland/setup.sh b/misc/telemetry/gnoland/setup.sh deleted file mode 100644 index 12cc418ac67..00000000000 --- a/misc/telemetry/gnoland/setup.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# Initialize the node config -gnoland config init --config-path /gnoroot/gnoland-data/config/config.toml - -# Set the block time to 1s -gnoland config set --config-path /gnoroot/gnoland-data/config/config.toml consensus.timeout_commit 1s - -# Set the listen address -gnoland config set --config-path /gnoroot/gnoland-data/config/config.toml rpc.laddr tcp://0.0.0.0:26657 - -# Enable the metrics -gnoland config set --config-path /gnoroot/gnoland-data/config/config.toml telemetry.enabled true - -# Set the metrics exporter endpoint -gnoland config set --config-path /gnoroot/gnoland-data/config/config.toml telemetry.exporter_endpoint collector:4317 - -# Start the Gnoland node (lazy will init the genesis.json and secrets) -gnoland start --lazy \ No newline at end of file diff --git a/misc/telemetry/grafana/dashboards.yaml b/misc/telemetry/grafana/provisioning/dashboards/dashboards.yaml similarity index 66% rename from misc/telemetry/grafana/dashboards.yaml rename to misc/telemetry/grafana/provisioning/dashboards/dashboards.yaml index 6a70278b8a1..694ea8c2803 100644 --- a/misc/telemetry/grafana/dashboards.yaml +++ b/misc/telemetry/grafana/provisioning/dashboards/dashboards.yaml @@ -5,4 +5,4 @@ providers: folder: Gno type: file options: - path: /var/lib/grafana/dashboards \ No newline at end of file + path: 
/etc/grafana/provisioning/dashboards diff --git a/misc/telemetry/grafana/gno-dashboards.json b/misc/telemetry/grafana/provisioning/dashboards/gno-otel-dashboards.json similarity index 81% rename from misc/telemetry/grafana/gno-dashboards.json rename to misc/telemetry/grafana/provisioning/dashboards/gno-otel-dashboards.json index 58373bfb1ee..55880699b7f 100644 --- a/misc/telemetry/grafana/gno-dashboards.json +++ b/misc/telemetry/grafana/provisioning/dashboards/gno-otel-dashboards.json @@ -19,7 +19,6 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 2, "links": [], "panels": [ { @@ -37,13 +36,14 @@ }, { "datasource": { + "default": true, "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" }, "mappings": [], "thresholds": { @@ -64,24 +64,25 @@ "x": 0, "y": 1 }, - "id": 24, + "id": 26, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ - "lastNotNull" + "mean" ], "fields": "", "values": false }, - "showPercentChange": false, + "showPercentChange": true, "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -89,14 +90,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "vm_query_errors_counter", + "expr": "rate(vm_gas_used_hist_sum{exported_instance=~\"${node}\"}[$__rate_interval])/rate(vm_gas_used_hist_count{exported_instance=~\"${node}\"}[$__rate_interval])", "instant": false, - "legendFormat": "__auto", + "legendFormat": "{{operation}}", "range": true, "refId": "A" } ], - "title": "Total Number of VM Query Errors", + "title": "Average Gas Used by VM execution - Node: ${node}", "type": "stat" }, { @@ -107,7 +108,7 @@ "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" }, "mappings": [], "thresholds": { @@ 
-128,76 +129,13 @@ "x": 12, "y": 1 }, - "id": 25, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.4.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "vm_query_calls_counter", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "Total Number of VM Query Calls", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 9 - }, - "id": 26, + "id": 27, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -205,11 +143,11 @@ "fields": "", "values": false }, - "showPercentChange": false, + "showPercentChange": true, "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -217,25 +155,26 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vm_gas_used_hist_sum[10m])/rate(vm_gas_used_hist_count[10m])", + "expr": "rate(vm_cpu_cycles_hist_sum{exported_instance=~\"${node}\"}[$__rate_interval])/rate(vm_cpu_cycles_hist_count{exported_instance=~\"${node}\"}[$__rate_interval])", "instant": false, - "legendFormat": "__auto", + "legendFormat": "{{operation}}", "range": true, "refId": "A" } ], - "title": "Average Gas Used by VM execution [10min]", + "title": "Average CPU Cycles in VM 
execution - Node: ${node}", "type": "stat" }, { "datasource": { + "default": true, "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" }, "mappings": [], "thresholds": { @@ -253,27 +192,28 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, + "x": 6, "y": 9 }, - "id": 27, + "id": 24, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ - "mean" + "lastNotNull" ], "fields": "", "values": false }, - "showPercentChange": false, + "showPercentChange": true, "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -281,14 +221,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vm_cpu_cycles_hist_sum[10m])/rate(vm_cpu_cycles_hist_count[10m])", + "expr": "vm_exec_msg_counter_total{exported_instance=~\"${node}\"}", "instant": false, - "legendFormat": "__auto", + "legendFormat": "{{operation}}", "range": true, "refId": "A" } ], - "title": "Average CPU Cycles in VM execution [10min]", + "title": "Total Number of VM Exec Msg - Node: ${node}", "type": "stat" }, { @@ -337,6 +277,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -348,7 +289,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -356,7 +297,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(num_mempool_txs_hist_sum[10m])/rate(num_mempool_txs_count[10m])", + "expr": "sum(rate(num_mempool_txs_hist_sum[$__rate_interval]))/sum(rate(num_mempool_txs_hist_count[$__rate_interval]))", "instant": false, "legendFormat": "__auto", "range": true, @@ -401,6 +342,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + 
"percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -412,7 +354,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -420,7 +362,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(num_cached_txs_hist_sum[10m])/rate(num_cached_txs_count[10m])", + "expr": "sum(rate(num_cached_txs_hist_sum[$__rate_interval]))/sum(rate(num_cached_txs_hist_count[$__rate_interval]))", "instant": false, "legendFormat": "__auto", "range": true, @@ -476,9 +418,10 @@ "id": 12, "options": { "colorMode": "value", - "graphMode": "area", + "graphMode": "none", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -490,7 +433,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -498,14 +441,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(inbound_peers_hist_sum[10m])/rate(inbound_peers_hist_count[10m])", + "expr": "avg_over_time(inbound_peers_gauge{exported_instance=~\"${node}\"}[$__rate_interval])", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Average Inbound Peer Count", + "title": "Average Inbound Peer Count - Node: ${node}", "type": "stat" }, { @@ -542,9 +485,10 @@ "id": 13, "options": { "colorMode": "value", - "graphMode": "area", + "graphMode": "none", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -556,7 +500,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -564,14 +508,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(outbound_peers_hist_sum[10m])/rate(outbound_peers_hist_count[10m])", + "expr": 
"avg_over_time(outbound_peers_gauge{exported_instance=~\"${node}\"}[$__rate_interval])", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Average Outbound Peer Count", + "title": "Average Outbound Peer Count - Node: ${node}", "type": "stat" }, { @@ -608,9 +552,10 @@ "id": 14, "options": { "colorMode": "value", - "graphMode": "area", + "graphMode": "none", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -622,7 +567,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -630,14 +575,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(dialing_peers_hist_sum[10m])/rate(dialing_peers_hist_count[10m])", + "expr": "avg_over_time(dialing_peers_gauge{exported_instance=~\"${node}\"}[$__rate_interval])", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Average Dialing Peer Count", + "title": "Average Dialing Peer Count - Node: ${node}", "type": "stat" }, { @@ -687,6 +632,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -698,7 +644,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -706,7 +652,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(validator_count_hist_sum[10m])/rate(validator_count_hist_count[10m])", + "expr": "rate(validator_count_hist_sum{exported_instance=~\"${node}\"}[$__rate_interval])/rate(validator_count_hist_count{exported_instance=~\"${node}\"}[$__rate_interval])", "instant": false, "legendFormat": "__auto", "range": true, @@ -752,6 +698,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -763,7 +710,7 
@@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -771,7 +718,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(validator_vp_hist_sum[10m])/rate(validator_vp_hist_count[10m])", + "expr": "rate(validator_vp_hist_sum{exported_instance=~\"${node}\"}[$__rate_interval])/rate(validator_vp_hist_count{exported_instance=~\"${node}\"}[$__rate_interval])", "instant": false, "legendFormat": "__auto", "range": true, @@ -825,6 +772,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -836,7 +784,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -845,7 +793,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "rate(build_block_hist_milliseconds_sum[10m])/rate(build_block_hist_milliseconds_count[10m])", + "expr": "rate(build_block_hist_milliseconds_sum[$__rate_interval])/rate(build_block_hist_milliseconds_count[$__rate_interval])", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -855,7 +803,7 @@ "useBackend": false } ], - "title": "Average Block Build Time [10min]", + "title": "Average Block Build Time", "type": "stat" }, { @@ -897,6 +845,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -908,7 +857,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -917,7 +866,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "rate(block_interval_hist_seconds_sum[10m])/rate(block_interval_hist_seconds_count[10m])", + "expr": "sum(rate(block_interval_hist_seconds_sum[$__rate_interval]))/sum(rate(block_interval_hist_seconds_count[$__rate_interval]))", "fullMetaSearch": 
false, "includeNullMetadata": true, "instant": false, @@ -927,7 +876,7 @@ "useBackend": false } ], - "title": "Average Block Interval [10min]", + "title": "Average Block Interval", "type": "stat" }, { @@ -991,7 +940,7 @@ "reverse": false } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1010,7 +959,7 @@ "useBackend": false } ], - "title": "Average Block Tx Count [10min]", + "title": "Average Block Tx Count", "type": "heatmap" }, { @@ -1049,6 +998,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -1060,7 +1010,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1069,7 +1019,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "rate(block_size_hist_B_sum[10m])/rate(block_size_hist_B_count[10m])", + "expr": "sum(rate(block_size_hist_B_sum[$__rate_interval]))/sum(rate(block_size_hist_B_count[$__rate_interval]))", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -1079,7 +1029,7 @@ "useBackend": false } ], - "title": "Average Block Size [10min]", + "title": "Average Block Size", "type": "stat" }, { @@ -1136,6 +1086,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -1147,7 +1098,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1156,7 +1107,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "rate(broadcast_tx_hist_milliseconds_sum[10m])/rate(broadcast_tx_hist_milliseconds_count[10m])", + "expr": "sum(rate(broadcast_tx_hist_milliseconds_sum[$__rate_interval]))/sum(rate(broadcast_tx_hist_milliseconds_count[$__rate_interval]))", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, 
@@ -1166,7 +1117,7 @@ "useBackend": false } ], - "title": "Average Transaction Broadcast Duration [10min]", + "title": "Average Transaction Broadcast Duration", "type": "stat" }, { @@ -1227,6 +1178,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -1238,7 +1190,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1247,7 +1199,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "rate(http_request_time_hist_milliseconds_sum[10m])/rate(http_request_time_hist_milliseconds_count[10m])", + "expr": "rate(http_request_time_hist_milliseconds_sum[$__rate_interval])/rate(http_request_time_hist_milliseconds_count[$__rate_interval])", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -1258,7 +1210,7 @@ "useBackend": false } ], - "title": "Average HTTP Request Round Trip Time [10min]", + "title": "Average HTTP Request Round Trip Time", "type": "stat" }, { @@ -1306,6 +1258,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -1317,7 +1270,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1326,7 +1279,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "rate(ws_request_time_hist_milliseconds_sum[10m])/rate(ws_request_time_hist_milliseconds_count[10m])", + "expr": "rate(ws_request_time_hist_milliseconds_sum[$__rate_interval])/rate(ws_request_time_hist_milliseconds_count[$__rate_interval])", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -1337,7 +1290,7 @@ "useBackend": false } ], - "title": "Average WS Request Round Trip Time [10min]", + "title": "Average WS Request Round Trip Time", "type": "stat" } ], @@ -1345,16 +1298,72 @@ 
"schemaVersion": 39, "tags": [], "templating": { - "list": [] + "list": [ + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(exported_instance)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "node", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(exported_instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "m_addpkg", + "value": "m_addpkg" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(vm_gas_used_hist_sum,operation)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "operation", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(vm_gas_used_hist_sum,operation)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] }, "time": { - "from": "now-6h", + "from": "now-8h", "to": "now" }, "timepicker": {}, "timezone": "browser", - "title": "Gno Node Metrics", + "title": "Gno Open Telemetry Metrics", "uid": "bdl7d5yogxjb4b", "version": 1, "weekStart": "" -} \ No newline at end of file +} diff --git a/misc/telemetry/grafana/datasources.yaml b/misc/telemetry/grafana/provisioning/datasources/datasources.yaml similarity index 100% rename from misc/telemetry/grafana/datasources.yaml rename to misc/telemetry/grafana/provisioning/datasources/datasources.yaml diff --git a/misc/telemetry/supernova.Dockerfile b/misc/telemetry/supernova.Dockerfile deleted file mode 100644 index 67ccbda8047..00000000000 --- a/misc/telemetry/supernova.Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -FROM golang:1.22-alpine - -ARG supernova_version=latest - -RUN go install 
github.com/gnolang/supernova/cmd@$supernova_version && mv /go/bin/cmd /go/bin/supernova -RUN export SUPERNOVA_PATH=$(go list -m -f "{{.Dir}}" github.com/gnolang/supernova@${supernova_version}) && \ - mkdir -p /supernova && \ - cp -r $SUPERNOVA_PATH/* /supernova - -WORKDIR /supernova - -ENTRYPOINT ["supernova"] diff --git a/tm2/pkg/telemetry/config/config.go b/tm2/pkg/telemetry/config/config.go index 47fc5666342..d11eba15016 100644 --- a/tm2/pkg/telemetry/config/config.go +++ b/tm2/pkg/telemetry/config/config.go @@ -10,8 +10,8 @@ var errEndpointNotSet = errors.New("telemetry exporter endpoint not set") type Config struct { MetricsEnabled bool `json:"enabled" toml:"enabled"` MeterName string `json:"meter_name" toml:"meter_name"` - ServiceName string `json:"service_name" toml:"service_name"` - ServiceInstanceID string `json:"service_instance_id" toml:"service_instance_id" comment:"the ID helps to distinguish instances of the same service that exist at the same time (e.g. instances of a horizontally scaled service)"` + ServiceName string `json:"service_name" toml:"service_name" comment:"in Prometheus this is transformed into the label 'exported_job'"` + ServiceInstanceID string `json:"service_instance_id" toml:"service_instance_id" comment:"the ID helps to distinguish instances of the same service that exist at the same time (e.g. 
instances of a horizontally scaled service), in Prometheus this is transformed into the label 'exported_instance"` ExporterEndpoint string `json:"exporter_endpoint" toml:"exporter_endpoint" comment:"the endpoint to export metrics to, like a local OpenTelemetry collector"` } diff --git a/tm2/pkg/telemetry/metrics/metrics.go b/tm2/pkg/telemetry/metrics/metrics.go index 2b04769fe0c..7a3e182e06d 100644 --- a/tm2/pkg/telemetry/metrics/metrics.go +++ b/tm2/pkg/telemetry/metrics/metrics.go @@ -19,18 +19,16 @@ const ( broadcastTxTimerKey = "broadcast_tx_hist" buildBlockTimerKey = "build_block_hist" - inboundPeersKey = "inbound_peers_hist" - outboundPeersKey = "outbound_peers_hist" - dialingPeersKey = "dialing_peers_hist" + inboundPeersKey = "inbound_peers_gauge" + outboundPeersKey = "outbound_peers_gauge" + dialingPeersKey = "dialing_peers_gauge" numMempoolTxsKey = "num_mempool_txs_hist" numCachedTxsKey = "num_cached_txs_hist" - vmQueryCallsKey = "vm_query_calls_counter" - vmQueryErrorsKey = "vm_query_errors_counter" - vmGasUsedKey = "vm_gas_used_hist" - vmCPUCyclesKey = "vm_cpu_cycles_hist" - vmExecMsgKey = "vm_exec_msg_hist" + vmExecMsgKey = "vm_exec_msg_counter" + vmGasUsedKey = "vm_gas_used_hist" + vmCPUCyclesKey = "vm_cpu_cycles_hist" validatorCountKey = "validator_count_hist" validatorVotingPowerKey = "validator_vp_hist" @@ -51,13 +49,13 @@ var ( // Networking // // InboundPeers measures the active number of inbound peers - InboundPeers metric.Int64Histogram + InboundPeers metric.Int64Gauge // OutboundPeers measures the active number of outbound peers - OutboundPeers metric.Int64Histogram + OutboundPeers metric.Int64Gauge // DialingPeers measures the active number of peers in the dialing state - DialingPeers metric.Int64Histogram + DialingPeers metric.Int64Gauge // Mempool // @@ -69,11 +67,8 @@ var ( // Runtime // - // VMQueryCalls measures the frequency of VM query calls - VMQueryCalls metric.Int64Counter - - // VMQueryErrors measures the frequency of VM query 
errors - VMQueryErrors metric.Int64Counter + // VMExecMsgFrequency measures the frequency of VM operations + VMExecMsgFrequency metric.Int64Counter // VMGasUsed measures the VM gas usage VMGasUsed metric.Int64Histogram @@ -81,9 +76,6 @@ var ( // VMCPUCycles measures the VM CPU cycles VMCPUCycles metric.Int64Histogram - // VMExecMsgFrequency measures the frequency of VM operations - VMExecMsgFrequency metric.Int64Counter - // Consensus // // BuildBlockTimer measures the block build duration @@ -177,26 +169,32 @@ func Init(config config.Config) error { } // Networking // - if InboundPeers, err = meter.Int64Histogram( + if InboundPeers, err = meter.Int64Gauge( inboundPeersKey, metric.WithDescription("inbound peer count"), ); err != nil { return fmt.Errorf("unable to create histogram, %w", err) } + // Initialize InboundPeers Gauge + InboundPeers.Record(ctx, 0) - if OutboundPeers, err = meter.Int64Histogram( + if OutboundPeers, err = meter.Int64Gauge( outboundPeersKey, metric.WithDescription("outbound peer count"), ); err != nil { return fmt.Errorf("unable to create histogram, %w", err) } + // Initialize OutboundPeers Gauge + OutboundPeers.Record(ctx, 0) - if DialingPeers, err = meter.Int64Histogram( + if DialingPeers, err = meter.Int64Gauge( dialingPeersKey, metric.WithDescription("dialing peer count"), ); err != nil { return fmt.Errorf("unable to create histogram, %w", err) } + // Initialize DialingPeers Gauge + DialingPeers.Record(ctx, 0) // Mempool // if NumMempoolTxs, err = meter.Int64Histogram( @@ -214,16 +212,9 @@ func Init(config config.Config) error { } // Runtime // - if VMQueryCalls, err = meter.Int64Counter( - vmQueryCallsKey, - metric.WithDescription("vm query call frequency"), - ); err != nil { - return fmt.Errorf("unable to create counter, %w", err) - } - - if VMQueryErrors, err = meter.Int64Counter( - vmQueryErrorsKey, - metric.WithDescription("vm query errors call frequency"), + if VMExecMsgFrequency, err = meter.Int64Counter( + vmExecMsgKey, + 
metric.WithDescription("vm msg operation call frequency"), ); err != nil { return fmt.Errorf("unable to create counter, %w", err) } @@ -242,13 +233,6 @@ func Init(config config.Config) error { return fmt.Errorf("unable to create histogram, %w", err) } - if VMExecMsgFrequency, err = meter.Int64Counter( - vmExecMsgKey, - metric.WithDescription("vm msg operation call frequency"), - ); err != nil { - return fmt.Errorf("unable to create counter, %w", err) - } - // Consensus // if ValidatorsCount, err = meter.Int64Histogram( validatorCountKey,