diff --git a/cmd/main.go b/cmd/main.go index b33fffdd..3138d66a 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -27,60 +27,61 @@ import ( "gopkg.in/src-d/go-git.v4/config" "gopkg.in/src-d/go-git.v4/plumbing" "gopkg.in/src-d/go-git.v4/storage/memory" + yaml "gopkg.in/yaml.v2" "io/ioutil" "net/http" "os" "path/filepath" "reflect" "strings" - yaml "gopkg.in/yaml.v2" "time" ) const ( // expect_basic_file_size is used to check file number in auto generated directory. - expect_basic_file_size = 17 - ALERT_FOR_CONFIG = "5m" + expect_basic_file_size = 17 + ALERT_FOR_CONFIG = "5m" ) var ( - lowest_version string - repository_url string - baseDir string + lowest_version string + repository_url string + baseDir string datasource_name = "tidb-cluster" //dashboards = []string{"binlog.json", "tidb.json", "overview.json", "tikv_details.json", "tikv_summary.json", "tikv_trouble_shooting.json", "pd.json", "tikv_pull.json"} dashboards = map[string]string{ - "binlog.json": "Test-Cluster-Binlog", - "tidb.json": "Test-Cluster-TiDB", - "overview.json": "Test-Cluster-Overview", - "tikv_details.json": "Test-Cluster-TiKV-Details", - "tikv_summary.json": "Test-Cluster-TiKV-Summary", + "binlog.json": "Test-Cluster-Binlog", + "tidb.json": "Test-Cluster-TiDB", + "overview.json": "Test-Cluster-Overview", + "tikv_details.json": "Test-Cluster-TiKV-Details", + "tikv_summary.json": "Test-Cluster-TiKV-Summary", "tikv_trouble_shooting.json": "Test-Cluster-TiKV-Trouble-Shooting", - "pd.json": "Test-Cluster-PD", - "tikv_pull.json": "Test-Cluster-TiKV", - "overview_pull.json": "Test-Cluster-Overview", - "lightning.json": "Test-Cluster-Lightning", + "pd.json": "Test-Cluster-PD", + "tikv_pull.json": "Test-Cluster-TiKV", + "overview_pull.json": "Test-Cluster-Overview", + "lightning.json": "Test-Cluster-Lightning", } - rules = []string{"tidb.rules.yml", "pd.rules.yml", "tikv-pull.rules.yml", "tikv.rules.yml", "binlog.rules.yml", "lightning.rules.yml"} + rules = []string{"tidb.rules.yml", "pd.rules.yml", "tikv-pull.rules.yml", "tikv.rules.yml", "binlog.rules.yml", "lightning.rules.yml"} overviewExlcudeItems = []string{"Services Port Status", "System Info"} - tikvExcludeItems = []string{"IO utilization"} + tikvExcludeItems = []string{"IO utilization"} //dockerfiles = []string{"Dockerfile", "init.sh"} - localFiles = map[string]string { - "datasource/k8s-datasource.yaml": "datasources", + localFiles = map[string]string{ + "datasource/k8s-datasource.yaml": "datasources", "datasource/tidb-cluster-datasource.yaml": "datasources", - "dashboards/pods/pods.json": "dashboards", - "dashboards/nodes/nodes.json": "dashboards", - "Dockerfile": ".", - "init.sh": ".", + "dashboards/pods/pods.json": "dashboards", + "dashboards/nodes/nodes.json": "dashboards", + "Dockerfile": ".", + "init.sh": ".", } - needToReplaceExpr = map[string]string { - strings.ToUpper("pd_cluster_low_space"): `(sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)`, + needToReplaceExpr = map[string]string{ + strings.ToUpper("pd_cluster_low_space"): `(sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)`, strings.ToUpper("pd_cluster_lost_connect_tikv_nums"): `(sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)`, - strings.ToUpper("pd_pending_peer_region_count"): `(sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0)`, + strings.ToUpper("pd_pending_peer_region_count"): `(sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0)`, + strings.ToUpper("pd_miss_peer_region_count"): `(sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0)`, } forConfig, configerr = model.ParseDuration(ALERT_FOR_CONFIG) @@ -95,8 +96,8 @@ func main() { }, } - rootCmd.Flags().StringVar(&baseDir,"path", ".", "the base directory of the program") - rootCmd.Flags().StringVar( &lowest_version,"lowest-version", "2.1.8", "the lowest tidb version") + rootCmd.Flags().StringVar(&baseDir, "path", ".", "the base directory of the program") + rootCmd.Flags().StringVar(&lowest_version, "lowest-version", "2.1.8", "the lowest tidb version") rootCmd.Flags().StringVar(&repository_url, "source-url", "https://raw.githubusercontent.com/pingcap/tidb-ansible", "the tidb monitor source address") rootCmd.MarkFlagRequired("path") rootCmd.Execute() @@ -117,11 +118,11 @@ func exportMonitorData() { stream.FromArray(refs).Filter(func(ref *plumbing.Reference) bool { return ref.Name().IsTag() - }).Map(func(ref *plumbing.Reference) string{ + }).Map(func(ref *plumbing.Reference) string { return ref.Name().Short() - }).Filter(func(tag string) bool{ + }).Filter(func(tag string) bool { return compareVersion(tag) - }).Map(func (tag string) string{ + }).Map(func(tag string) string { dir := fmt.Sprintf("%s%c%s", monitorDir, filepath.Separator, tag) fmt.Println("tagpath=" + tag) @@ -152,7 +153,7 @@ func exportMonitorData() { // fetchDashboard fetch dashboards from the source and replace some variables in the file. func fetchDashboard(tag string, baseDir string) { dir := fmt.Sprintf("%s%cdashboards", baseDir, filepath.Separator) - checkErr(os.MkdirAll(dir, os.ModePerm), "create dir failed, path=" + dir) + checkErr(os.MkdirAll(dir, os.ModePerm), "create dir failed, path="+dir) stream.FromMapEntries(dashboards).Each(func(entry stream.MapEntry) { dashboard := entry.Key.(reflect.Value).String() @@ -162,7 +163,7 @@ func fetchDashboard(tag string, baseDir string) { } // convertDashboardFileName convert file name -func convertDashboardFileName(dashboard string) string{ +func convertDashboardFileName(dashboard string) string { if strings.HasPrefix(dashboard, "overview") { return "overview.json" } @@ -173,7 +174,7 @@ func convertDashboardFileName(dashboard string) string{ // fetchRules fetch rules from the source func fetchRules(tag string, baseDir string) { dir := fmt.Sprintf("%s%crules", baseDir, filepath.Separator) - checkErr(os.MkdirAll(dir, os.ModePerm), "create dir failed, path=" + dir) + checkErr(os.MkdirAll(dir, os.ModePerm), "create dir failed, path="+dir) stream.FromArray(rules).Each(func(rule string) { body := fetchContent(fmt.Sprintf("%s/%s/roles/prometheus/files/%s", repository_url, tag, rule), tag, rule) @@ -188,7 +189,7 @@ func fetchRules(tag string, baseDir string) { }) } -func fetchContent(url string, tag string, fileName string) string { +func fetchContent(url string, tag string, fileName string) string { r, err := http.NewRequest("GET", url, nil) checkErr(err, "request body failed") @@ -218,24 +219,24 @@ func writeFile(baseDir string, fileName string, body string) { fn := fmt.Sprintf("%s%c%s", baseDir, filepath.Separator, fileName) f, err := os.Create(fn) - checkErr(err, "create file failed, f=" + fn) + checkErr(err, "create file failed, f="+fn) defer f.Close() if _, err := f.WriteString(body); err != nil { - checkErr(err, "write file failed, f=" + fn) + checkErr(err, "write file failed, f="+fn) } } -func filterDashboard(body string, dashboard string, title string) string{ +func filterDashboard(body string, dashboard string, title string) string { newStr := "" stream.Of(body).Filter(func(str string) bool { return str != "" - }).Map(func(str string) string{ + }).Map(func(str string) string { if dashboard != "overview.json" { return str } - stream.FromArray(overviewExlcudeItems).Each(func (item string) { + stream.FromArray(overviewExlcudeItems).Each(func(item string) { str = deleteOverviewItemFromDashboard(str, item) }) @@ -245,7 +246,7 @@ func filterDashboard(body string, dashboard string, title string) string{ return str } - stream.FromArray(tikvExcludeItems).Each(func (item string) { + stream.FromArray(tikvExcludeItems).Each(func(item string) { str = deleteTiKVItemFromDashboard(str, item) }) @@ -260,7 +261,7 @@ func filterDashboard(body string, dashboard string, title string) string{ } return str - }).Map(func (str string) string { + }).Map(func(str string) string { // replace links item if gjson.Get(str, "links").Exists() { newStr, err := sjson.Set(str, "links", []struct{}{}) @@ -269,35 +270,35 @@ func filterDashboard(body string, dashboard string, title string) string{ } return str - }).Map(func (str string) string { + }).Map(func(str string) string { // replace datasource name if gjson.Get(str, "__inputs").Exists() && gjson.Get(str, "__inputs.0.name").Exists() { datasource := gjson.Get(str, "__inputs.0.name").Str return strings.ReplaceAll(str, fmt.Sprintf("${%s}", datasource), datasource_name) } return str - }).Map(func(str string)string { + }).Map(func(str string) string { // delete input defination if gjson.Get(str, "__inputs").Exists() { newStr, err := sjson.Delete(str, "__inputs") - checkErr(err, "delete path failed") + checkErr(err, "delete path failed") return newStr } return str - }).Map(func (str string) string { + }).Map(func(str string) string { // unify the title name - newStr ,err := sjson.Set(str, "title", title) + newStr, err := sjson.Set(str, "title", title) checkErr(err, "replace title failed") return newStr - }).Each(func (str string) { + }).Each(func(str string) { newStr = str }) return newStr } -func deleteOverviewItemFromDashboard(source string, itemName string) string{ +func deleteOverviewItemFromDashboard(source string, itemName string) string { key := getRowsOrPannels(source) for index, r := range gjson.Get(source, key).Array() { @@ -312,7 +313,7 @@ func deleteOverviewItemFromDashboard(source string, itemName string) string{ func deleteTiKVItemFromDashboard(source string, itemName string) string { key := getRowsOrPannels(source) - for index, _ := range gjson.Get(source, key).Array() { + for index, _ := range gjson.Get(source, key).Array() { for index2, r2 := range gjson.Get(source, fmt.Sprintf("%s.%d.panels", key, index)).Array() { if r2.Map()["title"].Str == itemName { return deleteItem(source, fmt.Sprintf("%s.%d.panels.%d", key, index, index2)) @@ -380,7 +381,7 @@ func exist(path string) bool { } } -func replaceAlertExpr(content []byte) ([]byte, error){ +func replaceAlertExpr(content []byte) ([]byte, error) { var groups rulefmt.RuleGroups if err := yaml.UnmarshalStrict(content, &groups); err != nil { return nil, err @@ -390,11 +391,11 @@ func replaceAlertExpr(content []byte) ([]byte, error){ for _, group := range groups.Groups { newG := rulefmt.RuleGroup{ Interval: group.Interval, - Name: group.Name, - Rules: make([]rulefmt.Rule, len(group.Rules)), + Name: group.Name, + Rules: make([]rulefmt.Rule, len(group.Rules)), } - stream.FromArray(group.Rules).Map(func(rule rulefmt.Rule) rulefmt.Rule{ + stream.FromArray(group.Rules).Map(func(rule rulefmt.Rule) rulefmt.Rule { newExpr, ok := needToReplaceExpr[strings.ToUpper(rule.Alert)] if !ok { return rule @@ -416,4 +417,4 @@ func replaceAlertExpr(content []byte) ([]byte, error){ } return yaml.Marshal(newGS) -} \ No newline at end of file +} diff --git a/monitor/v2.1.10/rules/pd.rules.yml b/monitor/v2.1.10/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.10/rules/pd.rules.yml +++ b/monitor/v2.1.10/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v2.1.11/rules/pd.rules.yml b/monitor/v2.1.11/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.11/rules/pd.rules.yml +++ b/monitor/v2.1.11/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v2.1.12/rules/pd.rules.yml b/monitor/v2.1.12/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.12/rules/pd.rules.yml +++ b/monitor/v2.1.12/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v2.1.13/rules/pd.rules.yml b/monitor/v2.1.13/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.13/rules/pd.rules.yml +++ b/monitor/v2.1.13/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v2.1.14/rules/pd.rules.yml b/monitor/v2.1.14/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.14/rules/pd.rules.yml +++ b/monitor/v2.1.14/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v2.1.15/rules/pd.rules.yml b/monitor/v2.1.15/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.15/rules/pd.rules.yml +++ b/monitor/v2.1.15/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v2.1.16/rules/pd.rules.yml b/monitor/v2.1.16/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.16/rules/pd.rules.yml +++ b/monitor/v2.1.16/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v2.1.17/rules/pd.rules.yml b/monitor/v2.1.17/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.17/rules/pd.rules.yml +++ b/monitor/v2.1.17/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v2.1.18/rules/pd.rules.yml b/monitor/v2.1.18/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.18/rules/pd.rules.yml +++ b/monitor/v2.1.18/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v2.1.8/rules/pd.rules.yml b/monitor/v2.1.8/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.8/rules/pd.rules.yml +++ b/monitor/v2.1.8/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v2.1.9/rules/pd.rules.yml b/monitor/v2.1.9/rules/pd.rules.yml index d6650fff..d1b6f28d 100644 --- a/monitor/v2.1.9/rules/pd.rules.yml +++ b/monitor/v2.1.9/rules/pd.rules.yml @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 - for: 1m + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) + for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ diff --git a/monitor/v3.0.0-beta.1/rules/binlog.rules.yml b/monitor/v3.0.0-beta.1/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.0-beta.1/rules/binlog.rules.yml +++ b/monitor/v3.0.0-beta.1/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.0-beta.1/rules/lightning.rules.yml b/monitor/v3.0.0-beta.1/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.0-beta.1/rules/lightning.rules.yml +++ b/monitor/v3.0.0-beta.1/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.0-beta.1/rules/pd.rules.yml b/monitor/v3.0.0-beta.1/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.0-beta.1/rules/pd.rules.yml +++ b/monitor/v3.0.0-beta.1/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.0-beta.1/rules/tidb.rules.yml b/monitor/v3.0.0-beta.1/rules/tidb.rules.yml index 54769e41..01a825c8 100644 --- a/monitor/v3.0.0-beta.1/rules/tidb.rules.yml +++ b/monitor/v3.0.0-beta.1/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_backoff_count expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.0-beta.1/rules/tikv-pull.rules.yml b/monitor/v3.0.0-beta.1/rules/tikv-pull.rules.yml index 2dfeb2ee..c729752c 100644 --- a/monitor/v3.0.0-beta.1/rules/tikv-pull.rules.yml +++ b/monitor/v3.0.0-beta.1/rules/tikv-pull.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < @@ -31,7 +31,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -44,7 +44,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -56,7 +56,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -69,7 +69,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -83,7 +83,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -97,7 +97,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -111,7 +111,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -125,7 +125,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -139,7 +139,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -153,7 +153,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -167,7 +167,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -181,7 +181,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 0.9 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -194,7 +194,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -206,7 +206,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -219,7 +219,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -233,7 +233,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -246,7 +246,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -259,7 +259,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -273,7 +273,7 @@ groups: - alert: TiKV_thread_storage_scheduler_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) by (instance) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) @@ -286,7 +286,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -299,7 +299,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -311,7 +311,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -323,7 +323,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -337,7 +337,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -351,7 +351,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -365,7 +365,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -380,7 +380,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) diff --git a/monitor/v3.0.0-beta/rules/binlog.rules.yml b/monitor/v3.0.0-beta/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.0-beta/rules/binlog.rules.yml +++ b/monitor/v3.0.0-beta/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.0-beta/rules/lightning.rules.yml b/monitor/v3.0.0-beta/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.0-beta/rules/lightning.rules.yml +++ b/monitor/v3.0.0-beta/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.0-beta/rules/pd.rules.yml b/monitor/v3.0.0-beta/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.0-beta/rules/pd.rules.yml +++ b/monitor/v3.0.0-beta/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.0-beta/rules/tidb.rules.yml b/monitor/v3.0.0-beta/rules/tidb.rules.yml index 54769e41..01a825c8 100644 --- a/monitor/v3.0.0-beta/rules/tidb.rules.yml +++ b/monitor/v3.0.0-beta/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_backoff_count expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.0-beta/rules/tikv-pull.rules.yml b/monitor/v3.0.0-beta/rules/tikv-pull.rules.yml index 2dfeb2ee..c729752c 100644 --- a/monitor/v3.0.0-beta/rules/tikv-pull.rules.yml +++ b/monitor/v3.0.0-beta/rules/tikv-pull.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < @@ -31,7 +31,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -44,7 +44,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -56,7 +56,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -69,7 +69,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -83,7 +83,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -97,7 +97,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -111,7 +111,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -125,7 +125,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -139,7 +139,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -153,7 +153,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -167,7 +167,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -181,7 +181,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 0.9 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -194,7 +194,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -206,7 +206,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -219,7 +219,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -233,7 +233,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -246,7 +246,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -259,7 +259,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -273,7 +273,7 @@ groups: - alert: TiKV_thread_storage_scheduler_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) by (instance) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) @@ -286,7 +286,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -299,7 +299,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -311,7 +311,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -323,7 +323,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -337,7 +337,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -351,7 +351,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -365,7 +365,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -380,7 +380,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) diff --git a/monitor/v3.0.0-rc.1/rules/binlog.rules.yml b/monitor/v3.0.0-rc.1/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.0-rc.1/rules/binlog.rules.yml +++ b/monitor/v3.0.0-rc.1/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.0-rc.1/rules/lightning.rules.yml b/monitor/v3.0.0-rc.1/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.0-rc.1/rules/lightning.rules.yml +++ b/monitor/v3.0.0-rc.1/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.0-rc.1/rules/pd.rules.yml b/monitor/v3.0.0-rc.1/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.0-rc.1/rules/pd.rules.yml +++ b/monitor/v3.0.0-rc.1/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.0-rc.1/rules/tidb.rules.yml b/monitor/v3.0.0-rc.1/rules/tidb.rules.yml index 54769e41..01a825c8 100644 --- a/monitor/v3.0.0-rc.1/rules/tidb.rules.yml +++ b/monitor/v3.0.0-rc.1/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_backoff_count expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.0-rc.1/rules/tikv.rules.yml b/monitor/v3.0.0-rc.1/rules/tikv.rules.yml index 2dfeb2ee..c729752c 100644 --- a/monitor/v3.0.0-rc.1/rules/tikv.rules.yml +++ b/monitor/v3.0.0-rc.1/rules/tikv.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < @@ -31,7 +31,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -44,7 +44,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -56,7 +56,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -69,7 +69,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -83,7 +83,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -97,7 +97,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -111,7 +111,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -125,7 +125,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -139,7 +139,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -153,7 +153,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -167,7 +167,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -181,7 +181,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 0.9 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -194,7 +194,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -206,7 +206,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -219,7 +219,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -233,7 +233,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -246,7 +246,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -259,7 +259,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -273,7 +273,7 @@ groups: - alert: TiKV_thread_storage_scheduler_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) by (instance) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) @@ -286,7 +286,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -299,7 +299,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -311,7 +311,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -323,7 +323,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -337,7 +337,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -351,7 +351,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -365,7 +365,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -380,7 +380,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) diff --git a/monitor/v3.0.0-rc.2/rules/binlog.rules.yml b/monitor/v3.0.0-rc.2/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.0-rc.2/rules/binlog.rules.yml +++ b/monitor/v3.0.0-rc.2/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.0-rc.2/rules/lightning.rules.yml b/monitor/v3.0.0-rc.2/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.0-rc.2/rules/lightning.rules.yml +++ b/monitor/v3.0.0-rc.2/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.0-rc.2/rules/pd.rules.yml b/monitor/v3.0.0-rc.2/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.0-rc.2/rules/pd.rules.yml +++ b/monitor/v3.0.0-rc.2/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.0-rc.2/rules/tidb.rules.yml b/monitor/v3.0.0-rc.2/rules/tidb.rules.yml index 54769e41..01a825c8 100644 --- a/monitor/v3.0.0-rc.2/rules/tidb.rules.yml +++ b/monitor/v3.0.0-rc.2/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_backoff_count expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.0-rc.2/rules/tikv.rules.yml b/monitor/v3.0.0-rc.2/rules/tikv.rules.yml index 2dfeb2ee..c729752c 100644 --- a/monitor/v3.0.0-rc.2/rules/tikv.rules.yml +++ b/monitor/v3.0.0-rc.2/rules/tikv.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < @@ -31,7 +31,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -44,7 +44,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -56,7 +56,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -69,7 +69,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -83,7 +83,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -97,7 +97,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -111,7 +111,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -125,7 +125,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -139,7 +139,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -153,7 +153,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -167,7 +167,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -181,7 +181,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 0.9 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -194,7 +194,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -206,7 +206,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -219,7 +219,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -233,7 +233,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -246,7 +246,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -259,7 +259,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -273,7 +273,7 @@ groups: - alert: TiKV_thread_storage_scheduler_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) by (instance) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) @@ -286,7 +286,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -299,7 +299,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -311,7 +311,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -323,7 +323,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -337,7 +337,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -351,7 +351,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -365,7 +365,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -380,7 +380,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) diff --git a/monitor/v3.0.0-rc.3/rules/binlog.rules.yml b/monitor/v3.0.0-rc.3/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.0-rc.3/rules/binlog.rules.yml +++ b/monitor/v3.0.0-rc.3/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.0-rc.3/rules/lightning.rules.yml b/monitor/v3.0.0-rc.3/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.0-rc.3/rules/lightning.rules.yml +++ b/monitor/v3.0.0-rc.3/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.0-rc.3/rules/pd.rules.yml b/monitor/v3.0.0-rc.3/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.0-rc.3/rules/pd.rules.yml +++ b/monitor/v3.0.0-rc.3/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.0-rc.3/rules/tidb.rules.yml b/monitor/v3.0.0-rc.3/rules/tidb.rules.yml index 54769e41..01a825c8 100644 --- a/monitor/v3.0.0-rc.3/rules/tidb.rules.yml +++ b/monitor/v3.0.0-rc.3/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_backoff_count expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.0-rc.3/rules/tikv.rules.yml b/monitor/v3.0.0-rc.3/rules/tikv.rules.yml index 2dfeb2ee..c729752c 100644 --- a/monitor/v3.0.0-rc.3/rules/tikv.rules.yml +++ b/monitor/v3.0.0-rc.3/rules/tikv.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < @@ -31,7 +31,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -44,7 +44,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -56,7 +56,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -69,7 +69,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -83,7 +83,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -97,7 +97,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -111,7 +111,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -125,7 +125,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -139,7 +139,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -153,7 +153,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -167,7 +167,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -181,7 +181,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 0.9 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -194,7 +194,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -206,7 +206,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -219,7 +219,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -233,7 +233,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -246,7 +246,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -259,7 +259,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -273,7 +273,7 @@ groups: - alert: TiKV_thread_storage_scheduler_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) by (instance) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) @@ -286,7 +286,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -299,7 +299,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -311,7 +311,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -323,7 +323,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -337,7 +337,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -351,7 +351,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -365,7 +365,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -380,7 +380,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) diff --git a/monitor/v3.0.0/rules/binlog.rules.yml b/monitor/v3.0.0/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.0/rules/binlog.rules.yml +++ b/monitor/v3.0.0/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.0/rules/lightning.rules.yml b/monitor/v3.0.0/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.0/rules/lightning.rules.yml +++ b/monitor/v3.0.0/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.0/rules/pd.rules.yml b/monitor/v3.0.0/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.0/rules/pd.rules.yml +++ b/monitor/v3.0.0/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.0/rules/tidb.rules.yml b/monitor/v3.0.0/rules/tidb.rules.yml index 54769e41..01a825c8 100644 --- a/monitor/v3.0.0/rules/tidb.rules.yml +++ b/monitor/v3.0.0/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_backoff_count expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.0/rules/tikv.rules.yml b/monitor/v3.0.0/rules/tikv.rules.yml index 2dfeb2ee..c729752c 100644 --- a/monitor/v3.0.0/rules/tikv.rules.yml +++ b/monitor/v3.0.0/rules/tikv.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < @@ -31,7 +31,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -44,7 +44,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -56,7 +56,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -69,7 +69,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -83,7 +83,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -97,7 +97,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -111,7 +111,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -125,7 +125,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -139,7 +139,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -153,7 +153,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -167,7 +167,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -181,7 +181,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 0.9 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -194,7 +194,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -206,7 +206,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -219,7 +219,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -233,7 +233,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -246,7 +246,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -259,7 +259,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -273,7 +273,7 @@ groups: - alert: TiKV_thread_storage_scheduler_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) by (instance) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) @@ -286,7 +286,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -299,7 +299,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -311,7 +311,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -323,7 +323,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -337,7 +337,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -351,7 +351,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -365,7 +365,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -380,7 +380,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) diff --git a/monitor/v3.0.1/rules/binlog.rules.yml b/monitor/v3.0.1/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.1/rules/binlog.rules.yml +++ b/monitor/v3.0.1/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.1/rules/lightning.rules.yml b/monitor/v3.0.1/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.1/rules/lightning.rules.yml +++ b/monitor/v3.0.1/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.1/rules/pd.rules.yml b/monitor/v3.0.1/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.1/rules/pd.rules.yml +++ b/monitor/v3.0.1/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.1/rules/tidb.rules.yml b/monitor/v3.0.1/rules/tidb.rules.yml index 54769e41..01a825c8 100644 --- a/monitor/v3.0.1/rules/tidb.rules.yml +++ b/monitor/v3.0.1/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_backoff_count expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.1/rules/tikv.rules.yml b/monitor/v3.0.1/rules/tikv.rules.yml index 2dfeb2ee..c729752c 100644 --- a/monitor/v3.0.1/rules/tikv.rules.yml +++ b/monitor/v3.0.1/rules/tikv.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < @@ -31,7 +31,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -44,7 +44,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -56,7 +56,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -69,7 +69,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -83,7 +83,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -97,7 +97,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -111,7 +111,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -125,7 +125,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -139,7 +139,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -153,7 +153,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -167,7 +167,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -181,7 +181,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 0.9 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -194,7 +194,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -206,7 +206,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -219,7 +219,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -233,7 +233,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -246,7 +246,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -259,7 +259,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -273,7 +273,7 @@ groups: - alert: TiKV_thread_storage_scheduler_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) by (instance) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) @@ -286,7 +286,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -299,7 +299,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -311,7 +311,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -323,7 +323,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -337,7 +337,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -351,7 +351,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -365,7 +365,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -380,7 +380,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) diff --git a/monitor/v3.0.2/rules/binlog.rules.yml b/monitor/v3.0.2/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.2/rules/binlog.rules.yml +++ b/monitor/v3.0.2/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.2/rules/lightning.rules.yml b/monitor/v3.0.2/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.2/rules/lightning.rules.yml +++ b/monitor/v3.0.2/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.2/rules/pd.rules.yml b/monitor/v3.0.2/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.2/rules/pd.rules.yml +++ b/monitor/v3.0.2/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.2/rules/tidb.rules.yml b/monitor/v3.0.2/rules/tidb.rules.yml index 54769e41..01a825c8 100644 --- a/monitor/v3.0.2/rules/tidb.rules.yml +++ b/monitor/v3.0.2/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_backoff_count expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.2/rules/tikv.rules.yml b/monitor/v3.0.2/rules/tikv.rules.yml index 2dfeb2ee..c729752c 100644 --- a/monitor/v3.0.2/rules/tikv.rules.yml +++ b/monitor/v3.0.2/rules/tikv.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < @@ -31,7 +31,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -44,7 +44,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -56,7 +56,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -69,7 +69,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -83,7 +83,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -97,7 +97,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -111,7 +111,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -125,7 +125,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -139,7 +139,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -153,7 +153,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -167,7 +167,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -181,7 +181,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 0.9 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -194,7 +194,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -206,7 +206,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -219,7 +219,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -233,7 +233,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -246,7 +246,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -259,7 +259,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -273,7 +273,7 @@ groups: - alert: TiKV_thread_storage_scheduler_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) by (instance) > 0.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) @@ -286,7 +286,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -299,7 +299,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -311,7 +311,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -323,7 +323,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -337,7 +337,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -351,7 +351,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -365,7 +365,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -380,7 +380,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) diff --git a/monitor/v3.0.3/rules/binlog.rules.yml b/monitor/v3.0.3/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.3/rules/binlog.rules.yml +++ b/monitor/v3.0.3/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.3/rules/lightning.rules.yml b/monitor/v3.0.3/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.3/rules/lightning.rules.yml +++ b/monitor/v3.0.3/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.3/rules/pd.rules.yml b/monitor/v3.0.3/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.3/rules/pd.rules.yml +++ b/monitor/v3.0.3/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.3/rules/tidb.rules.yml b/monitor/v3.0.3/rules/tidb.rules.yml index 54769e41..01a825c8 100644 --- a/monitor/v3.0.3/rules/tidb.rules.yml +++ b/monitor/v3.0.3/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_server_event{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_backoff_count expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_count[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.3/rules/tikv.rules.yml b/monitor/v3.0.3/rules/tikv.rules.yml index 007c0160..5340d8ba 100644 --- a/monitor/v3.0.3/rules/tikv.rules.yml +++ b/monitor/v3.0.3/rules/tikv.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < @@ -31,7 +31,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -44,7 +44,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -56,7 +56,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -69,7 +69,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -83,7 +83,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -97,7 +97,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -111,7 +111,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -125,7 +125,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 1.6 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -139,7 +139,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -153,7 +153,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -167,7 +167,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -181,7 +181,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 1.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -194,7 +194,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -206,7 +206,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -219,7 +219,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -233,7 +233,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -246,7 +246,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -259,7 +259,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -272,7 +272,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -285,7 +285,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -297,7 +297,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -309,7 +309,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -323,7 +323,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -337,7 +337,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -351,7 +351,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -366,7 +366,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) diff --git a/monitor/v3.0.4/rules/binlog.rules.yml b/monitor/v3.0.4/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.4/rules/binlog.rules.yml +++ b/monitor/v3.0.4/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.4/rules/lightning.rules.yml b/monitor/v3.0.4/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.4/rules/lightning.rules.yml +++ b/monitor/v3.0.4/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.4/rules/pd.rules.yml b/monitor/v3.0.4/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.4/rules/pd.rules.yml +++ b/monitor/v3.0.4/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.4/rules/tidb.rules.yml b/monitor/v3.0.4/rules/tidb.rules.yml index 694dca3d..5f296d65 100644 --- a/monitor/v3.0.4/rules/tidb.rules.yml +++ b/monitor/v3.0.4/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_event_total{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_event_total{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: tidb_tikvclient_backoff_total expr: increase( tidb_tikvclient_backoff_total[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_total[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.4/rules/tikv.rules.yml b/monitor/v3.0.4/rules/tikv.rules.yml index 007c0160..5340d8ba 100644 --- a/monitor/v3.0.4/rules/tikv.rules.yml +++ b/monitor/v3.0.4/rules/tikv.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < @@ -31,7 +31,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -44,7 +44,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -56,7 +56,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -69,7 +69,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -83,7 +83,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -97,7 +97,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -111,7 +111,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -125,7 +125,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 1.6 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -139,7 +139,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -153,7 +153,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -167,7 +167,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -181,7 +181,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 1.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -194,7 +194,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -206,7 +206,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -219,7 +219,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -233,7 +233,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -246,7 +246,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -259,7 +259,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -272,7 +272,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -285,7 +285,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -297,7 +297,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -309,7 +309,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -323,7 +323,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -337,7 +337,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -351,7 +351,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -366,7 +366,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) diff --git a/monitor/v3.0.5/rules/binlog.rules.yml b/monitor/v3.0.5/rules/binlog.rules.yml index 6f6f841d..e50ef483 100644 --- a/monitor/v3.0.5/rules/binlog.rules.yml +++ b/monitor/v3.0.5/rules/binlog.rules.yml @@ -3,7 +3,6 @@ groups: rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_pump_storage_error_count[1m]) > 0 @@ -15,7 +14,7 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 @@ -28,7 +27,7 @@ groups: - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) @@ -41,7 +40,7 @@ groups: - alert: binlog_pump_storage_write_binlog_duration_time_bucket expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) @@ -54,7 +53,7 @@ groups: - alert: binlog_pump_storage_available_size_less_than_20G expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024 - for: 5m + for: 10s labels: env: ENV_LABELS_ENV expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * @@ -68,7 +67,7 @@ groups: - alert: binlog_drainer_execute_duration_time_more_than_10s expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) @@ -81,7 +80,6 @@ groups: value: '{{ $value }}' - alert: binlog_drainer_checkpoint_tso_no_change_for_1m expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 - for: 5m labels: env: ENV_LABELS_ENV expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1 diff --git a/monitor/v3.0.5/rules/lightning.rules.yml b/monitor/v3.0.5/rules/lightning.rules.yml index 24dd41eb..ffba2e13 100644 --- a/monitor/v3.0.5/rules/lightning.rules.yml +++ b/monitor/v3.0.5/rules/lightning.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( lightning_tables{result="failure"} ) > 0 diff --git a/monitor/v3.0.5/rules/pd.rules.yml b/monitor/v3.0.5/rules/pd.rules.yml index b2871786..0a2c7cdd 100644 --- a/monitor/v3.0.5/rules/pd.rules.yml +++ b/monitor/v3.0.5/rules/pd.rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: PD_cluster_offline_tikv_nums expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0 @@ -16,7 +16,7 @@ groups: - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) @@ -28,11 +28,13 @@ groups: summary: PD_etcd_write_disk_latency value: '{{ $value }}' - alert: PD_miss_peer_region_count - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 5m labels: env: ENV_LABELS_ENV - expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100 + expr: (sum( pd_regions_status{type="miss_peer_region_count"} ) by (instance) > + 100) and (sum(etcd_server_is_leader) by (instance) > 0) level: critical annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ @@ -70,7 +72,7 @@ groups: - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) @@ -84,7 +86,7 @@ groups: - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) @@ -97,7 +99,7 @@ groups: value: '{{ $value }}' - alert: PD_down_peer_region_nums expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0 @@ -109,7 +111,7 @@ groups: value: '{{ $value }}' - alert: PD_incorrect_namespace_region_count expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 0 @@ -135,7 +137,7 @@ groups: value: '{{ $value }}' - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 @@ -148,7 +150,7 @@ groups: - alert: TiKV_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) @@ -161,7 +163,7 @@ groups: value: '{{ $value }}' - alert: PD_system_time_slow expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1 @@ -174,7 +176,7 @@ groups: - alert: PD_no_store_for_making_replica expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) diff --git a/monitor/v3.0.5/rules/tidb.rules.yml b/monitor/v3.0.5/rules/tidb.rules.yml index 694dca3d..5f296d65 100644 --- a/monitor/v3.0.5/rules/tidb.rules.yml +++ b/monitor/v3.0.5/rules/tidb.rules.yml @@ -4,7 +4,7 @@ groups: - alert: TiDB_schema_error expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_region_err_total expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000 @@ -29,7 +29,7 @@ groups: value: '{{ $value }}' - alert: TiDB_domain_load_schema_total expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10 @@ -41,7 +41,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_keep_alive expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100 @@ -53,7 +53,7 @@ groups: value: '{{ $value }}' - alert: TiDB_server_panic_total expr: increase(tidb_server_panic_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_panic_total[10m]) > 0 @@ -65,7 +65,7 @@ groups: value: '{{ $value }}' - alert: TiDB_memory_abnormal expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10 @@ -78,7 +78,7 @@ groups: - alert: TiDB_query_duration expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) @@ -92,7 +92,7 @@ groups: - alert: TiDB_server_event_error expr: increase(tidb_server_event_total{type=~"server_start|server_hang"}[15m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_server_event_total{type=~"server_start|server_hang"}[15m]) > @@ -105,7 +105,7 @@ groups: value: '{{ $value }}' - alert: tidb_tikvclient_backoff_total expr: increase( tidb_tikvclient_backoff_total[10m] ) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase( tidb_tikvclient_backoff_total[10m] ) > 10 @@ -117,7 +117,7 @@ groups: value: '{{ $value }}' - alert: TiDB_monitor_time_jump_back_error expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0 @@ -129,7 +129,7 @@ groups: value: '{{ $value }}' - alert: TiDB_ddl_waiting_jobs expr: sum(tidb_ddl_waiting_jobs) > 5 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tidb_ddl_waiting_jobs) > 5 diff --git a/monitor/v3.0.5/rules/tikv.rules.yml b/monitor/v3.0.5/rules/tikv.rules.yml index 20986a01..f458f276 100644 --- a/monitor/v3.0.5/rules/tikv.rules.yml +++ b/monitor/v3.0.5/rules/tikv.rules.yml @@ -17,7 +17,7 @@ groups: value: '{{ $value }}' - alert: TiKV_GC_can_not_work expr: sum(increase(tikv_gcworker_gc_tasks_vec{task="gc"}[1d])) < 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tikv_gcworker_gc_tasks_vec{task="gc"}[1d])) < 1 @@ -30,7 +30,7 @@ groups: - alert: TiKV_server_report_failure_msg_total expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) @@ -43,7 +43,7 @@ groups: value: '{{ $value }}' - alert: TiKV_channel_full_total expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0 @@ -55,7 +55,7 @@ groups: value: '{{ $value }}' - alert: TiKV_write_stall expr: delta( tikv_engine_write_stall[10m]) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_engine_write_stall[10m]) > 0 @@ -68,7 +68,7 @@ groups: - alert: TiKV_raft_log_lag expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) @@ -82,7 +82,7 @@ groups: - alert: TiKV_async_request_snapshot_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) @@ -96,7 +96,7 @@ groups: - alert: TiKV_async_request_write_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) @@ -110,7 +110,7 @@ groups: - alert: TiKV_coprocessor_request_wait_seconds expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) @@ -124,7 +124,7 @@ groups: - alert: TiKV_raftstore_thread_cpu_seconds_total expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance) > 1.6 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by @@ -138,7 +138,7 @@ groups: - alert: TiKV_raft_append_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) @@ -152,7 +152,7 @@ groups: - alert: TiKV_raft_apply_log_duration_secs expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) @@ -166,7 +166,7 @@ groups: - alert: TiKV_scheduler_latch_wait_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) @@ -180,7 +180,7 @@ groups: - alert: TiKV_thread_apply_worker_cpu_seconds expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 1.8 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) @@ -193,7 +193,7 @@ groups: value: '{{ $value }}' - alert: TiDB_tikvclient_gc_action_fail expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10 @@ -205,7 +205,7 @@ groups: value: '{{ $value }}' - alert: TiKV_leader_drops expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10 @@ -218,7 +218,7 @@ groups: - alert: TiKV_raft_process_ready_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) @@ -232,7 +232,7 @@ groups: - alert: TiKV_raft_process_tick_duration_secs expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) @@ -245,7 +245,7 @@ groups: value: '{{ $value }}' - alert: TiKV_scheduler_context_total expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000 @@ -258,7 +258,7 @@ groups: - alert: TiKV_scheduler_command_duration_seconds expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) @@ -271,7 +271,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_outdated_request_wait_seconds expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > @@ -284,7 +284,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_error expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100 @@ -296,7 +296,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_request_lock_error expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000 @@ -308,7 +308,7 @@ groups: value: '{{ $value }}' - alert: TiKV_coprocessor_pending_request expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: delta( tikv_coprocessor_pending_request[10m]) > 5000 @@ -322,7 +322,7 @@ groups: expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) @@ -336,7 +336,7 @@ groups: value: '{{ $value }}' - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 @@ -350,7 +350,7 @@ groups: expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) @@ -365,7 +365,7 @@ groups: - alert: TiKV_approximate_region_size expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824 - for: 5m + for: 1m labels: env: ENV_LABELS_ENV expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m]))