34 lines
2.8 KiB
Plaintext
name = "CPU High > 80% for 15 min"
|
|
message = "{{#is_alert}}\n${var.alert_HNO}\n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
|
|
query = "min(last_15m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 80"
|
|
name = "CPU High > 95% for 5 min"
|
|
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
|
|
query = "min(last_5m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 95"
|
|
name = "Free disk space < 5%"
|
|
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
|
|
query = "max(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5"
|
|
name = "Free disk space < 10%"
|
|
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
|
|
query = "max(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 10"
|
|
name = "Free disk space < 20%"
|
|
message = "${var.warning_HO}"
|
|
query = "max(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 20"
|
|
name = "Free disk inodes < 5%"
|
|
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
|
|
query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 5"
|
|
name = "Free disk inodes < 10%"
|
|
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
|
|
query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 10"
|
|
name = "Free disk inodes < 20%"
|
|
message = "{{#is_alert}}\n${var.warning_HO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.warning_HO} \n{{/is_recovery}}"
|
|
query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 20"
|
|
name = "CPU Load > 2"
|
|
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
|
|
query = "min(last_5m):avg:system.load.5{*} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2"
|
|
name = "Free memory < 5%"
|
|
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
|
|
query = "max(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5"
|
|
name = "Host unreachable"
|
|
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
|
|
query = "\"datadog.agent.up\".over(\"*\").last(1).count_by_status()"
|