terraform-datadog-old-monitors/monitors/monitors-linux-basics.log
2017-07-03 19:05:02 +02:00

34 lines
2.8 KiB
Plaintext

name = "CPU High > 80% for 15 min"
message = "{{#is_alert}}\n${var.alert_HNO}\n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_15m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 80"
name = "CPU High > 95% for 5 min"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_5m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 95"
name = "Free disk space < 5%"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5"
name = "Free disk space < 10%"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 10"
name = "Free disk space < 20%"
message = "${var.warning_HO}"
query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 20"
name = "Free disk inodes < 5%"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
query = "sum(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 5"
name = "Free disk inodes < 10%"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 10"
name = "Free disk inodes < 20%"
message = "{{#is_alert}}\n${var.warning_HO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.warning_HO} \n{{/is_recovery}}"
query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 20"
name = "CPU Load > 2"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
query = "min(last_5m):avg:system.load.5{*} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2"
name = "Free memory < 5%"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
query = "sum(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5"
name = "Host unreachable"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
query = "\"datadog.agent.up\".over(\"*\").last(1).count_by_status()"