add custom cpu stable

This commit is contained in:
Kevin Pecquet 2017-07-05 19:17:19 +02:00
parent 47bcedf331
commit 7aabbea18c
4 changed files with 72 additions and 16 deletions

26
monitors-custom-cpu.tf Normal file
View File

@ -0,0 +1,26 @@
# Per-host CPU monitor whose name, evaluation window and thresholds are all
# taken from the var.dd_custom_cpu map. The monitor is only created when
# var.dd_custom_cpu["status"] is "enabled" (count is 1), otherwise count is 0.
resource "datadog_monitor" "cpu_custom" {
  count = "${var.dd_custom_cpu["status"] == "enabled" ? 1 : 0}"

  name = "${var.dd_custom_cpu["name"]}"
  type = "query alert"

  # Alerts and their recoveries notify the critical escalation group;
  # warnings and their recoveries notify the warning escalation group.
  message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"

  # Minimum over the configured period of (system CPU + user CPU) per host,
  # compared against the critical threshold. Both halves of the sum must use
  # the exact same tag scope so they are evaluated over the same set of hosts.
  # NOTE(review): the scope excludes hosts tagged dd_custom_cpu:enabled, which
  # is the same exclusion the generic CPU monitors use. If this monitor is
  # meant to cover only the hosts opted into the custom CPU rule, the filter
  # should probably be "dd_custom_cpu:enabled" without the "!" - confirm.
  query = "min(${var.dd_custom_cpu["period"]}):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > ${var.dd_custom_cpu["critical_threshold"]}"

  # The critical value here must stay equal to the value compared in `query`;
  # both read var.dd_custom_cpu["critical_threshold"].
  thresholds = {
    warning  = "${var.dd_custom_cpu["warning_threshold"]}"
    critical = "${var.dd_custom_cpu["critical_threshold"]}"
  }

  # Each setting is declared exactly once so the effective value is unambiguous.
  notify_no_data      = false
  no_data_timeframe   = 20
  renotify_interval   = 60 # same value as the other CPU/disk monitors of this module
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}

View File

@ -1,7 +1,7 @@
resource "datadog_monitor" "cpu_80_15min" {
name = "CPU High > 80% for 15 min"
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
count = "${var.linux_basics == "enabled" ? 1 : 0}"
message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 80"
type = "query alert"
@ -21,11 +21,11 @@ resource "datadog_monitor" "cpu_80_15min" {
resource "datadog_monitor" "cpu_95_5min" {
name = "CPU High > 95% for 5 min"
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 95"
type = "query alert"
count = "${var.linux_basics == "enabled" ? 1 : 0}"
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
notify_no_data = false
renotify_interval = 60
@ -42,11 +42,11 @@ resource "datadog_monitor" "cpu_95_5min" {
resource "datadog_monitor" "datadog_free_disk_space_5" {
name = "Free disk space < 5%"
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"
type = "query alert"
count = "${var.linux_basics == "enabled" ? 1 : 0}"
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
notify_no_data = false
renotify_interval = 60
@ -63,11 +63,11 @@ resource "datadog_monitor" "datadog_free_disk_space_5" {
resource "datadog_monitor" "datadog_free_disk_space_10" {
name = "Free disk space < 10%"
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}"
message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"
query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"
type = "query alert"
count = "${var.linux_basics == "enabled" ? 1 : 0}"
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
thresholds {
warning = 20
@ -89,11 +89,11 @@ resource "datadog_monitor" "datadog_free_disk_space_10" {
resource "datadog_monitor" "datadog_free_disk_space_inodes_5" {
name = "Free disk inodes < 5%"
message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}"
message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group} \n{{/is_recovery}}"
query = "sum(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"
type = "query alert"
count = "${var.linux_basics == "enabled" ? 1 : 0}"
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
notify_no_data = false
renotify_interval = 60
@ -110,11 +110,11 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_5" {
resource "datadog_monitor" "datadog_free_disk_space_inodes_10" {
name = "Free disk inodes < 10%"
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}"
message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"
query = "max(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"
type = "query alert"
count = "${var.linux_basics == "enabled" ? 1 : 0}"
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
thresholds {
warning = 20
@ -140,7 +140,7 @@ resource "datadog_monitor" "datadog_cpu_load" {
query = "min(last_5m):avg:system.load.5{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2"
type = "query alert"
count = "${var.linux_basics == "enabled" ? 1 : 0}"
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
notify_no_data = false
renotify_interval = 60
@ -161,7 +161,7 @@ resource "datadog_monitor" "datadog_free_memory" {
query = "sum(last_1m):avg:system.mem.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} / avg:system.mem.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} * 100 < 5"
type = "query alert"
count = "${var.linux_basics == "enabled" ? 1 : 0}"
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
notify_no_data = false
renotify_interval = 60
@ -178,11 +178,11 @@ resource "datadog_monitor" "datadog_free_memory" {
resource "datadog_monitor" "datadog_host_unreachable" {
name = "Host unreachable"
message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}"
message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group} \n{{/is_recovery}}"
query = "\"datadog.agent.up\".over(\"dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled\").last(1).count_by_status()"
type = "service check"
count = "${var.linux_basics == "enabled" ? 1 : 0}"
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
notify_no_data = false
renotify_interval = 60

26
testing/inputs.tf Normal file
View File

@ -0,0 +1,26 @@
# Notification target used in the {{#is_alert}} / {{#is_recovery}} sections of
# the monitor messages. This variable takes over the role of the former
# hno_escalation_group, so the test fixture points it at the HNO dummy service.
variable "critical_escalation_group" {
  default = "@pagerduty_HNODummy"
}

# Notification target used in the {{#is_warning}} / {{#is_warning_recovery}}
# sections of the monitor messages. This variable takes over the role of the
# former ho_escalation_group, so the test fixture points it at the HO dummy
# service.
variable "warning_escalation_group" {
  default = "@pagerduty_HODummy"
}
# Datadog application key, consumed by the provider configuration.
# No default: the value has to be supplied by the caller.
variable "datadog_app_key" {
  type = "string"
}

# Datadog API key, consumed by the provider configuration.
# No default: the value has to be supplied by the caller.
variable "datadog_api_key" {
  type = "string"
}
# Switch for the generic Linux monitors: their `count` is 1 only when this
# variable equals "enabled", and 0 for any other value.
variable "dd_linux_basics" {
  type    = "string"
  default = "enabled"
}
# Settings for the custom CPU monitor, grouped in a single map:
#   status             - the monitor is created only when this is "enabled"
#   name               - monitor name
#   period             - evaluation window inserted into the query, e.g. "last_1h"
#   warning_threshold  - value of the monitor's warning threshold
#   critical_threshold - value of the critical threshold, also compared in the query
variable "dd_custom_cpu" {
  type = "map"

  default = {
    name               = "CPU High > 95% during 1 hour"
    status             = "enabled"
    period             = "last_1h"
    warning_threshold  = 90
    critical_threshold = 95
  }
}

4
testing/main.tf Normal file
View File

@ -0,0 +1,4 @@
# Datadog provider for the testing configuration; both credentials are read
# from input variables rather than being written in this file.
provider "datadog" {
  app_key = "${var.datadog_app_key}"
  api_key = "${var.datadog_api_key}"
}