From a141f6d5a0bbbd60ef1749d896ff66e426ddf99e Mon Sep 17 00:00:00 2001 From: Kevin Pecquet Date: Mon, 3 Jul 2017 19:05:02 +0200 Subject: [PATCH] Init datadog monitors repository --- inputs.tf | 9 ++ modules.tf | 1 + monitors/monitors-apache-nat.tf | 44 ++++++ monitors/monitors-cassandra.tf | 38 +++++ monitors/monitors-gcp-cloud-sql.tf | 101 +++++++++++++ monitors/monitors-gcp-lb.tf | 87 +++++++++++ monitors/monitors-gcp-vpn.tf | 23 +++ monitors/monitors-kubernetes.tf | 129 ++++++++++++++++ monitors/monitors-linux-basics.log | 33 ++++ monitors/monitors-linux-basics.tf | 210 ++++++++++++++++++++++++++ monitors/monitors-redis-containers.tf | 74 +++++++++ monitors/monitors-redis.tf | 87 +++++++++++ outputs.tf | 5 + resources.sample | 4 + users-datadog.sample | 7 + 15 files changed, 852 insertions(+) create mode 100644 inputs.tf create mode 100644 modules.tf create mode 100644 monitors/monitors-apache-nat.tf create mode 100644 monitors/monitors-cassandra.tf create mode 100644 monitors/monitors-gcp-cloud-sql.tf create mode 100644 monitors/monitors-gcp-lb.tf create mode 100644 monitors/monitors-gcp-vpn.tf create mode 100644 monitors/monitors-kubernetes.tf create mode 100644 monitors/monitors-linux-basics.log create mode 100644 monitors/monitors-linux-basics.tf create mode 100644 monitors/monitors-redis-containers.tf create mode 100644 monitors/monitors-redis.tf create mode 100644 outputs.tf create mode 100644 resources.sample create mode 100644 users-datadog.sample diff --git a/inputs.tf b/inputs.tf new file mode 100644 index 0000000..18c41f6 --- /dev/null +++ b/inputs.tf @@ -0,0 +1,9 @@ +variable "warning_HO" { + type = "string" +} + +variable "alert_HNO" { + type = "string" +} + +variable "project_id" {} diff --git a/modules.tf b/modules.tf new file mode 100644 index 0000000..f9bf1ab --- /dev/null +++ b/modules.tf @@ -0,0 +1 @@ +#TODO diff --git a/monitors/monitors-apache-nat.tf b/monitors/monitors-apache-nat.tf new file mode 100644 index 0000000..7fda5b7 --- /dev/null 
+++ b/monitors/monitors-apache-nat.tf @@ -0,0 +1,44 @@ +resource "datadog_monitor" "apache_worker_nat" { + name = "Apache proxy busy worker > 99% on nat" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + + query = "avg(last_10m):avg:apache.performance.busy_workers{*} by {host} / ( avg:apache.performance.idle_workers{*} by {host} + avg:apache.performance.busy_workers{*} by {host} ) > 0.99" + type = "query alert" + + thresholds { + warning = 0.95 + critical = 0.99 + } + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "apache_process_nat" { + name = "Apache proxy is down on nat" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + + query = "\"apache.can_connect\".over(\"*\").by(\"host\",\"port\").last(1).count_by_status()" + type = "service check" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} diff --git a/monitors/monitors-cassandra.tf b/monitors/monitors-cassandra.tf new file mode 100644 index 0000000..2911778 --- /dev/null +++ b/monitors/monitors-cassandra.tf @@ -0,0 +1,38 @@ +resource "datadog_monitor" "datadog_cassandra_down" { + name = "Cassandra service is down" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} 
\n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "\"cassandra.can_connect\".over(\"cassandra-node\").by(\"host\",\"instance\").last(2).count_by_status()" + type = "service check" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +/*resource "datadog_monitor" "datadog_rule_27" { +name = "OpsCenter process is down" +message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n${var.warning_HO}" +query = "\"process.up\".over(\"opscenter\",process:\"opscenter\").last(1).count_by_status()" +type = "service check" + +notify_no_data = false +renotify_interval = 60 +notify_audit = false +timeout_h = 0 +include_tags = true +locked = false +require_full_window = true +new_host_delay = 300 +notify_no_data = false +renotify_interval = 0 +no_data_timeframe = 20 + +}*/ diff --git a/monitors/monitors-gcp-cloud-sql.tf b/monitors/monitors-gcp-cloud-sql.tf new file mode 100644 index 0000000..f3493a1 --- /dev/null +++ b/monitors/monitors-gcp-cloud-sql.tf @@ -0,0 +1,101 @@ +resource "datadog_monitor" "cloud_sql_cpu_90" { + name = "Cloud SQL CPU high > 90%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "avg(last_5m):avg:gcp.cloudsql.database.cpu.utilization{project_id:${var.project_id}} >= 90" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "cloud_sql_disk_space" { + name = "Cloud SQL free disk space < 10%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} 
\n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + query = "avg(last_5m):avg:gcp.cloudsql.database.disk.bytes_used{project_id:${var.project_id}} by {database_id} / avg:gcp.cloudsql.database.disk.quota{project_id:${var.project_id}} by {database_id} * 100 >= 90" + + thresholds { + warning = 70 + critical = 90 + } + + type = "query alert" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "cloud_sql_connection_80" { +name = "Cloud SQL MySQL connection > 80% of max connections" +message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" +query = "avg(last_5m):avg:gcp.cloudsql.database.network.connections{*} > 3500" +type = "metric alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "cloud_sql_lag" { +name = "Cloud SQL MySQL lag > 45min" +message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" +query = "min(last_10m):avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{*} by {database_id} > 2700" +type = "metric alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 + +} + +resource "datadog_monitor" "cloud_sql_replication" { +name = 
"Cloud SQL Failover not ready to replication" +message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" +query = "max(last_5m):avg:gcp.cloudsql.database.mysql.replication.available_for_failover{*} <= 0" +type = "metric alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 + +} diff --git a/monitors/monitors-gcp-lb.tf b/monitors/monitors-gcp-lb.tf new file mode 100644 index 0000000..71e7593 --- /dev/null +++ b/monitors/monitors-gcp-lb.tf @@ -0,0 +1,87 @@ +resource "datadog_monitor" "datadog_gcp_lb_request_count" { + name = "GCP LoadBalancer request count changed too fast" + message = "" + query = "change(sum(last_5m),last_30m):avg:gcp.loadbalancing.http.request_count{*} by {backend_name}.as_count() >= 300" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 + +} + + +resource "datadog_monitor" "datadog_gcp_lb_500" { + name = "GCP LoadBalancer 500 ratio > 5%" + message = "" + query = "sum(last_10m):avg:gcp.loadbalancing.http.request_count{response_code_class:500} by {backend_name}.as_count() / avg:gcp.loadbalancing.http.request_count{*} by {backend_name}.as_count() > 0.2" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 + +} + + +resource "datadog_monitor" "datadog_gcp_lb_backend_latency" { + name = "GCP LB backend latency > 2s" + message = "" + query = 
"min(last_5m):avg:gcp.loadbalancing.http.backend_latencies.avg{*} by {backend_name} > 2000" + type = "metric alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 + +} + + +resource "datadog_monitor" "datadog_gcp_lb_latency" { + name = "GCP LB latency > 5s" + message = "" + query = "avg(last_5m):avg:gcp.loadbalancing.http.total_latencies.avg{*} > 5000" + type = "query alert" + + thresholds { + warning = 3000 + critical = 5000 + } + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 + +} diff --git a/monitors/monitors-gcp-vpn.tf b/monitors/monitors-gcp-vpn.tf new file mode 100644 index 0000000..f7fe6c6 --- /dev/null +++ b/monitors/monitors-gcp-vpn.tf @@ -0,0 +1,23 @@ +resource "datadog_monitor" "gcp_vpn" { + name = "GCP VPN is down" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + + query = "avg(last_5m):avg:gcp.vpn.tunnel_established{*} <= 0" + + thresholds { + critical = 0 + } + + type = "metric alert" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} diff --git a/monitors/monitors-kubernetes.tf b/monitors/monitors-kubernetes.tf new file mode 100644 index 0000000..3fba84b --- /dev/null +++ b/monitors/monitors-kubernetes.tf @@ -0,0 +1,129 @@ +resource 
"datadog_monitor" "kubernetes_cluster_cpu" { + name = "Kubernetes cluster CPU High > 85%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + query = "avg(last_5m):avg:system.cpu.system{*} by {cluster-name} + avg:system.cpu.user{*} by {cluster-name} > 85" + + thresholds { + warning = 75 + critical = 85 + } + + type = "query alert" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "kubernetes_kubelet_check" { + name = "Kubernetes kubelet check down" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + query = "\"kubernetes.kubelet.check\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()" + + thresholds { + warning = 0 + critical = 10 + } + + type = "service check" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "kubernetes_kubelet_ping" { + name = "Kubernetes kubelet ping not ok" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} 
\n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + query = "\"kubernetes.kubelet.check.ping\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()" + + thresholds { + warning = 0 + critical = 10 + } + + type = "service check" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "kubernetes_pods_unavailable" { + name = "Kubernetes pods unavailable" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + query = "max(last_5m):avg:kubernetes_state.deployment.replicas_desired{!namespace:cronetes} by {cluster-name,namespace,deployment} - avg:kubernetes_state.deployment.replicas_unavailable{!namespace:cronetes} by {cluster-name,namespace,deployment} + 1 < 1" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "kubernetes_node_status" { + name = "Kubernetes node status" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + query = "max(last_5m):avg:kubernetes_state.node.status{!namespace:cronetes} by {cluster-name,namespace,deployment} <= 0" + type = "metric alert" + + 
   notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} diff --git a/monitors/monitors-linux-basics.log b/monitors/monitors-linux-basics.log new file mode 100644 index 0000000..4d93564 --- /dev/null +++ b/monitors/monitors-linux-basics.log @@ -0,0 +1,33 @@ + name = "CPU High > 80% for 15 min" + message = "{{#is_alert}}\n${var.alert_HNO}\n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "min(last_15m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 80" + name = "CPU High > 95% for 5 min" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "min(last_5m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 95" + name = "Free disk space < 5%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5" + name = "Free disk space < 10%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 10" + name = "Free disk space < 20%" + message = "${var.warning_HO}" + query = 
"sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 20" + name = "Free disk inodes < 5%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" + query = "sum(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 5" + name = "Free disk inodes < 10%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" + query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 10" + name = "Free disk inodes < 20%" + message = "{{#is_alert}}\n${var.warning_HO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.warning_HO} \n{{/is_recovery}}" + query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 20" + name = "CPU Load > 2" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" + query = "min(last_5m):avg:system.load.5{*} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2" + name = "Free memory < 5%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" + query = "sum(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5" + name = "Host unreachable" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" + query = "\"datadog.agent.up\".over(\"*\").last(1).count_by_status()" diff --git a/monitors/monitors-linux-basics.tf b/monitors/monitors-linux-basics.tf new file mode 100644 index 0000000..20c5a9c --- /dev/null +++ b/monitors/monitors-linux-basics.tf @@ -0,0 +1,210 @@ +resource "datadog_monitor" "cpu_80_15min" { + name = "CPU High > 80% for 15 min" + message = 
"{{#is_alert}}\n${var.alert_HNO}\n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + + query = "min(last_15m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 80" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "cpu_95_5min" { + name = "CPU High > 95% for 5 min" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + + query = "min(last_5m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 95" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_free_disk_space_5" { + name = "Free disk space < 5%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_free_disk_space_10" { + name = "Free disk space < 10%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "sum(last_5m):avg:system.disk.free{*} by 
{host,device} / avg:system.disk.total{*} by {host,device} * 100 < 10" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_free_disk_space_20" { + name = "Free disk space < 20%" + message = "${var.warning_HO}" + query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 20" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_free_disk_space_inodes_5" { + name = "Free disk inodes < 5%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" + query = "sum(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 5" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_free_disk_space_inodes_10" { + name = "Free disk inodes < 10%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" + query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 10" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + 
locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_free_disk_space_inodes_20" { + name = "Free disk inodes < 20%" + message = "{{#is_alert}}\n${var.warning_HO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.warning_HO} \n{{/is_recovery}}" + query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 20" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_cpu_load" { + name = "CPU Load > 2" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" + query = "min(last_5m):avg:system.load.5{*} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_free_memory" { + name = "Free memory < 5%" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" + query = "sum(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5" + type = "query alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" 
"datadog_host_unreachable" { + name = "Host unreachable" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" + query = "\"datadog.agent.up\".over(\"*\").last(1).count_by_status()" + type = "service check" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = true + renotify_interval = 0 + no_data_timeframe = 20 +} diff --git a/monitors/monitors-redis-containers.tf b/monitors/monitors-redis-containers.tf new file mode 100644 index 0000000..f5bab16 --- /dev/null +++ b/monitors/monitors-redis-containers.tf @@ -0,0 +1,74 @@ +resource "datadog_monitor" "kubernetes_redis_cpu_95_5min" { + name = "Kubernetes Redis container CPU High > 95% for 5 min" + #message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "avg(last_5m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 95" + + thresholds { +# warning = 80 + critical = 95 + } + + type = "query alert" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" { + name = "Kubernetes Redis container CPU High > 80% for 15 min" + #message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} 
\n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + + query = "min(last_15m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 80" + type = "query alert" + + thresholds { +# warning = 75 + critical = 80 + } + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + +# resource "datadog_monitor" "kubernetes_redis_oom" { +# name = "Kubernetes Redis container out of memory > 85%" +# message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n${var.warning_HO}" +# query = "avg(last_5m):avg:gcp.container.memory.bytes_used{container_name:redis} by {cluster-name} / avg:gcp.container.memory.bytes_total{container_name:redis} by {cluster-name} > 85" +# +# thresholds { +# warning = 70 +# critical = 85 +# } +# +# type = "query alert" +# notify_no_data = false +# renotify_interval = 60 +# notify_audit = false +# timeout_h = 0 +# include_tags = true +# locked = false +# require_full_window = true +# new_host_delay = 300 +# notify_no_data = false +# renotify_interval = 0 +# no_data_timeframe = 20 +# } diff --git a/monitors/monitors-redis.tf b/monitors/monitors-redis.tf new file mode 100644 index 0000000..eacd337 --- /dev/null +++ b/monitors/monitors-redis.tf @@ -0,0 +1,87 @@ +resource "datadog_monitor" "redis_connection" { + name = "Redis connection is down (Datadog check)" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "\"redis.can_connect\".over(\"app:redis\").by(\"*\").last(1).pct_by_status()" + + thresholds { + critical = 50 + warning = 5 + } + + type = "service check" + notify_no_data = false + renotify_interval = 60 + 
notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} + + + +resource "datadog_monitor" "redis_eviction" { +name = "Redis eviction > 0" +message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" +query = "min(last_5m):avg:redis.keys.evicted{*} > 0" +type = "metric alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 + +} + + +resource "datadog_monitor" "datadog_blocked_client" { +name = "Redis blocked clients > 0" +message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" +query = "min(last_5m):avg:redis.clients.blocked{*} > 0" +type = "metric alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 + +} + + +resource "datadog_monitor" "redis_swap" { +name = "Redis begin to swap" +message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" +query = "avg(last_5m):avg:redis.mem.fragmentation_ratio{*} <= 0.8" +type = "metric alert" + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 + +} diff --git a/outputs.tf b/outputs.tf new file mode 100644 index 0000000..0f80405 --- /dev/null +++ b/outputs.tf @@ -0,0 +1,5 @@ +output service_accounts_emails { 
+ value = { + monitoring_datadog = "${google_service_account.monitoring_datadog.email}" + } +} diff --git a/resources.sample b/resources.sample new file mode 100644 index 0000000..bc30868 --- /dev/null +++ b/resources.sample @@ -0,0 +1,4 @@ +resource "google_service_account" "monitoring_datadog" { + account_id = "monitoring-datadog" + display_name = "monitoring datadog" +} diff --git a/users-datadog.sample b/users-datadog.sample new file mode 100644 index 0000000..3fde771 --- /dev/null +++ b/users-datadog.sample @@ -0,0 +1,7 @@ +#Datadog users +resource "datadog_user" "adrien_brefort" { + email = "adrien.brefort@fr.clara.net" + handle = "adrien.brefort@fr.clara.net" + name = "Adrien Bréfort" +} +