Init datadog monitors repository

2017-07-03 19:05:02 +02:00 · 2017-07-03 19:05:02 +02:00 · a141f6d5a0
commit a141f6d5a0
15 changed files with 852 additions and 0 deletions
--- a/inputs.tf
+++ b/inputs.tf
@ -0,0 +1,9 @@
+# Text interpolated into monitor messages, mostly inside the {{#is_warning}}
+# and {{#is_warning_recovery}} sections (presumably a Datadog notification
+# handle - confirm with the module's callers).
+variable "warning_HO" {
+  type = "string"
+}
+
+# Text interpolated into the {{#is_alert}} and {{#is_recovery}} sections of
+# monitor messages (presumably a Datadog notification handle - confirm with
+# the module's callers).
+variable "alert_HNO" {
+  type = "string"
+}
+
+# Value used as the `project_id:` tag filter in the Cloud SQL monitor queries.
+variable "project_id" {}
--- a/modules.tf
+++ b/modules.tf
@ -0,0 +1 @@
+#TODO
--- a/monitors/monitors-apache-nat.tf
+++ b/monitors/monitors-apache-nat.tf
@ -0,0 +1,44 @@
+# Per-host share of busy Apache workers among all workers (busy + idle),
+# averaged over 10 minutes: warning at 0.95, critical at 0.99.
+# Alert/recovery text is var.alert_HNO; warning text is var.warning_HO.
+resource "datadog_monitor" "apache_worker_nat" {
+  name    = "Apache proxy busy worker > 99% on nat"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  query   = "avg(last_10m):avg:apache.performance.busy_workers{*} by {host} / ( avg:apache.performance.idle_workers{*} by {host} + avg:apache.performance.busy_workers{*} by {host} ) > 0.99"
+  type    = "query alert"
+
+  thresholds {
+    warning  = 0.95
+    critical = 0.99
+  }
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Service check on "apache.can_connect" over every scope, grouped by host and
+# port, evaluated on the last check result.
+# Alert/recovery text is var.alert_HNO; warning text is var.warning_HO.
+resource "datadog_monitor" "apache_process_nat" {
+  name    = "Apache proxy is down on nat"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  query   = "\"apache.can_connect\".over(\"*\").by(\"host\",\"port\").last(1).count_by_status()"
+  type    = "service check"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
--- a/monitors/monitors-cassandra.tf
+++ b/monitors/monitors-cassandra.tf
@ -0,0 +1,38 @@
+# Service check on "cassandra.can_connect" over the "cassandra-node" scope,
+# grouped by host and instance, evaluated on the last 2 check results.
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "datadog_cassandra_down" {
+  name    = "Cassandra service is down"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "\"cassandra.can_connect\".over(\"cassandra-node\").by(\"host\",\"instance\").last(2).count_by_status()"
+  type    = "service check"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+/*resource "datadog_monitor" "datadog_rule_27" {
+name   = "OpsCenter process is down"
+message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n${var.warning_HO}"
+query   = "\"process.up\".over(\"opscenter\",process:\"opscenter\").last(1).count_by_status()"
+type  = "service check"
+
+notify_no_data      = false
+renotify_interval   = 60
+notify_audit        = false
+timeout_h           = 0
+include_tags        = true
+locked              = false
+require_full_window = true
+new_host_delay      = 300
+notify_no_data      = false
+renotify_interval   = 0
+no_data_timeframe   = 20
+
+}*/
--- a/monitors/monitors-gcp-cloud-sql.tf
+++ b/monitors/monitors-gcp-cloud-sql.tf
@ -0,0 +1,101 @@
+# 5-minute average of Cloud SQL CPU utilization for var.project_id, alerting
+# at >= 90. Alert and recovery text is var.alert_HNO.
+# NOTE(review): the Redis container monitors multiply a GCP utilization metric
+# by 100 before comparing; confirm this metric is already a percentage.
+resource "datadog_monitor" "cloud_sql_cpu_90" {
+  name    = "Cloud SQL CPU high > 90%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "avg(last_5m):avg:gcp.cloudsql.database.cpu.utilization{project_id:${var.project_id}} >= 90"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Per-database disk usage (bytes_used / quota * 100) for var.project_id,
+# averaged over 5 minutes: warning at 70, critical at 90 (i.e. < 10% free).
+# Alert/recovery text is var.alert_HNO; warning text is var.warning_HO.
+resource "datadog_monitor" "cloud_sql_disk_space" {
+  name    = "Cloud SQL free disk space < 10%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  query   = "avg(last_5m):avg:gcp.cloudsql.database.disk.bytes_used{project_id:${var.project_id}} by {database_id} / avg:gcp.cloudsql.database.disk.quota{project_id:${var.project_id}} by {database_id} * 100  >= 90"
+  type    = "query alert"
+
+  thresholds {
+    warning  = 70
+    critical = 90
+  }
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# 5-minute average of Cloud SQL network connections across all scopes,
+# alerting above the fixed value 3500. Alert and recovery text is var.alert_HNO.
+# NOTE(review): 3500 is presumably 80% of the instance's max connections.
+resource "datadog_monitor" "cloud_sql_connection_80" {
+  name    = "Cloud SQL MySQL connection > 80% of max connections"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "avg(last_5m):avg:gcp.cloudsql.database.network.connections{*} > 3500"
+  type    = "metric alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Per-database MySQL replication lag (seconds_behind_master): alerts when the
+# minimum over 10 minutes stays above 2700 seconds (45 minutes).
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "cloud_sql_lag" {
+  name    = "Cloud SQL MySQL lag > 45min"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "min(last_10m):avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{*} by {database_id} > 2700"
+  type    = "metric alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Alerts when the 5-minute maximum of the Cloud SQL
+# replication.available_for_failover metric is <= 0.
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "cloud_sql_replication" {
+  name    = "Cloud SQL Failover not ready to replication"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "max(last_5m):avg:gcp.cloudsql.database.mysql.replication.available_for_failover{*} <= 0"
+  type    = "metric alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
--- a/monitors/monitors-gcp-lb.tf
+++ b/monitors/monitors-gcp-lb.tf
@ -0,0 +1,87 @@
+# Change, over 30 minutes, of the 5-minute request count per backend_name;
+# alerts when the change is >= 300.
+# NOTE(review): message is empty, so no recipient is named - confirm intent.
+resource "datadog_monitor" "datadog_gcp_lb_request_count" {
+  name    = "GCP LoadBalancer request count changed too fast"
+  message = ""
+  query   = "change(sum(last_5m),last_30m):avg:gcp.loadbalancing.http.request_count{*} by {backend_name}.as_count() >= 300"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+
+# Per-backend ratio of requests tagged response_code_class:500 to all
+# requests, over 10 minutes.
+# NOTE(review): the name says "> 5%" but the query compares against 0.2;
+# one of the two needs correcting. Message is empty, so no recipient is named.
+resource "datadog_monitor" "datadog_gcp_lb_500" {
+  name    = "GCP LoadBalancer 500 ratio > 5%"
+  message = ""
+  query   = "sum(last_10m):avg:gcp.loadbalancing.http.request_count{response_code_class:500} by {backend_name}.as_count() / avg:gcp.loadbalancing.http.request_count{*} by {backend_name}.as_count() > 0.2"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+
+# Per-backend average backend latency: alerts when the 5-minute minimum stays
+# above 2000 (2s per the monitor name).
+# NOTE(review): message is empty, so no recipient is named - confirm intent.
+resource "datadog_monitor" "datadog_gcp_lb_backend_latency" {
+  name    = "GCP LB backend latency > 2s"
+  message = ""
+  query   = "min(last_5m):avg:gcp.loadbalancing.http.backend_latencies.avg{*} by {backend_name} > 2000"
+  type    = "metric alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+
+# 5-minute average of total load balancer latency across all scopes:
+# warning at 3000, critical at 5000 (5s per the monitor name).
+# NOTE(review): message is empty, so no recipient is named - confirm intent.
+resource "datadog_monitor" "datadog_gcp_lb_latency" {
+  name    = "GCP LB latency > 5s"
+  message = ""
+  query   = "avg(last_5m):avg:gcp.loadbalancing.http.total_latencies.avg{*} > 5000"
+  type    = "query alert"
+
+  thresholds {
+    warning  = 3000
+    critical = 5000
+  }
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
--- a/monitors/monitors-gcp-vpn.tf
+++ b/monitors/monitors-gcp-vpn.tf
@ -0,0 +1,23 @@
+# Alerts when the 5-minute average of gcp.vpn.tunnel_established is <= 0.
+# Only a critical threshold is set; the warning sections of the message are
+# present but no warning threshold exists to trigger them.
+resource "datadog_monitor" "gcp_vpn" {
+  name    = "GCP VPN is down"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  query   = "avg(last_5m):avg:gcp.vpn.tunnel_established{*} <= 0"
+  type    = "metric alert"
+
+  thresholds {
+    critical = 0
+  }
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
--- a/monitors/monitors-kubernetes.tf
+++ b/monitors/monitors-kubernetes.tf
@ -0,0 +1,129 @@
+# Sum of system and user CPU per cluster-name, averaged over 5 minutes:
+# warning at 75, critical at 85.
+# Alert/recovery text is var.alert_HNO; warning text is var.warning_HO.
+resource "datadog_monitor" "kubernetes_cluster_cpu" {
+  name    = "Kubernetes cluster CPU High > 85%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  query   = "avg(last_5m):avg:system.cpu.system{*} by {cluster-name} + avg:system.cpu.user{*} by {cluster-name} > 85"
+  type    = "query alert"
+
+  thresholds {
+    warning  = 75
+    critical = 85
+  }
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Service check on "kubernetes.kubelet.check" over the "goog-gke-node" scope,
+# evaluated as a percentage by status: warning at 0, critical at 10.
+# Alert/recovery text is var.alert_HNO; warning text is var.warning_HO.
+resource "datadog_monitor" "kubernetes_kubelet_check" {
+  name    = "Kubernetes kubelet check down"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  query   = "\"kubernetes.kubelet.check\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()"
+  type    = "service check"
+
+  thresholds {
+    warning  = 0
+    critical = 10
+  }
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Service check on "kubernetes.kubelet.check.ping" over the "goog-gke-node"
+# scope, evaluated as a percentage by status: warning at 0, critical at 10.
+# Alert/recovery text is var.alert_HNO; warning text is var.warning_HO.
+resource "datadog_monitor" "kubernetes_kubelet_ping" {
+  name    = "Kubernetes kubelet ping not ok"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  query   = "\"kubernetes.kubelet.check.ping\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()"
+  type    = "service check"
+
+  thresholds {
+    warning  = 0
+    critical = 10
+  }
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Per cluster-name/namespace/deployment (namespace "cronetes" excluded):
+# alerts when replicas_desired - replicas_unavailable + 1 stays below 1 over
+# 5 minutes, i.e. no desired replica is available.
+# Alert/recovery text is var.alert_HNO; warning text is var.warning_HO.
+resource "datadog_monitor" "kubernetes_pods_unavailable" {
+  name    = "Kubernetes pods unavailable"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  query   = "max(last_5m):avg:kubernetes_state.deployment.replicas_desired{!namespace:cronetes} by {cluster-name,namespace,deployment} - avg:kubernetes_state.deployment.replicas_unavailable{!namespace:cronetes} by {cluster-name,namespace,deployment} + 1 < 1"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Alerts when the 5-minute maximum of kubernetes_state.node.status is <= 0.
+# NOTE(review): the namespace filter and the namespace/deployment grouping
+# match the pods monitor; confirm they are meaningful for a node metric.
+resource "datadog_monitor" "kubernetes_node_status" {
+  name    = "Kubernetes node status"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  query   = "max(last_5m):avg:kubernetes_state.node.status{!namespace:cronetes} by {cluster-name,namespace,deployment} <= 0"
+  type    = "metric alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# NOTE: a headerless attribute block (type, thresholds, monitor options and a
+# closing brace) stood here outside any resource, which is not valid HCL.
+# It was identical to the tail of kubernetes_redis_cpu_80_15min in
+# monitors-redis-containers.tf and has been dropped.
--- a/monitors/monitors-linux-basics.log
+++ b/monitors/monitors-linux-basics.log
@ -0,0 +1,33 @@
+  name    = "CPU High > 80% for 15 min"
+  message = "{{#is_alert}}\n${var.alert_HNO}\n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query = "min(last_15m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 80"
+  name    = "CPU High > 95% for 5 min"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query = "min(last_5m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 95"
+  name   = "Free disk space < 5%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5"
+  name   = "Free disk space < 10%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 10"
+  name   = "Free disk space < 20%"
+  message = "${var.warning_HO}"
+  query   = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 20"
+  name   = "Free disk inodes < 5%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
+  query   = "sum(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 5"
+  name   = "Free disk inodes < 10%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
+  query   = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 10"
+  name   = "Free disk inodes < 20%"
+  message = "{{#is_alert}}\n${var.warning_HO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.warning_HO} \n{{/is_recovery}}"
+  query   = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 20"
+  name   = "CPU Load > 2"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
+  query   = "min(last_5m):avg:system.load.5{*} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2"
+  name   = "Free memory < 5%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
+  query   = "sum(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5"
+  name   = "Host unreachable"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
+  query   = "\"datadog.agent.up\".over(\"*\").last(1).count_by_status()"
--- a/monitors/monitors-linux-basics.tf
+++ b/monitors/monitors-linux-basics.tf
@ -0,0 +1,210 @@
+# Per-host system + user CPU, excluding the goog-gke-node tag: alerts when the
+# 15-minute minimum stays above 80.
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "cpu_80_15min" {
+  name    = "CPU High > 80% for 15 min"
+  message = "{{#is_alert}}\n${var.alert_HNO}\n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "min(last_15m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 80"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Per-host system + user CPU, excluding the goog-gke-node tag: alerts when the
+# 5-minute minimum stays above 95.
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "cpu_95_5min" {
+  name    = "CPU High > 95% for 5 min"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "min(last_5m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 95"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Free disk percentage (disk.free / disk.total * 100) per host and device,
+# alerting below 5. Alert and recovery text is var.alert_HNO.
+# NOTE(review): the window aggregation is sum(), unlike the max() used by the
+# 10%/20% inode monitors; confirm sum() is intended for a percentage.
+resource "datadog_monitor" "datadog_free_disk_space_5" {
+  name    = "Free disk space < 5%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Free disk percentage (disk.free / disk.total * 100) per host and device,
+# alerting below 10. Alert and recovery text is var.alert_HNO.
+# NOTE(review): the window aggregation is sum(), unlike the max() used by the
+# 10%/20% inode monitors; confirm sum() is intended for a percentage.
+resource "datadog_monitor" "datadog_free_disk_space_10" {
+  name    = "Free disk space < 10%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 10"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Free disk percentage (disk.free / disk.total * 100) per host and device,
+# alerting below 20. The whole message is var.warning_HO, unconditionally.
+# NOTE(review): the window aggregation is sum(), unlike the max() used by the
+# 10%/20% inode monitors; confirm sum() is intended for a percentage.
+resource "datadog_monitor" "datadog_free_disk_space_20" {
+  name    = "Free disk space < 20%"
+  message = "${var.warning_HO}"
+  query   = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 20"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Free inode percentage (inodes.free / inodes.total * 100) per host and
+# device, alerting below 5. Alert and recovery text is var.alert_HNO.
+# NOTE(review): this monitor aggregates with sum() while the 10%/20% inode
+# monitors use max(); confirm which is intended.
+resource "datadog_monitor" "datadog_free_disk_space_inodes_5" {
+  name    = "Free disk inodes < 5%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
+  query   = "sum(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 5"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Free inode percentage (inodes.free / inodes.total * 100) per host and
+# device: alerts when the 5-minute maximum is below 10.
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "datadog_free_disk_space_inodes_10" {
+  name    = "Free disk inodes < 10%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
+  query   = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 10"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Free inode percentage (inodes.free / inodes.total * 100) per host and
+# device: alerts when the 5-minute maximum is below 20.
+# Alert and recovery text is var.warning_HO (not var.alert_HNO).
+resource "datadog_monitor" "datadog_free_disk_space_inodes_20" {
+  name    = "Free disk inodes < 20%"
+  message = "{{#is_alert}}\n${var.warning_HO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.warning_HO} \n{{/is_recovery}}"
+  query   = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 20"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# 5-minute load average divided by reserved GCE cores, per instance-id:
+# alerts when the 5-minute minimum stays above 2.
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "datadog_cpu_load" {
+  name    = "CPU Load > 2"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
+  query   = "min(last_5m):avg:system.load.5{*} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Free memory percentage (mem.free / mem.total * 100) per host over the last
+# minute, alerting below 5. Alert and recovery text is var.alert_HNO.
+# NOTE(review): the window aggregation is sum(); confirm that is intended for
+# a percentage rather than avg/min/max.
+resource "datadog_monitor" "datadog_free_memory" {
+  name    = "Free memory < 5%"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
+  query   = "sum(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5"
+  type    = "query alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# Service check on "datadog.agent.up" over every scope, evaluated on the last
+# check result. Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "datadog_host_unreachable" {
+  name    = "Host unreachable"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}"
+  query   = "\"datadog.agent.up\".over(\"*\").last(1).count_by_status()"
+  type    = "service check"
+
+  # Monitor options: each key is declared exactly once.
+  # Unlike the other monitors, missing data notifies here (presumably because
+  # a silent agent is itself the failure being watched for - confirm).
+  notify_no_data      = true
+  no_data_timeframe   = 20
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+}
--- a/monitors/monitors-redis-containers.tf
+++ b/monitors/monitors-redis-containers.tf
@ -0,0 +1,74 @@
+# CPU utilization (* 100) of containers named "redis", per cluster-name,
+# averaged over 5 minutes: critical at 95, warning threshold disabled.
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "kubernetes_redis_cpu_95_5min" {
+  name    = "Kubernetes Redis container CPU High > 95% for 5 min"
+  #message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "avg(last_5m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 95"
+  type    = "query alert"
+
+  thresholds {
+#    warning  = 80
+    critical = 95
+  }
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# CPU utilization (* 100) of containers named "redis", per cluster-name:
+# alerts when the 15-minute minimum stays above 80; warning threshold disabled.
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" {
+  name    = "Kubernetes Redis container CPU High > 80% for 15 min"
+  #message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "min(last_15m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 80"
+  type    = "query alert"
+
+  thresholds {
+#    warning  = 75
+    critical = 80
+  }
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+# resource "datadog_monitor" "kubernetes_redis_oom" {
+#   name    = "Kubernetes Redis container out of memory > 85%"
+#   message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n${var.warning_HO}"
+#   query = "avg(last_5m):avg:gcp.container.memory.bytes_used{container_name:redis} by {cluster-name} / avg:gcp.container.memory.bytes_total{container_name:redis} by {cluster-name} > 85"
+#
+#   thresholds {
+#     warning  = 70
+#     critical = 85
+#   }
+#
+#   type                = "query alert"
+#   notify_no_data      = false
+#   renotify_interval   = 60
+#   notify_audit        = false
+#   timeout_h           = 0
+#   include_tags        = true
+#   locked              = false
+#   require_full_window = true
+#   new_host_delay      = 300
+#   notify_no_data      = false
+#   renotify_interval   = 0
+#   no_data_timeframe   = 20
+# }
--- a/monitors/monitors-redis.tf
+++ b/monitors/monitors-redis.tf
@ -0,0 +1,87 @@
+# Service check on "redis.can_connect" over the "app:redis" scope, evaluated
+# as a percentage by status: warning at 5, critical at 50.
+# Alert and recovery text is var.alert_HNO; the message has no warning section.
+resource "datadog_monitor" "redis_connection" {
+  name    = "Redis connection is down (Datadog check)"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "\"redis.can_connect\".over(\"app:redis\").by(\"*\").last(1).pct_by_status()"
+  type    = "service check"
+
+  thresholds {
+    critical = 50
+    warning  = 5
+  }
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+
+
+# Alerts when the 5-minute minimum of redis.keys.evicted stays above 0.
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "redis_eviction" {
+  name    = "Redis eviction > 0"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "min(last_5m):avg:redis.keys.evicted{*} > 0"
+  type    = "metric alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+
+# Alerts when the 5-minute minimum of redis.clients.blocked stays above 0.
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "datadog_blocked_client" {
+  name    = "Redis blocked clients > 0"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "min(last_5m):avg:redis.clients.blocked{*} > 0"
+  type    = "metric alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
+
+
+# Alerts when the 5-minute average of redis.mem.fragmentation_ratio is <= 0.8
+# (treated by the monitor name as a sign that Redis is starting to swap).
+# Alert and recovery text is var.alert_HNO.
+resource "datadog_monitor" "redis_swap" {
+  name    = "Redis begin to swap"
+  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
+  query   = "avg(last_5m):avg:redis.mem.fragmentation_ratio{*} <= 0.8"
+  type    = "metric alert"
+
+  # Monitor options: each key is declared exactly once.
+  notify_no_data      = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = 300
+  no_data_timeframe   = 20
+}
--- a/outputs.tf
+++ b/outputs.tf
@ -0,0 +1,5 @@
+# Exposes a map of service-account e-mails; the only entry is the Datadog
+# monitoring account.
+# NOTE(review): google_service_account.monitoring_datadog is only declared in
+# resources.sample within this commit - confirm it exists in a loaded .tf file.
+output "service_accounts_emails" {
+  value = {
+    monitoring_datadog = "${google_service_account.monitoring_datadog.email}"
+  }
+}
--- a/resources.sample
+++ b/resources.sample
@ -0,0 +1,4 @@
+# Sample: Google service account whose e-mail is read by the
+# service_accounts_emails output.
+resource "google_service_account" "monitoring_datadog" {
+  display_name = "monitoring datadog"
+  account_id   = "monitoring-datadog"
+}
--- a/users-datadog.sample
+++ b/users-datadog.sample
@ -0,0 +1,7 @@
+#Datadog users
+# Sample Datadog user declaration; handle and email carry the same address.
+resource "datadog_user" "adrien_brefort" {
+  name   = "Adrien Bréfort"
+  handle = "adrien.brefort@fr.clara.net"
+  email  = "adrien.brefort@fr.clara.net"
+}
+