From 324bc463004f1e8a8a0c9ca347afeee3cb3cddf3 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Thu, 27 Jun 2019 12:35:49 +0200 Subject: [PATCH] MON-459 remove incubator --- incubator/monitors-apache-nat.tf | 45 ------ incubator/monitors-cassandra.tf | 38 ----- incubator/monitors-gcp-cloud-sql.tf | 100 ------------- incubator/monitors-gcp-lb.tf | 81 ----------- incubator/monitors-gcp-vpn.tf | 24 ---- incubator/monitors-kubernetes.tf | 131 ----------------- incubator/monitors-linux-basics.log | 33 ----- incubator/monitors-linux-basics.tf | 187 ------------------------- incubator/monitors-redis-containers.tf | 76 ---------- incubator/monitors-redis.tf | 81 ----------- incubator/versions.tf | 4 - 11 files changed, 800 deletions(-) delete mode 100644 incubator/monitors-apache-nat.tf delete mode 100644 incubator/monitors-cassandra.tf delete mode 100644 incubator/monitors-gcp-cloud-sql.tf delete mode 100644 incubator/monitors-gcp-lb.tf delete mode 100644 incubator/monitors-gcp-vpn.tf delete mode 100644 incubator/monitors-kubernetes.tf delete mode 100644 incubator/monitors-linux-basics.log delete mode 100644 incubator/monitors-linux-basics.tf delete mode 100644 incubator/monitors-redis-containers.tf delete mode 100644 incubator/monitors-redis.tf delete mode 100644 incubator/versions.tf diff --git a/incubator/monitors-apache-nat.tf b/incubator/monitors-apache-nat.tf deleted file mode 100644 index f1a2907..0000000 --- a/incubator/monitors-apache-nat.tf +++ /dev/null @@ -1,45 +0,0 @@ -resource "datadog_monitor" "apache_worker_nat" { - name = "Apache proxy busy worker > 99% on nat" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - - query = "avg(last_10m):avg:apache.performance.busy_workers{*} by {host} / ( avg:apache.performance.idle_workers{*} by {host} + avg:apache.performance.busy_workers{*} by {host} ) > 0.99" - type = "query alert" - - thresholds = { - warning = 0.95 - critical = 0.99 - } - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "apache_process_nat" { - name = "Apache proxy is down on nat" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - - query = "\"apache.can_connect\".over(\"*\").by(\"host\",\"port\").last(1).count_by_status()" - type = "service check" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - diff --git a/incubator/monitors-cassandra.tf b/incubator/monitors-cassandra.tf deleted file mode 100644 index 7218613..0000000 --- a/incubator/monitors-cassandra.tf +++ /dev/null @@ -1,38 +0,0 @@ -resource "datadog_monitor" "datadog_cassandra_down" { - name = "Cassandra service is down" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "\"cassandra.can_connect\".over(\"cassandra-node\").by(\"host\",\"instance\").last(2).count_by_status()" - type = "service check" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -/*resource "datadog_monitor" "datadog_rule_27" { -name = "OpsCenter process is down" -message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n${var.warning_HO}" -query = "\"process.up\".over(\"opscenter\",process:\"opscenter\").last(1).count_by_status()" -type = "service check" - -notify_no_data = false -renotify_interval = 60 -notify_audit = false -timeout_h = 0 -include_tags = true -locked = false -require_full_window = true -new_host_delay = 300 -notify_no_data = false -renotify_interval = 0 -no_data_timeframe = 20 - -}*/ diff --git a/incubator/monitors-gcp-cloud-sql.tf b/incubator/monitors-gcp-cloud-sql.tf deleted file mode 100644 index 75dd096..0000000 --- a/incubator/monitors-gcp-cloud-sql.tf +++ /dev/null @@ -1,100 +0,0 @@ -resource "datadog_monitor" "cloud_sql_cpu_90" { - name = "Cloud SQL CPU high > 90%" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "avg(last_5m):avg:gcp.cloudsql.database.cpu.utilization{project_id:${var.project_id}} >= 90" - type = "query alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "cloud_sql_disk_space" { - name = "Cloud SQL free disk space < 10%" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - query = "avg(last_5m):avg:gcp.cloudsql.database.disk.bytes_used{project_id:${var.project_id}} by {database_id} / avg:gcp.cloudsql.database.disk.quota{project_id:${var.project_id}} by {database_id} * 100 >= 90" - - thresholds = { - warning = 70 - critical = 90 - } - - type = "query alert" - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "cloud_sql_connection_80" { - name = "Cloud SQL MySQL connection > 80% of max connections" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "avg(last_5m):avg:gcp.cloudsql.database.network.connections{*} > 3500" - type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "cloud_sql_lag" { - name = "Cloud SQL MySQL lag > 45min" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "min(last_10m):avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{*} by {database_id} > 2700" - type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "cloud_sql_replication" { - name = "Cloud SQL Failover not ready to replication" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "max(last_5m):avg:gcp.cloudsql.database.mysql.replication.available_for_failover{*} <= 0" - type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - diff --git a/incubator/monitors-gcp-lb.tf b/incubator/monitors-gcp-lb.tf deleted file mode 100644 index 6d90555..0000000 --- a/incubator/monitors-gcp-lb.tf +++ /dev/null @@ -1,81 +0,0 @@ -resource "datadog_monitor" "datadog_gcp_lb_request_count" { - name = "GCP LoadBalancer request count changed too fast" - message = "" - query = "change(sum(last_5m),last_30m):avg:gcp.loadbalancing.http.request_count{*} by {backend_name}.as_count() >= 300" - type = "query alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_gcp_lb_500" { - name = "GCP LoadBalancer 500 ratio > 5%" - message = "" - query = "sum(last_10m):avg:gcp.loadbalancing.http.request_count{response_code_class:500} by {backend_name}.as_count() / avg:gcp.loadbalancing.http.request_count{*} by {backend_name}.as_count() > 0.2" - type = "query alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_gcp_lb_backend_latency" { - name = "GCP LB backend latency > 2s" - message = "" - query = "min(last_5m):avg:gcp.loadbalancing.http.backend_latencies.avg{*} by {backend_name} > 2000" - type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_gcp_lb_latency" { - name = "GCP LB latency > 5s" - message = "" - query = "avg(last_5m):avg:gcp.loadbalancing.http.total_latencies.avg{*} > 5000" - type = "query alert" - - thresholds = { - warning = 3000 - critical = 5000 - } - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - diff --git a/incubator/monitors-gcp-vpn.tf b/incubator/monitors-gcp-vpn.tf deleted file mode 100644 index 0d3dd5c..0000000 --- a/incubator/monitors-gcp-vpn.tf +++ /dev/null @@ -1,24 +0,0 @@ -resource "datadog_monitor" "gcp_vpn" { - name = "GCP VPN is down" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - - query = "avg(last_5m):avg:gcp.vpn.tunnel_established{*} <= 0" - - thresholds = { - critical = 0 - } - - type = "metric alert" - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - diff --git a/incubator/monitors-kubernetes.tf b/incubator/monitors-kubernetes.tf deleted file mode 100644 index f9cb2b6..0000000 --- a/incubator/monitors-kubernetes.tf +++ /dev/null @@ -1,131 +0,0 @@ -resource "datadog_monitor" "kubernetes_cluster_cpu" { - name = "Kubernetes cluster CPU High > 85%" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - query = "avg(last_5m):avg:system.cpu.system{*} by {cluster-name} + avg:system.cpu.user{*} by {cluster-name} > 85" - - thresholds = { - warning = 75 - critical = 85 - } - - type = "query alert" - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "kubernetes_kubelet_check" { - name = "Kubernetes kubelet check down" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - query = "\"kubernetes.kubelet.check\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()" - - thresholds = { - warning = 0 - critical = 10 - } - - type = "service check" - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "kubernetes_kubelet_ping" { - name = "Kubernetes kubelet ping not ok" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - query = "\"kubernetes.kubelet.check.ping\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()" - - thresholds = { - warning = 0 - critical = 10 - } - - type = "service check" - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "kubernetes_pods_unavailable" { - name = "Kubernetes pods unavailable" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - query = "max(last_5m):avg:kubernetes_state.deployment.replicas_desired{!namespace:cronetes} by {cluster-name,namespace,deployment} - avg:kubernetes_state.deployment.replicas_unavailable{!namespace:cronetes} by {cluster-name,namespace,deployment} + 1 < 1" - type = "query alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "kubernetes_node_status" { - name = "Kubernetes node status" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - query = "max(last_5m):avg:kubernetes_state.node.status{!namespace:cronetes} by {cluster-name,namespace,deployment} <= 0" - type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -/* type = "query alert" - - thresholds { -# warning = 75 - critical = 80 - } - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -}*/ diff --git a/incubator/monitors-linux-basics.log b/incubator/monitors-linux-basics.log deleted file mode 100644 index 4d93564..0000000 --- a/incubator/monitors-linux-basics.log +++ /dev/null @@ -1,33 +0,0 @@ - name = "CPU High > 80% for 15 min" - message = "{{#is_alert}}\n${var.alert_HNO}\n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "min(last_15m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 80" - name = "CPU High > 95% for 5 min" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "min(last_5m):avg:system.cpu.system{!goog-gke-node} by {host} + avg:system.cpu.user{!goog-gke-node} by {host} > 95" - name = "Free disk space < 5%" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5" - name = "Free disk space < 10%" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 10" - name = "Free disk space < 20%" - message = "${var.warning_HO}" - query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 20" - name = "Free disk inodes < 5%" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" - query = "sum(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 5" - name = "Free disk inodes < 10%" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" - query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 10" - name = "Free disk inodes < 20%" - message = "{{#is_alert}}\n${var.warning_HO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.warning_HO} \n{{/is_recovery}}" - query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 20" - name = "CPU Load > 2" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" - query = "min(last_5m):avg:system.load.5{*} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2" - name = "Free memory < 5%" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" - query = "sum(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5" - name = "Host unreachable" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO} \n{{/is_recovery}}" - query = "\"datadog.agent.up\".over(\"*\").last(1).count_by_status()" diff --git a/incubator/monitors-linux-basics.tf b/incubator/monitors-linux-basics.tf deleted file mode 100644 index eb8126b..0000000 --- a/incubator/monitors-linux-basics.tf +++ /dev/null @@ -1,187 +0,0 @@ -resource "datadog_monitor" "cpu_80_15min" { - name = "CPU High > 80% for 15 min" - message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" - count = var.linux-basics == "enabled" ? 1 : 0 - - query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,!dd_custom_cpu:enabled} by {host} > 80" - type = "query alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "cpu_95_5min" { - name = "CPU High > 95% for 5 min" - message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" - - query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,!dd_custom_cpu:enabled} by {host} > 95" - type = "query alert" - count = var.linux-basics == "enabled" ? 1 : 0 - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_free_disk_space_5" { - name = "Free disk space < 5%" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" - - query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5" - type = "query alert" - count = var.linux-basics == "enabled" ? 1 : 0 - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_free_disk_space_10" { - name = "Free disk space < 10%" - message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - - query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 10" - type = "query alert" - count = var.linux-basics == "enabled" ? 1 : 0 - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_free_disk_space_inodes_5" { - name = "Free disk inodes < 5%" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}" - - query = "sum(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 5" - type = "query alert" - count = var.linux-basics == "enabled" ? 1 : 0 - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_free_disk_space_inodes_10" { - name = "Free disk inodes < 10%" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}" - - query = "max(last_5m):avg:system.fs.inodes.free{*} by {host,device} / avg:system.fs.inodes.total{*} by {host,device} * 100 < 10" - type = "query alert" - count = var.linux-basics == "enabled" ? 1 : 0 - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_cpu_load" { - name = "CPU Load > 2" - message = "Debugging alert - no escalation" - - query = "min(last_5m):avg:system.load.5{*} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2" - type = "query alert" - count = var.linux-basics == "enabled" ? 1 : 0 - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_free_memory" { - name = "Free memory < 5%" - message = "Debugging alert - no escalation" - query = "sum(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5" - type = "query alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_host_unreachable" { - name = "Host unreachable" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}" - - query = "\"datadog.agent.up\".over(\"*\").last(1).count_by_status()" - type = "service check" - count = var.linux-basics == "enabled" ? 1 : 0 - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = true - renotify_interval = 0 - no_data_timeframe = 20 -} - diff --git a/incubator/monitors-redis-containers.tf b/incubator/monitors-redis-containers.tf deleted file mode 100644 index ec81d43..0000000 --- a/incubator/monitors-redis-containers.tf +++ /dev/null @@ -1,76 +0,0 @@ -resource "datadog_monitor" "kubernetes_redis_cpu_95_5min" { - name = "Kubernetes Redis container CPU High > 95% for 5 min" - - #message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "avg(last_5m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 95" - - thresholds = { - # warning = 80 - critical = 95 - } - - type = "query alert" - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" { - name = "Kubernetes Redis container CPU High > 80% for 15 min" - - #message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - - query = "min(last_15m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 80" - type = "query alert" - - thresholds = { - # warning = 75 - critical = 80 - } - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -# resource "datadog_monitor" "kubernetes_redis_oom" { -# name = "Kubernetes Redis container out of memory > 85%" -# message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n${var.warning_HO}" -# query = "avg(last_5m):avg:gcp.container.memory.bytes_used{container_name:redis} by {cluster-name} / avg:gcp.container.memory.bytes_total{container_name:redis} by {cluster-name} > 85" -# -# thresholds { -# warning = 70 -# critical = 85 -# } -# -# type = "query alert" -# notify_no_data = false -# renotify_interval = 60 -# notify_audit = false -# timeout_h = 0 -# include_tags = true -# locked = false -# require_full_window = true -# new_host_delay = 300 -# notify_no_data = false -# renotify_interval = 0 -# no_data_timeframe = 20 -# } diff --git a/incubator/monitors-redis.tf b/incubator/monitors-redis.tf deleted file mode 100644 index 9a982e0..0000000 --- a/incubator/monitors-redis.tf +++ /dev/null @@ -1,81 +0,0 @@ -resource "datadog_monitor" "redis_connection" { - name = "Redis connection is down (Datadog check)" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "\"redis.can_connect\".over(\"app:redis\").by(\"*\").last(1).pct_by_status()" - - thresholds = { - critical = 50 - warning = 5 - } - - type = "service check" - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "redis_eviction" { - name = "Redis eviction > 0" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "min(last_5m):avg:redis.keys.evicted{*} > 0" - type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_blocked_client" { - name = "Redis blocked clients > 0" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "min(last_5m):avg:redis.clients.blocked{*} > 0" - type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - -resource "datadog_monitor" "redis_swap" { - name = "Redis begin to swap" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "avg(last_5m):avg:redis.mem.fragmentation_ratio{*} <= 0.8" - type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 -} - diff --git a/incubator/versions.tf b/incubator/versions.tf deleted file mode 100644 index ac97c6a..0000000 --- a/incubator/versions.tf +++ /dev/null @@ -1,4 +0,0 @@ - -terraform { - required_version = ">= 0.12" -}