From ee70881f11315765ccc414baab763c96e0fdbecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Br=C3=A9fort?= Date: Tue, 23 Jan 2018 15:03:58 +0100 Subject: [PATCH 1/2] TER-209 Add basic CI. --- .gitignore | 13 +++---------- bitbucket-pipelines.yml | 15 +++++++++++++++ main.tf.ci | 7 +++++++ terraform.tfvars.ci | 6 ++++++ 4 files changed, 31 insertions(+), 10 deletions(-) create mode 100644 bitbucket-pipelines.yml create mode 100644 main.tf.ci create mode 100644 terraform.tfvars.ci diff --git a/.gitignore b/.gitignore index a34147e..d1a12b6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,3 @@ -# Ignore all volatile files -**/.terraform/modules -**/.terraform/plugins -**/terraform.tfstate*.backup - -# Ignore all credentials files -**/terraform.tfvars - -# Ignore all but root state files -**/terraform.tfstate +.terraform +main.tf +terraform.tfvars diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml new file mode 100644 index 0000000..7f638d0 --- /dev/null +++ b/bitbucket-pipelines.yml @@ -0,0 +1,15 @@ +image: hashicorp/terraform:0.10.8 + +pipelines: + default: + - step: + name: Format + script: + - terraform fmt -write=false -diff -check + - step: + name: Validate + script: + - mv main.tf.ci main.tf + - mv terraform.tfvars.ci terraform.tfvars + - terraform init + - terraform validate diff --git a/main.tf.ci b/main.tf.ci new file mode 100644 index 0000000..a2d1b10 --- /dev/null +++ b/main.tf.ci @@ -0,0 +1,7 @@ +variable "aws_region" {} + +provider "aws" { + version = "1.2.0" + + region = "${var.aws_region}" +} diff --git a/terraform.tfvars.ci b/terraform.tfvars.ci new file mode 100644 index 0000000..9aa2075 --- /dev/null +++ b/terraform.tfvars.ci @@ -0,0 +1,6 @@ +aws_region="eu-west-1" +region="eu-west-1" +env="test" +hno_escalation_group="abc" +ho_escalation_group="abc" + From eca7bb5272f51b0f04f9ff18af478723f11913b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Br=C3=A9fort?= Date: Tue, 23 Jan 2018 15:05:32 +0100 Subject: [PATCH 2/2] TER-209 Terraform fmt to pass CI. --- .../elasticsearch/monitors-elasticsearch.tf | 9 +- cloud/azure/apimanagement/inputs.tf | 10 +- .../monitors-azure-apimanagement.tf | 11 +- .../app-services/monitors-app_services.tf | 6 +- cloud/azure/eventhub/inputs.tf | 12 +-- cloud/azure/eventhub/monitors-eventhub.tf | 9 +- cloud/azure/iothubs/inputs.tf | 2 +- cloud/azure/redis/inputs.tf | 2 +- cloud/azure/storage/inputs.tf | 21 ++-- cloud/azure/storage/monitors-azure-storage.tf | 23 ++-- cloud/azure/stream-analytics/inputs.tf | 16 +-- .../monitors-stream-analytics.tf | 19 +++- incubator/monitors-gcp-cloud-sql.tf | 100 +++++++++-------- incubator/monitors-kubernetes.tf | 23 ++-- incubator/monitors-linux-basics.tf | 12 +-- incubator/monitors-redis-containers.tf | 13 ++- incubator/monitors-redis.tf | 101 ++++++++---------- inputs.tf | 1 + system/linux/monitors-linux-basics.tf | 2 +- 19 files changed, 200 insertions(+), 192 deletions(-) diff --git a/cloud/aws/elasticsearch/monitors-elasticsearch.tf b/cloud/aws/elasticsearch/monitors-elasticsearch.tf index c28eaed..7ba3aa2 100644 --- a/cloud/aws/elasticsearch/monitors-elasticsearch.tf +++ b/cloud/aws/elasticsearch/monitors-elasticsearch.tf @@ -11,7 +11,8 @@ resource "datadog_monitor" "es_cluster_status" { name = "[${var.environment}] ElasticSearch cluster status is not green" message = "${var.message}" - type = "query alert" + type = "query alert" + query = < ${var.failed_requests_rate_thresold_critical} EOF - type = "metric alert" + + type = "metric alert" thresholds { critical = "${var.failed_requests_rate_thresold_critical}" @@ -78,7 +80,8 @@ resource "datadog_monitor" "eventhub_errors" { avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() ) * 100 > ${var.errors_rate_thresold_critical} EOF - type = "metric alert" + + type = "metric alert" thresholds { critical = "${var.errors_rate_thresold_critical}" diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 68c9965..1eb0d0d 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -16,7 +16,7 @@ variable "message" { variable "filter_tags" { description = "Tags used for filtering" - default = "*" + default = "*" } # Azure IOT hubs specific diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 49750fa..18f0448 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -1,7 +1,7 @@ # Global Terraform variable "environment" { description = "Architecture environment" - type = "string" + type = "string" } # Global DataDog diff --git a/cloud/azure/storage/inputs.tf b/cloud/azure/storage/inputs.tf index 5c512b5..e48df74 100644 --- a/cloud/azure/storage/inputs.tf +++ b/cloud/azure/storage/inputs.tf @@ -1,7 +1,7 @@ # Global Terraform variable "environment" { description = "Architecture environment" - type = "string" + type = "string" } # Global DataDog @@ -27,46 +27,45 @@ variable "filter_tags_custom" { # Azure Storage specific variable "availability_threshold_critical" { description = "Minimum acceptable percent of availability for a storage" - default = 90 + default = 90 } variable "successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests for a storage" - default = 90 + default = 90 } variable "latency_threshold_critical" { description = "Maximum acceptable end to end latency (ms) for a storage" - default = 1000 + default = 1000 } variable "timeout_error_requests_threshold_critical" { description = "Maximum acceptable percent of timeout error requests for a storage" - default = 5 + default = 5 } variable "network_error_requests_threshold_critical" { description = "Maximum acceptable percent of network error requests for a storage" - default = 5 + default = 5 } variable "throttling_error_requests_threshold_critical" { description = "Maximum acceptable percent of throttling error requests for a storage" - default = 10 + default = 10 } variable "server_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of server other error requests for a storage" - default = 10 + default = 10 } variable "client_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of client other error requests for a storage" - default = 15 + default = 15 } variable "authorization_error_requests_threshold_critical" { description = "Maximum acceptable percent of authorization error requests for a storage" - default = 15 + default = 15 } - diff --git a/cloud/azure/storage/monitors-azure-storage.tf b/cloud/azure/storage/monitors-azure-storage.tf index 7466798..0730013 100644 --- a/cloud/azure/storage/monitors-azure-storage.tf +++ b/cloud/azure/storage/monitors-azure-storage.tf @@ -17,7 +17,7 @@ resource "datadog_monitor" "availability" { EOF thresholds { - critical = "${var.availability_threshold_critical}" + critical = "${var.availability_threshold_critical}" } type = "metric alert" @@ -46,7 +46,7 @@ resource "datadog_monitor" "successful_requests" { EOF thresholds { - critical = "${var.successful_requests_threshold_critical}" + critical = "${var.successful_requests_threshold_critical}" } type = "metric alert" @@ -75,7 +75,7 @@ resource "datadog_monitor" "latency" { EOF thresholds { - critical = "${var.latency_threshold_critical}" + critical = "${var.latency_threshold_critical}" } type = "metric alert" @@ -104,7 +104,7 @@ resource "datadog_monitor" "timeout_error_requests" { EOF thresholds { - critical = "${var.timeout_error_requests_threshold_critical}" + critical = "${var.timeout_error_requests_threshold_critical}" } type = "metric alert" @@ -122,7 +122,6 @@ EOF tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] } - resource "datadog_monitor" "network_error_requests" { name = "[${var.environment}] Azure Storage {{value}}% of network error requests on {{name}}" message = "${var.message}" @@ -134,7 +133,7 @@ resource "datadog_monitor" "network_error_requests" { EOF thresholds { - critical = "${var.network_error_requests_threshold_critical}" + critical = "${var.network_error_requests_threshold_critical}" } type = "metric alert" @@ -152,7 +151,6 @@ EOF tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] } - resource "datadog_monitor" "throttling_error_requests" { name = "[${var.environment}] Azure Storage {{value}}% of throttling error requests on {{name}}" message = "${var.message}" @@ -164,7 +162,7 @@ resource "datadog_monitor" "throttling_error_requests" { EOF thresholds { - critical = "${var.throttling_error_requests_threshold_critical}" + critical = "${var.throttling_error_requests_threshold_critical}" } type = "metric alert" @@ -182,7 +180,6 @@ EOF tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] } - resource "datadog_monitor" "server_other_error_requests" { name = "[${var.environment}] Azure Storage {{value}}% of server_other error requests on {{name}}" message = "${var.message}" @@ -194,7 +191,7 @@ resource "datadog_monitor" "server_other_error_requests" { EOF thresholds { - critical = "${var.server_other_error_requests_threshold_critical}" + critical = "${var.server_other_error_requests_threshold_critical}" } type = "metric alert" @@ -212,7 +209,6 @@ EOF tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] } - resource "datadog_monitor" "client_other_error_requests" { name = "[${var.environment}] Azure Storage {{value}}% of client_other error requests on {{name}}" message = "${var.message}" @@ -224,7 +220,7 @@ resource "datadog_monitor" "client_other_error_requests" { EOF thresholds { - critical = "${var.client_other_error_requests_threshold_critical}" + critical = "${var.client_other_error_requests_threshold_critical}" } type = "metric alert" @@ -242,7 +238,6 @@ EOF tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] } - resource "datadog_monitor" "authorization_error_requests" { name = "[${var.environment}] Azure Storage {{value}}% of authorization error requests on {{name}}" message = "${var.message}" @@ -254,7 +249,7 @@ resource "datadog_monitor" "authorization_error_requests" { EOF thresholds { - critical = "${var.authorization_error_requests_threshold_critical}" + critical = "${var.authorization_error_requests_threshold_critical}" } type = "metric alert" diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index ae1186a..ce3c713 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -27,40 +27,40 @@ variable "filter_tags_custom" { # Azure Stream Analytics specific variable "su_utilization_threshold_warning" { description = "Streaming Unit utilization rate limit (warning threshold)" - default = 60 + default = 60 } variable "su_utilization_threshold_critical" { description = "Streaming Unit utilization rate limit (critical threshold)" - default = 80 + default = 80 } variable "function_requests_threshold_warning" { description = "Failed Function Request rate limit (warning threshold)" - default = 0 + default = 0 } variable "failed_function_requests_threshold_critical" { description = "Failed Function Request rate limit (critical threshold)" - default = 10 + default = 10 } variable "conversion_errors_threshold_warning" { description = "Conversion errors limit (warning threshold)" - default = 0 + default = 0 } variable "conversion_errors_threshold_critical" { description = "Conversion errors limit (critical threshold)" - default = 10 + default = 10 } variable "runtime_errors_threshold_warning" { description = "Runtime errors limit (warning threshold)" - default = 0 + default = 0 } variable "runtime_errors_threshold_critical" { description = "Runtime errors limit (critical threshold)" - default = 10 + default = 10 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 7264e2f..1931eb2 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -13,7 +13,8 @@ resource "datadog_monitor" "status" { query = < ${var.su_utilization_threshold_critical} EOF - type = "metric alert" + + type = "metric alert" notify_no_data = false evaluation_delay = "${var.delay}" @@ -50,6 +52,7 @@ resource "datadog_monitor" "su_utilization" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + thresholds { warning = "${var.su_utilization_threshold_warning}" critical = "${var.su_utilization_threshold_critical}" @@ -68,7 +71,8 @@ resource "datadog_monitor" "failed_function_requests" { avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() ) * 100 > ${var.failed_function_requests_threshold_critical} EOF - type = "metric alert" + + type = "metric alert" notify_no_data = false evaluation_delay = "${var.delay}" @@ -80,6 +84,7 @@ resource "datadog_monitor" "failed_function_requests" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + thresholds { warning = "${var.function_requests_threshold_warning}" critical = "${var.failed_function_requests_threshold_critical}" @@ -97,7 +102,8 @@ resource "datadog_monitor" "conversion_errors" { avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {resource_group,region,name} ) > ${var.conversion_errors_threshold_critical} EOF - type = "metric alert" + + type = "metric alert" notify_no_data = false evaluation_delay = "${var.delay}" @@ -109,6 +115,7 @@ resource "datadog_monitor" "conversion_errors" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + thresholds { warning = "${var.conversion_errors_threshold_warning}" critical = "${var.conversion_errors_threshold_critical}" @@ -126,7 +133,8 @@ resource "datadog_monitor" "runtime_errors" { avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {resource_group,region,name} ) > ${var.runtime_errors_threshold_critical} EOF - type = "metric alert" + + type = "metric alert" notify_no_data = false evaluation_delay = "${var.delay}" @@ -138,6 +146,7 @@ resource "datadog_monitor" "runtime_errors" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + thresholds { warning = "${var.runtime_errors_threshold_warning}" critical = "${var.runtime_errors_threshold_critical}" diff --git a/incubator/monitors-gcp-cloud-sql.tf b/incubator/monitors-gcp-cloud-sql.tf index f3493a1..1b616d5 100644 --- a/incubator/monitors-gcp-cloud-sql.tf +++ b/incubator/monitors-gcp-cloud-sql.tf @@ -1,8 +1,8 @@ resource "datadog_monitor" "cloud_sql_cpu_90" { name = "Cloud SQL CPU high > 90%" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "avg(last_5m):avg:gcp.cloudsql.database.cpu.utilization{project_id:${var.project_id}} >= 90" - type = "query alert" + query = "avg(last_5m):avg:gcp.cloudsql.database.cpu.utilization{project_id:${var.project_id}} >= 90" + type = "query alert" notify_no_data = false renotify_interval = 60 @@ -18,9 +18,9 @@ resource "datadog_monitor" "cloud_sql_cpu_90" { } resource "datadog_monitor" "cloud_sql_disk_space" { - name = "Cloud SQL free disk space < 10%" + name = "Cloud SQL free disk space < 10%" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - query = "avg(last_5m):avg:gcp.cloudsql.database.disk.bytes_used{project_id:${var.project_id}} by {database_id} / avg:gcp.cloudsql.database.disk.quota{project_id:${var.project_id}} by {database_id} * 100 >= 90" + query = "avg(last_5m):avg:gcp.cloudsql.database.disk.bytes_used{project_id:${var.project_id}} by {database_id} / avg:gcp.cloudsql.database.disk.quota{project_id:${var.project_id}} by {database_id} * 100 >= 90" thresholds { warning = 70 @@ -42,60 +42,58 @@ resource "datadog_monitor" "cloud_sql_disk_space" { } resource "datadog_monitor" "cloud_sql_connection_80" { -name = "Cloud SQL MySQL connection > 80% of max connections" -message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" -query = "avg(last_5m):avg:gcp.cloudsql.database.network.connections{*} > 3500" -type = "metric alert" + name = "Cloud SQL MySQL connection > 80% of max connections" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "avg(last_5m):avg:gcp.cloudsql.database.network.connections{*} > 3500" + type = "metric alert" - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 } resource "datadog_monitor" "cloud_sql_lag" { -name = "Cloud SQL MySQL lag > 45min" -message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" -query = "min(last_10m):avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{*} by {database_id} > 2700" -type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 + name = "Cloud SQL MySQL lag > 45min" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "min(last_10m):avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{*} by {database_id} > 2700" + type = "metric alert" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 } resource "datadog_monitor" "cloud_sql_replication" { -name = "Cloud SQL Failover not ready to replication" -message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" -query = "max(last_5m):avg:gcp.cloudsql.database.mysql.replication.available_for_failover{*} <= 0" -type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 + name = "Cloud SQL Failover not ready to replication" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "max(last_5m):avg:gcp.cloudsql.database.mysql.replication.available_for_failover{*} <= 0" + type = "metric alert" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 } diff --git a/incubator/monitors-kubernetes.tf b/incubator/monitors-kubernetes.tf index 3fba84b..c006e52 100644 --- a/incubator/monitors-kubernetes.tf +++ b/incubator/monitors-kubernetes.tf @@ -1,7 +1,7 @@ resource "datadog_monitor" "kubernetes_cluster_cpu" { name = "Kubernetes cluster CPU High > 85%" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - query = "avg(last_5m):avg:system.cpu.system{*} by {cluster-name} + avg:system.cpu.user{*} by {cluster-name} > 85" + query = "avg(last_5m):avg:system.cpu.system{*} by {cluster-name} + avg:system.cpu.user{*} by {cluster-name} > 85" thresholds { warning = 75 @@ -25,7 +25,7 @@ resource "datadog_monitor" "kubernetes_cluster_cpu" { resource "datadog_monitor" "kubernetes_kubelet_check" { name = "Kubernetes kubelet check down" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - query = "\"kubernetes.kubelet.check\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()" + query = "\"kubernetes.kubelet.check\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()" thresholds { warning = 0 @@ -48,8 +48,10 @@ resource "datadog_monitor" "kubernetes_kubelet_check" { resource "datadog_monitor" "kubernetes_kubelet_ping" { name = "Kubernetes kubelet ping not ok" - message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" - query = "\"kubernetes.kubelet.check.ping\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" + query = "\"kubernetes.kubelet.check.ping\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()" thresholds { warning = 0 @@ -71,10 +73,10 @@ resource "datadog_monitor" "kubernetes_kubelet_ping" { } resource "datadog_monitor" "kubernetes_pods_unavailable" { - name = "Kubernetes pods unavailable" + name = "Kubernetes pods unavailable" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" query = "max(last_5m):avg:kubernetes_state.deployment.replicas_desired{!namespace:cronetes} by {cluster-name,namespace,deployment} - avg:kubernetes_state.deployment.replicas_unavailable{!namespace:cronetes} by {cluster-name,namespace,deployment} + 1 < 1" - type = "query alert" + type = "query alert" notify_no_data = false renotify_interval = 60 @@ -90,10 +92,10 @@ resource "datadog_monitor" "kubernetes_pods_unavailable" { } resource "datadog_monitor" "kubernetes_node_status" { - name = "Kubernetes node status" + name = "Kubernetes node status" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" query = "max(last_5m):avg:kubernetes_state.node.status{!namespace:cronetes} by {cluster-name,namespace,deployment} <= 0" - type = "metric alert" + type = "metric alert" notify_no_data = false renotify_interval = 60 @@ -108,7 +110,7 @@ resource "datadog_monitor" "kubernetes_node_status" { no_data_timeframe = 20 } - type = "query alert" +/* type = "query alert" thresholds { # warning = 75 @@ -126,4 +128,5 @@ resource "datadog_monitor" "kubernetes_node_status" { notify_no_data = false renotify_interval = 0 no_data_timeframe = 20 -} +}*/ + diff --git a/incubator/monitors-linux-basics.tf b/incubator/monitors-linux-basics.tf index d1fdf80..796a64f 100644 --- a/incubator/monitors-linux-basics.tf +++ b/incubator/monitors-linux-basics.tf @@ -25,7 +25,7 @@ resource "datadog_monitor" "cpu_95_5min" { query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,!dd_custom_cpu:enabled} by {host} > 95" type = "query alert" - count = "${var.linux-basics == "enabled" ? 1 : 0}" + count = "${var.linux-basics == "enabled" ? 1 : 0}" notify_no_data = false renotify_interval = 60 @@ -41,12 +41,12 @@ resource "datadog_monitor" "cpu_95_5min" { } resource "datadog_monitor" "datadog_free_disk_space_5" { - name = "Free disk space < 5%" + name = "Free disk space < 5%" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" - query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5" + query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5" type = "query alert" - count = "${var.linux-basics == "enabled" ? 1 : 0}" + count = "${var.linux-basics == "enabled" ? 1 : 0}" notify_no_data = false renotify_interval = 60 @@ -146,10 +146,10 @@ resource "datadog_monitor" "datadog_cpu_load" { } resource "datadog_monitor" "datadog_free_memory" { - name = "Free memory < 5%" + name = "Free memory < 5%" message = "Debugging alert - no escalation" query = "sum(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5" - type = "query alert" + type = "query alert" notify_no_data = false renotify_interval = 60 diff --git a/incubator/monitors-redis-containers.tf b/incubator/monitors-redis-containers.tf index f5bab16..34e9f80 100644 --- a/incubator/monitors-redis-containers.tf +++ b/incubator/monitors-redis-containers.tf @@ -1,11 +1,12 @@ resource "datadog_monitor" "kubernetes_redis_cpu_95_5min" { - name = "Kubernetes Redis container CPU High > 95% for 5 min" + name = "Kubernetes Redis container CPU High > 95% for 5 min" + #message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "avg(last_5m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 95" + query = "avg(last_5m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 95" thresholds { -# warning = 80 + # warning = 80 critical = 95 } @@ -24,7 +25,8 @@ resource "datadog_monitor" "kubernetes_redis_cpu_95_5min" { } resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" { - name = "Kubernetes Redis container CPU High > 80% for 15 min" + name = "Kubernetes Redis container CPU High > 80% for 15 min" + #message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" @@ -32,7 +34,7 @@ resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" { type = "query alert" thresholds { -# warning = 75 + # warning = 75 critical = 80 } @@ -72,3 +74,4 @@ resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" { # renotify_interval = 0 # no_data_timeframe = 20 # } + diff --git a/incubator/monitors-redis.tf b/incubator/monitors-redis.tf index eacd337..9efec26 100644 --- a/incubator/monitors-redis.tf +++ b/incubator/monitors-redis.tf @@ -1,7 +1,7 @@ resource "datadog_monitor" "redis_connection" { - name = "Redis connection is down (Datadog check)" + name = "Redis connection is down (Datadog check)" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" - query = "\"redis.can_connect\".over(\"app:redis\").by(\"*\").last(1).pct_by_status()" + query = "\"redis.can_connect\".over(\"app:redis\").by(\"*\").last(1).pct_by_status()" thresholds { critical = 50 @@ -22,66 +22,59 @@ resource "datadog_monitor" "redis_connection" { no_data_timeframe = 20 } - - resource "datadog_monitor" "redis_eviction" { -name = "Redis eviction > 0" -message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" -query = "min(last_5m):avg:redis.keys.evicted{*} > 0" -type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 + name = "Redis eviction > 0" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "min(last_5m):avg:redis.keys.evicted{*} > 0" + type = "metric alert" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 } - resource "datadog_monitor" "datadog_blocked_client" { -name = "Redis blocked clients > 0" -message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" -query = "min(last_5m):avg:redis.clients.blocked{*} > 0" -type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 + name = "Redis blocked clients > 0" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "min(last_5m):avg:redis.clients.blocked{*} > 0" + type = "metric alert" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 } - resource "datadog_monitor" "redis_swap" { -name = "Redis begin to swap" -message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" -query = "avg(last_5m):avg:redis.mem.fragmentation_ratio{*} <= 0.8" -type = "metric alert" - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 - no_data_timeframe = 20 + name = "Redis begin to swap" + message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" + query = "avg(last_5m):avg:redis.mem.fragmentation_ratio{*} <= 0.8" + type = "metric alert" + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 } diff --git a/inputs.tf b/inputs.tf index 3571c39..7fd4b4b 100644 --- a/inputs.tf +++ b/inputs.tf @@ -96,6 +96,7 @@ variable "elb_4xx_threshold" { variable "elb_backend_latency" { description = "Average time elapsed after the request leaves the load balancer until a response is received. In seconds" + default = { warning = 1 critical = 5 diff --git a/system/linux/monitors-linux-basics.tf b/system/linux/monitors-linux-basics.tf index 73f7ed7..459122d 100644 --- a/system/linux/monitors-linux-basics.tf +++ b/system/linux/monitors-linux-basics.tf @@ -54,7 +54,7 @@ resource "datadog_monitor" "datadog_free_disk_space_5" { name = "[${var.env}] Free disk space < 5% on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" - query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device} * 100 < 5" + query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device} * 100 < 5" type = "query alert" count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"