From 54a996e8b769e16a185db02a4f4523e75e9e8fa1 Mon Sep 17 00:00:00 2001 From: Rafael Romero Carmona Date: Fri, 27 Jul 2018 18:15:43 +0200 Subject: [PATCH] MON-224 Monitors imported from the Actualys ones using the exported json --- cloud/gcp/cloud-sql/instance/inputs.tf | 79 ++++++--- .../instance/monitors-cloud-sql-instance.tf | 167 ++++++++++++------ cloud/gcp/cloud-sql/mysql/inputs.tf | 24 ++- .../mysql/monitors-cloudsql-mysql.tf | 97 +++++----- 4 files changed, 243 insertions(+), 124 deletions(-) diff --git a/cloud/gcp/cloud-sql/instance/inputs.tf b/cloud/gcp/cloud-sql/instance/inputs.tf index 3d7370d..845f679 100644 --- a/cloud/gcp/cloud-sql/instance/inputs.tf +++ b/cloud/gcp/cloud-sql/instance/inputs.tf @@ -45,19 +45,19 @@ variable "cpu_utilization_message" { variable "cpu_utilization_timeframe" { description = "Timeframe for the CPU Utilization monitor" type = "string" - default = "last_30m" + default = "last_15m" } variable "cpu_utilization_threshold_warning" { - description = "CPU Utilization in fraction (warning threshold)" + description = "CPU Utilization in percentage (warning threshold)" type = "string" - default = 0.8 + default = 80 } variable "cpu_utilization_threshold_critical" { - description = "CPU Utilization in fraction (critical threshold)" + description = "CPU Utilization in percentage (critical threshold)" type = "string" - default = 0.9 + default = 90 } variable "cpu_utilization_silenced" { @@ -73,7 +73,7 @@ variable "cpu_utilization_extra_tags" { } # -# DISK +# DISK Utilization # variable "disk_utilization_message" { description = "Custom message for the Disk Utilization monitor" @@ -88,15 +88,15 @@ variable "disk_utilization_timeframe" { } variable "disk_utilization_threshold_warning" { - description = "Disk Utilization in fraction (warning threshold)" + description = "Disk Utilization in percentage (warning threshold)" type = "string" - default = 0.8 + default = 80 } variable "disk_utilization_threshold_critical" { - description = "Disk Utilization in fraction (critical threshold)" + description = "Disk Utilization in percentage (critical threshold)" type = "string" - default = 0.9 + default = 90 } variable "disk_utilization_silenced" { @@ -111,6 +111,45 @@ variable "disk_utilization_extra_tags" { default = [] } +# +# DISK Utilization Forecast +# +variable "disk_utilization_forecast_message" { + description = "Custom message for the Disk Utilization monitor" + type = "string" + default = "" +} + +variable "disk_utilization_forecast_timeframe" { + description = "Timeframe for the Disk Utilization monitor" + type = "string" + default = "next_1w" +} + +variable "disk_utilization_forecast_threshold_critical" { + description = "Disk Utilization in percentage (critical threshold)" + type = "string" + default = 80 +} + +variable "disk_utilization_forecast_threshold_critical_recovery" { + description = "Disk Utilization in percentage (recovery threshold)" + type = "string" + default = 72 +} + +variable "disk_utilization_forecast_silenced" { + description = "Groups to mute for GCP Cloud SQL Disk Utilization monitor" + type = "map" + default = {} +} + +variable "disk_utilization_forecast_extra_tags" { + description = "Extra tags for GCP Cloud SQL CPU Utilization monitor" + type = "list" + default = [] +} + # # Memory Utilization # @@ -125,13 +164,13 @@ variable "memory_utilization_timeframe" { } variable "memory_utilization_threshold_warning" { - description = "Memory Utilization in fraction (warning threshold)" - default = 0.8 + description = "Memory Utilization in percentage (warning threshold)" + default = 80 } variable "memory_utilization_threshold_critical" { - description = "Memory Utilization in fraction (critical threshold)" - default = 0.9 + description = "Memory Utilization in percentage (critical threshold)" + default = 90 } variable "memory_utilization_silenced" { @@ -169,14 +208,14 @@ variable "memory_utilization_forecast_history" { default = "12h" } -variable "memory_utilization_forecast_threshold_warning" { - description = "Memory Utilization Forecast in fraction (warning threshold)" - default = 0.8 +variable "memory_utilization_forecast_threshold_critical" { + description = "Memory Utilization Forecast in percentage (warning threshold)" + default = 90 } -variable "memory_utilization_forecast_threshold_critical" { - description = "Memory Utilization Forecast in fraction (critical threshold)" - default = 0.9 +variable "memory_utilization_forecast_threshold_critical_recovery" { + description = "Memory Utilization Forecast in percentage (recovery threshold)" + default = 81 } variable "memory_utilization_forecast_silenced" { diff --git a/cloud/gcp/cloud-sql/instance/monitors-cloud-sql-instance.tf b/cloud/gcp/cloud-sql/instance/monitors-cloud-sql-instance.tf index 54a5f91..e326c01 100644 --- a/cloud/gcp/cloud-sql/instance/monitors-cloud-sql-instance.tf +++ b/cloud/gcp/cloud-sql/instance/monitors-cloud-sql-instance.tf @@ -15,15 +15,15 @@ data "template_file" "filter" { # CPU Utilization # resource "datadog_monitor" "cpu_utilization" { - name = "[${var.environment}] Cloud SQL CPU Utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] Cloud SQL CPU utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.cpu_utilization_message, var.message)}" type = "metric alert" query = < ${var.cpu_utilization_threshold_critical} EOF @@ -32,16 +32,19 @@ EOF critical = "${var.cpu_utilization_threshold_critical}" } - notify_no_data = true - require_full_window = false - renotify_interval = 0 notify_audit = false + locked = false timeout_h = 0 include_tags = true - locked = false - evaluation_delay = "${var.delay}" - new_host_delay = "${var.delay}" - silenced = "${var.cpu_utilization_silenced}" + no_data_timeframe = 30 + require_full_window = false + notify_no_data = true + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.cpu_utilization_silenced}" tags = [ "team:gcp", @@ -56,7 +59,7 @@ EOF # Disk Utilization # resource "datadog_monitor" "disk_utilization" { - name = "[${var.environment}] Cloud SQL Disk Utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] Cloud SQL Disk utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.disk_utilization_message, var.message)}" type = "metric alert" @@ -64,7 +67,7 @@ resource "datadog_monitor" "disk_utilization" { query = < ${var.disk_utilization_threshold_critical} EOF @@ -73,16 +76,19 @@ EOF critical = "${var.disk_utilization_threshold_critical}" } - notify_no_data = true - require_full_window = false - renotify_interval = 0 notify_audit = false + locked = false timeout_h = 0 include_tags = true - locked = false - evaluation_delay = "${var.delay}" - new_host_delay = "${var.delay}" - silenced = "${var.disk_utilization_silenced}" + no_data_timeframe = 20 + require_full_window = false + notify_no_data = true + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.disk_utilization_silenced}" tags = [ "team:gcp", @@ -93,6 +99,55 @@ EOF ] } +# +# Disk Utilization Forecast +# +resource "datadog_monitor" "disk_utilization_forecast" { + name = "[${var.environment}] Cloud SQL Disk utilization could reach {{#is_alert}}{{threshold}}%{{/is_alert}} in a near future" + message = "${coalesce(var.disk_utilization_forecast_message, var.message)}" + + type = "metric alert" + + query = <= ${var.disk_utilization_forecast_threshold_critical} +EOF + + thresholds { + critical = "${var.disk_utilization_forecast_threshold_critical}" + critical_recovery = "${var.disk_utilization_forecast_threshold_critical_recovery}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.disk_utilization_forecast_silenced}" + + tags = [ + "team:gcp", + "provider:gcp", + "env:${var.environment}", + "resource:cloud-sql", + "${var.disk_utilization_forecast_extra_tags}", + ] +} + # # Memory Utilization # @@ -105,8 +160,8 @@ resource "datadog_monitor" "memory_utilization" { query = < ${var.memory_utilization_threshold_critical} + by {database_id} * 100 + > ${var.memory_utilization_threshold_critical} EOF thresholds { @@ -114,16 +169,19 @@ EOF critical = "${var.memory_utilization_threshold_critical}" } - notify_no_data = true - require_full_window = false - renotify_interval = 0 notify_audit = false + locked = false timeout_h = 0 include_tags = true - locked = false - evaluation_delay = "${var.delay}" - new_host_delay = "${var.delay}" - silenced = "${var.memory_utilization_silenced}" + no_data_timeframe = 20 + require_full_window = false + notify_no_data = true + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.memory_utilization_silenced}" tags = [ "team:gcp", @@ -138,7 +196,7 @@ EOF # Memory Utilization Forecast # resource "datadog_monitor" "memory_utilization_forecast" { - name = "[${var.environment}] Cloud SQL Memory Utilization Forecast {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] Cloud SQL Memory Utilization could reach {{#is_alert}}{{threshold}}%{{/is_alert}} in a near future" message = "${coalesce(var.memory_utilization_forecast_message, var.message)}" type = "query alert" @@ -146,31 +204,33 @@ resource "datadog_monitor" "memory_utilization_forecast" { query = < ${var.memory_utilization_forecast_threshold_critical} + >= ${var.memory_utilization_forecast_threshold_critical} EOF thresholds { - warning = "${var.memory_utilization_forecast_threshold_warning}" - critical = "${var.memory_utilization_forecast_threshold_critical}" + critical = "${var.memory_utilization_forecast_threshold_critical}" + critical_recovery = "${var.memory_utilization_forecast_threshold_critical_recovery}" } - notify_no_data = true - require_full_window = false - renotify_interval = 0 notify_audit = false + locked = false timeout_h = 0 include_tags = true - locked = false - evaluation_delay = "${var.delay}" - new_host_delay = "${var.delay}" - silenced = "${var.memory_utilization_forecast_silenced}" + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.memory_utilization_forecast_silenced}" tags = [ "team:gcp", @@ -185,32 +245,35 @@ EOF # Failover Unavailable # resource "datadog_monitor" "failover_unavailable" { - name = "[${var.environment}] Cloud SQL Failover Unavailable {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] Cloud SQL Failover Unavailable" message = "${coalesce(var.failover_unavailable_message, var.message)}" type = "metric alert" query = < ${var.replication_lag_threshold_critical} + min(${var.replication_lag_timeframe}): + avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{${data.template_file.filter.rendered}} + by {database_id} + > ${var.replication_lag_threshold_critical} EOF thresholds { @@ -74,16 +74,19 @@ EOF warning = "${var.replication_lag_threshold_warning}" } - notify_no_data = true - require_full_window = false - renotify_interval = 0 notify_audit = false + locked = false timeout_h = 0 include_tags = true - locked = false - evaluation_delay = "${var.delay}" - new_host_delay = "${var.delay}" - silenced = "${var.replication_lag_silenced}" + no_data_timeframe = 25 + require_full_window = false + notify_no_data = true + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.replication_lag_silenced}" tags = [ "team:gcp", @@ -99,9 +102,7 @@ EOF # Queries Anomaly # resource "datadog_monitor" "queries_changing_anomaly" { - count = "${length(var.queries_changing_database_ids)}" - - name = "[${var.environment}] [${var.queries_changing_database_ids[count.index]}] Cloud SQL MySQL Queries Count changed abnormally {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] Cloud SQL MySQL Queries Count changed abnormally" message = "${coalesce(var.queries_changing_message, var.message)}" type = "query alert" @@ -109,32 +110,36 @@ resource "datadog_monitor" "queries_changing_anomaly" { query = < ${var.queries_changing_threshold_critical} EOF thresholds { - warning = "${var.queries_changing_threshold_warning}" - critical = "${var.queries_changing_threshold_critical}" + warning = "${var.queries_changing_threshold_warning}" + critical = "${var.queries_changing_threshold_critical}" + critical_recovery = "${var.queries_changing_threshold_critical_recovery}" } - notify_no_data = false - require_full_window = false - renotify_interval = 0 notify_audit = false + locked = false timeout_h = 0 include_tags = true - locked = false - evaluation_delay = "${var.delay}" - new_host_delay = "${var.delay}" - silenced = "${var.queries_changing_silenced}" + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.queries_changing_silenced}" tags = [ "team:gcp", @@ -142,7 +147,6 @@ EOF "env:${var.environment}", "resource:cloud-sql", "engine:mysql", - "database_id:${var.project_id}:${var.queries_changing_database_ids[count.index]}}", "${var.queries_changing_extra_tags}", ] } @@ -151,42 +155,44 @@ EOF # Questions Anomaly # resource "datadog_monitor" "questions_changing_anomaly" { - count = "${length(var.questions_changing_database_ids)}" - - name = "[${var.environment}] [${var.questions_changing_database_ids[count.index]}] Cloud SQL MySQL Questions Count changed abnormally {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] Cloud SQL MySQL Questions Count changed abnormally" message = "${coalesce(var.questions_changing_message, var.message)}" type = "query alert" query = < ${var.questions_changing_threshold_critical} + > ${var.questions_changing_threshold_critical} EOF thresholds { - warning = "${var.questions_changing_threshold_warning}" - critical = "${var.questions_changing_threshold_critical}" + warning = "${var.questions_changing_threshold_warning}" + critical = "${var.questions_changing_threshold_critical}" + critical_recovery = "${var.questions_changing_threshold_critical_recovery}" } - notify_no_data = false - require_full_window = false - renotify_interval = 0 notify_audit = false + locked = false timeout_h = 0 include_tags = true - locked = false - evaluation_delay = "${var.delay}" - new_host_delay = "${var.delay}" - silenced = "${var.questions_changing_silenced}" + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.questions_changing_silenced}" tags = [ "team:gcp", @@ -194,7 +200,6 @@ EOF "env:${var.environment}", "resource:cloud-sql", "engine:mysql", - "database_id:${var.project_id}:${var.questions_changing_database_ids[count.index]}", "${var.questions_changing_extra_tags}", ] }