diff --git a/README.md b/README.md index 9937fb7..4439901 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,10 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [sql-database](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/sql-database/) - [storage](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/storage/) - [stream-analytics](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/stream-analytics/) + - [gcp](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/) + - [cloud-sql](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/cloud-sql/) + - [common](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/cloud-sql/common/) + - [mysql](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/cloud-sql/mysql/) - [common](https://bitbucket.org/morea/terraform.feature.datadog/src/master/common/) - [alerting-message](https://bitbucket.org/morea/terraform.feature.datadog/src/master/common/alerting-message/) - [filter-tags](https://bitbucket.org/morea/terraform.feature.datadog/src/master/common/filter-tags/) diff --git a/cloud/gcp/cloud-sql/common/README.md b/cloud/gcp/cloud-sql/common/README.md new file mode 100644 index 0000000..879b1b8 --- /dev/null +++ b/cloud/gcp/cloud-sql/common/README.md @@ -0,0 +1,103 @@ +# CLOUD GCP CLOUD-SQL COMMON DataDog monitors + +## How to use this module + +``` +module "datadog-monitors-cloud-gcp-cloud-sql-common" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/gcp/cloud-sql/common?ref={revision}" + + environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" +} + +``` + +## Purpose + +Creates DataDog monitors with the following checks: + +- Cloud SQL CPU Utilization +- Cloud SQL Disk Utilization forecast +- Cloud SQL Disk Utilization +- Cloud SQL Failover 
Unavailable +- Cloud SQL Memory Utilization forecast +- Cloud SQL Memory Utilization + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cpu_utilization_extra_tags | Extra tags for GCP Cloud SQL CPU Utilization monitor | list | `` | no | +| cpu_utilization_message | Custom message for the CPU Utilization monitor | string | `` | no | +| cpu_utilization_silenced | Groups to mute for GCP Cloud SQL CPU Utilization monitor | map | `` | no | +| cpu_utilization_threshold_critical | CPU Utilization in percentage (critical threshold) | string | `90` | no | +| cpu_utilization_threshold_warning | CPU Utilization in percentage (warning threshold) | string | `80` | no | +| cpu_utilization_time_aggregator | Time aggregator for the CPU Utilization monitor | string | `avg` | no | +| cpu_utilization_timeframe | Timeframe for the CPU Utilization monitor | string | `last_15m` | no | +| disk_utilization_extra_tags | Extra tags for GCP Cloud SQL Disk Utilization monitor | list | `` | no | +| disk_utilization_forecast_algorithm | Algorithm for the Disk Utilization Forecast monitor | string | `linear` | no | +| disk_utilization_forecast_deviations | Deviations for the Disk Utilization Forecast monitor | string | `1` | no | +| disk_utilization_forecast_extra_tags | Extra tags for GCP Cloud SQL Disk Utilization Forecast monitor | list | `` | no | +| disk_utilization_forecast_interval | Interval for the Disk Utilization Forecast monitor | string | `60m` | no | +| disk_utilization_forecast_linear_history | History for the Disk Utilization Forecast monitor | string | `3d` | no | +| disk_utilization_forecast_linear_model | Model for the Disk Utilization Forecast monitor | string | `default` | no | +| disk_utilization_forecast_message | Custom message for the Disk Utilization Forecast monitor | string | `` | no | +| disk_utilization_forecast_seasonal_seasonality | Seasonality for the Disk Utilization Forecast monitor | string | 
`weekly` | no | +| disk_utilization_forecast_silenced | Groups to mute for GCP Cloud SQL Disk Utilization Forecast monitor | map | `` | no | +| disk_utilization_forecast_threshold_critical | Disk Utilization Forecast in percentage (critical threshold) | string | `80` | no | +| disk_utilization_forecast_threshold_critical_recovery | Disk Utilization Forecast in percentage (recovery threshold) | string | `72` | no | +| disk_utilization_forecast_time_aggregator | Time aggregator for the Disk Utilization Forecast monitor | string | `max` | no | +| disk_utilization_forecast_timeframe | Timeframe for the Disk Utilization Forecast monitor | string | `next_1w` | no | +| disk_utilization_message | Custom message for the Disk Utilization monitor | string | `` | no | +| disk_utilization_silenced | Groups to mute for GCP Cloud SQL Disk Utilization monitor | map | `` | no | +| disk_utilization_threshold_critical | Disk Utilization in percentage (critical threshold) | string | `90` | no | +| disk_utilization_threshold_warning | Disk Utilization in percentage (warning threshold) | string | `80` | no | +| disk_utilization_time_aggregator | Time aggregator for the Disk Utilization monitor | string | `avg` | no | +| disk_utilization_timeframe | Timeframe for the Disk Utilization monitor | string | `last_5m` | no | +| environment | Architecture environment | string | - | yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no | +| failover_unavailable_extra_tags | Extra tags for GCP Cloud SQL Failover Unavailable monitor | list | `` | no | +| failover_unavailable_message | Custom message for the Failover Unavailable monitor | string | `` | no | +| failover_unavailable_silenced | Groups to mute for GCP Cloud SQL Failover Unavailable monitor | map | `` | no | +| failover_unavailable_threshold_critical | Failover Unavailable critical threshold | string | `0` | no | +| failover_unavailable_time_aggregator | Time aggregator for the Failover 
Unavailable monitor | string | `max` | no | +| failover_unavailable_timeframe | Timeframe for the Failover Unavailable monitor | string | `last_5m` | no | +| filter_tags | Tags used for filtering | string | `*` | no | +| memory_utilization_extra_tags | Extra tags for GCP Cloud SQL Memory Utilization monitor | list | `` | no | +| memory_utilization_forecast_algorithm | Algorithm for the Memory Utilization Forecast monitor | string | `linear` | no | +| memory_utilization_forecast_deviations | Deviations for the Memory Utilization Forecast monitor | string | `1` | no | +| memory_utilization_forecast_extra_tags | Extra tags for GCP Cloud SQL Memory Utilization Forecast monitor | list | `` | no | +| memory_utilization_forecast_interval | Interval for the Memory Utilization Forecast monitor | string | `30m` | no | +| memory_utilization_forecast_linear_history | History for the Memory Utilization Forecast monitor | string | `12h` | no | +| memory_utilization_forecast_linear_model | Model for the Memory Utilization Forecast monitor | string | `default` | no | +| memory_utilization_forecast_message | Custom message for the Memory Utilization Forecast monitor | string | `` | no | +| memory_utilization_forecast_seasonal_seasonality | Seasonality for the Memory Utilization Forecast monitor | string | `weekly` | no | +| memory_utilization_forecast_silenced | Groups to mute for GCP Cloud SQL Memory Utilization Forecast monitor | map | `` | no | +| memory_utilization_forecast_threshold_critical | Memory Utilization Forecast in percentage (critical threshold) | string | `90` | no | +| memory_utilization_forecast_threshold_critical_recovery | Memory Utilization Forecast in percentage (recovery threshold) | string | `81` | no | +| memory_utilization_forecast_time_aggregator | Time aggregator for the Memory Utilization Forecast monitor | string | `max` | no | +| memory_utilization_forecast_timeframe | Timeframe for the Memory Utilization Forecast monitor | string | `next_3d` | no | +| 
memory_utilization_message | Custom message for the Memory Utilization monitor | string | `` | no | +| memory_utilization_silenced | Groups to mute for GCP Cloud SQL Memory Utilization monitor | map | `` | no | +| memory_utilization_threshold_critical | Memory Utilization in percentage (critical threshold) | string | `90` | no | +| memory_utilization_threshold_warning | Memory Utilization in percentage (warning threshold) | string | `80` | no | +| memory_utilization_time_aggregator | Time aggregator for the Memory Utilization monitor | string | `avg` | no | +| memory_utilization_timeframe | Timeframe for the Memory Utilization monitor | string | `last_5m` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| new_host_delay | Delay in seconds for the new host evaluation | string | `300` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| cpu_utilization_id | id for monitor cpu_utilization | +| disk_utilization_forecast_id | id for monitor disk_utilization_forecast | +| disk_utilization_id | id for monitor disk_utilization | +| failover_unavailable_id | id for monitor failover_unavailable | +| memory_utilization_forecast_id | id for monitor memory_utilization_forecast | +| memory_utilization_id | id for monitor memory_utilization | + +## Related documentation + +* [GCP Metrics for CloudSQL](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-cloudsql) +* [Datadog Useful monitors for GCP CloudSQL](https://www.datadoghq.com/blog/monitor-google-cloud-sql/) diff --git a/cloud/gcp/cloud-sql/common/inputs.tf b/cloud/gcp/cloud-sql/common/inputs.tf new file mode 100644 index 0000000..e0a2ea8 --- /dev/null +++ b/cloud/gcp/cloud-sql/common/inputs.tf @@ -0,0 +1,358 @@ +# +# Datadog global variables +# +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags" { + description = "Tags used for filtering" + default = "*" +} + +variable "message" { + description = 
"Message sent when a monitor is triggered" +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +variable "new_host_delay" { + description = "Delay in seconds for the new host evaluation" + default = 300 +} + +# +# CPU +# + +variable "cpu_utilization_message" { + description = "Custom message for the CPU Utilization monitor" + type = "string" + default = "" +} + +variable "cpu_utilization_time_aggregator" { + description = "Time aggregator for the CPU Utilization monitor" + type = "string" + default = "avg" +} + +variable "cpu_utilization_timeframe" { + description = "Timeframe for the CPU Utilization monitor" + type = "string" + default = "last_15m" +} + +variable "cpu_utilization_threshold_warning" { + description = "CPU Utilization in percentage (warning threshold)" + type = "string" + default = 80 +} + +variable "cpu_utilization_threshold_critical" { + description = "CPU Utilization in percentage (critical threshold)" + type = "string" + default = 90 +} + +variable "cpu_utilization_silenced" { + description = "Groups to mute for GCP Cloud SQL CPU Utilization monitor" + type = "map" + default = {} +} + +variable "cpu_utilization_extra_tags" { + description = "Extra tags for GCP Cloud SQL CPU Utilization monitor" + type = "list" + default = [] +} + +# +# DISK Utilization +# + +variable "disk_utilization_message" { + description = "Custom message for the Disk Utilization monitor" + type = "string" + default = "" +} + +variable "disk_utilization_time_aggregator" { + description = "Time aggregator for the Disk Utilization monitor" + type = "string" + default = "avg" +} + +variable "disk_utilization_timeframe" { + description = "Timeframe for the Disk Utilization monitor" + type = "string" + default = "last_5m" +} + +variable "disk_utilization_threshold_warning" { + description = "Disk Utilization in percentage (warning threshold)" + type = "string" + default = 80 +} + +variable 
"disk_utilization_threshold_critical" { + description = "Disk Utilization in percentage (critical threshold)" + type = "string" + default = 90 +} + +variable "disk_utilization_silenced" { + description = "Groups to mute for GCP Cloud SQL Disk Utilization monitor" + type = "map" + default = {} +} + +variable "disk_utilization_extra_tags" { + description = "Extra tags for GCP Cloud SQL Disk Utilization monitor" + type = "list" + default = [] +} + +# +# DISK Utilization Forecast +# + +variable "disk_utilization_forecast_message" { + description = "Custom message for the Disk Utilization Forecast monitor" + type = "string" + default = "" +} + +variable "disk_utilization_forecast_time_aggregator" { + description = "Time aggregator for the Disk Utilization Forecast monitor" + type = "string" + default = "max" +} + +variable "disk_utilization_forecast_timeframe" { + description = "Timeframe for the Disk Utilization Forecast monitor" + type = "string" + default = "next_1w" +} + +variable "disk_utilization_forecast_algorithm" { + description = "Algorithm for the Disk Utilization Forecast monitor" + type = "string" + default = "linear" +} + +variable "disk_utilization_forecast_deviations" { + description = "Deviations for the Disk Utilization Forecast monitor" + type = "string" + default = 1 +} + +variable "disk_utilization_forecast_interval" { + description = "Interval for the Disk Utilization Forecast monitor" + type = "string" + default = "60m" +} + +variable "disk_utilization_forecast_linear_history" { + description = "History for the Disk Utilization Forecast monitor" + type = "string" + default = "3d" +} + +variable "disk_utilization_forecast_linear_model" { + description = "Model for the Disk Utilization Forecast monitor" + type = "string" + default = "default" +} + +variable "disk_utilization_forecast_seasonal_seasonality" { + description = "Seasonality for the Disk Utilization Forecast monitor" + type = "string" + default = "weekly" +} + +variable 
"disk_utilization_forecast_threshold_critical" { + description = "Disk Utilization Forecast in percentage (critical threshold)" + type = "string" + default = 80 +} + +variable "disk_utilization_forecast_threshold_critical_recovery" { + description = "Disk Utilization Forecast in percentage (recovery threshold)" + type = "string" + default = 72 +} + +variable "disk_utilization_forecast_silenced" { + description = "Groups to mute for GCP Cloud SQL Disk Utilization Forecast monitor" + type = "map" + default = {} +} + +variable "disk_utilization_forecast_extra_tags" { + description = "Extra tags for GCP Cloud SQL Disk Utilization Forecast monitor" + type = "list" + default = [] +} + +# +# Memory Utilization +# + +variable "memory_utilization_message" { + description = "Custom message for the Memory Utilization monitor" + default = "" +} + +variable "memory_utilization_time_aggregator" { + description = "Time aggregator for the Memory Utilization monitor" + default = "avg" +} + +variable "memory_utilization_timeframe" { + description = "Timeframe for the Memory Utilization monitor" + default = "last_5m" +} + +variable "memory_utilization_threshold_warning" { + description = "Memory Utilization in percentage (warning threshold)" + default = 80 +} + +variable "memory_utilization_threshold_critical" { + description = "Memory Utilization in percentage (critical threshold)" + default = 90 +} + +variable "memory_utilization_silenced" { + description = "Groups to mute for GCP Cloud SQL Memory Utilization monitor" + type = "map" + default = {} +} + +variable "memory_utilization_extra_tags" { + description = "Extra tags for GCP Cloud SQL Memory Utilization monitor" + type = "list" + default = [] +} + +# +# Memory Utilization Forecast +# + +variable "memory_utilization_forecast_message" { + description = "Custom message for the Memory Utilization Forecast monitor" + default = "" +} + +variable "memory_utilization_forecast_time_aggregator" { + description = "Time aggregator for 
the Memory Utilization Forecast monitor" + default = "max" +} + +variable "memory_utilization_forecast_timeframe" { + description = "Timeframe for the Memory Utilization Forecast monitor" + default = "next_3d" +} + +variable "memory_utilization_forecast_algorithm" { + description = "Algorithm for the Memory Utilization Forecast monitor" + type = "string" + default = "linear" +} + +variable "memory_utilization_forecast_deviations" { + description = "Deviations for the Memory Utilization Forecast monitor" + type = "string" + default = 1 +} + +variable "memory_utilization_forecast_interval" { + description = "Interval for the Memory Utilization Forecast monitor" + type = "string" + default = "30m" +} + +variable "memory_utilization_forecast_linear_history" { + description = "History for the Memory Utilization Forecast monitor" + type = "string" + default = "12h" +} + +variable "memory_utilization_forecast_linear_model" { + description = "Model for the Memory Utilization Forecast monitor" + type = "string" + default = "default" +} + +variable "memory_utilization_forecast_seasonal_seasonality" { + description = "Seasonality for the Memory Utilization Forecast monitor" + type = "string" + default = "weekly" +} + +variable "memory_utilization_forecast_threshold_critical" { + description = "Memory Utilization Forecast in percentage (critical threshold)" + default = 90 +} + +variable "memory_utilization_forecast_threshold_critical_recovery" { + description = "Memory Utilization Forecast in percentage (recovery threshold)" + default = 81 +} + +variable "memory_utilization_forecast_silenced" { + description = "Groups to mute for GCP Cloud SQL Memory Utilization Forecast monitor" + type = "map" + default = {} +} + +variable "memory_utilization_forecast_extra_tags" { + description = "Extra tags for GCP Cloud SQL Memory Utilization Forecast monitor" + type = "list" + default = [] +} + +# +# Failover Unavailable +# + +variable "failover_unavailable_message" { + description = 
"Custom message for the Failover Unavailable monitor" + type = "string" + default = "" +} + +variable "failover_unavailable_time_aggregator" { + description = "Time aggregator for the Failover Unavailable monitor" + type = "string" + default = "max" +} + +variable "failover_unavailable_timeframe" { + description = "Timeframe for the Failover Unavailable monitor" + type = "string" + default = "last_5m" +} + +variable "failover_unavailable_threshold_critical" { + description = "Failover Unavailable critical threshold" + type = "string" + default = 0 +} + +variable "failover_unavailable_silenced" { + description = "Groups to mute for GCP Cloud SQL Failover Unavailable monitor" + type = "map" + default = {} +} + +variable "failover_unavailable_extra_tags" { + description = "Extra tags for GCP Cloud SQL Failover Unavailable monitor" + type = "list" + default = [] +} diff --git a/cloud/gcp/cloud-sql/common/monitors-cloud-sql-common.tf b/cloud/gcp/cloud-sql/common/monitors-cloud-sql-common.tf new file mode 100644 index 0000000..f4c5c60 --- /dev/null +++ b/cloud/gcp/cloud-sql/common/monitors-cloud-sql-common.tf @@ -0,0 +1,232 @@ +# +# CPU Utilization +# +resource "datadog_monitor" "cpu_utilization" { + name = "[${var.environment}] Cloud SQL CPU Utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.cpu_utilization_message, var.message)}" + + type = "metric alert" + + query = < ${var.cpu_utilization_threshold_critical} +EOF + + thresholds { + warning = "${var.cpu_utilization_threshold_warning}" + critical = "${var.cpu_utilization_threshold_critical}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = true + renotify_interval = 0 + + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + silenced = 
"${var.cpu_utilization_silenced}" + + tags = ["env:${var.environment}", "type:cloud", "provider:gcp", "resource:cloud-sql", "team:claranet", "created-by:terraform", "${var.cpu_utilization_extra_tags}"] +} + +# +# Disk Utilization +# +resource "datadog_monitor" "disk_utilization" { + name = "[${var.environment}] Cloud SQL Disk Utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.disk_utilization_message, var.message)}" + + type = "metric alert" + + query = < ${var.disk_utilization_threshold_critical} +EOF + + thresholds { + warning = "${var.disk_utilization_threshold_warning}" + critical = "${var.disk_utilization_threshold_critical}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = true + renotify_interval = 0 + + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + silenced = "${var.disk_utilization_silenced}" + + tags = ["env:${var.environment}", "type:cloud", "provider:gcp", "resource:cloud-sql", "team:claranet", "created-by:terraform", "${var.disk_utilization_extra_tags}"] +} + +# +# Disk Utilization Forecast +# +resource "datadog_monitor" "disk_utilization_forecast" { + name = "[${var.environment}] Cloud SQL Disk Utilization could reach {{#is_alert}}{{threshold}}%{{/is_alert}} in a near future" + message = "${coalesce(var.disk_utilization_forecast_message, var.message)}" + + type = "query alert" + + query = <= ${var.disk_utilization_forecast_threshold_critical} +EOF + + thresholds { + critical = "${var.disk_utilization_forecast_threshold_critical}" + critical_recovery = "${var.disk_utilization_forecast_threshold_critical_recovery}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + 
evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + silenced = "${var.disk_utilization_forecast_silenced}" + + tags = ["env:${var.environment}", "type:cloud", "provider:gcp", "resource:cloud-sql", "team:claranet", "created-by:terraform", "${var.disk_utilization_forecast_extra_tags}"] +} + +# +# Memory Utilization +# +resource "datadog_monitor" "memory_utilization" { + name = "[${var.environment}] Cloud SQL Memory Utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.memory_utilization_message, var.message)}" + + type = "metric alert" + + query = < ${var.memory_utilization_threshold_critical} +EOF + + thresholds { + warning = "${var.memory_utilization_threshold_warning}" + critical = "${var.memory_utilization_threshold_critical}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = true + renotify_interval = 0 + + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + silenced = "${var.memory_utilization_silenced}" + + tags = ["env:${var.environment}", "type:cloud", "provider:gcp", "resource:cloud-sql", "team:claranet", "created-by:terraform", "${var.memory_utilization_extra_tags}"] +} + +# +# Memory Utilization Forecast +# +resource "datadog_monitor" "memory_utilization_forecast" { + name = "[${var.environment}] Cloud SQL Memory Utilization could reach {{#is_alert}}{{threshold}}%{{/is_alert}} in a near future" + message = "${coalesce(var.memory_utilization_forecast_message, var.message)}" + + type = "query alert" + + query = <= ${var.memory_utilization_forecast_threshold_critical} +EOF + + thresholds { + critical = "${var.memory_utilization_forecast_threshold_critical}" + critical_recovery = "${var.memory_utilization_forecast_threshold_critical_recovery}" + } + + notify_audit 
= false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + silenced = "${var.memory_utilization_forecast_silenced}" + + tags = ["env:${var.environment}", "type:cloud", "provider:gcp", "resource:cloud-sql", "team:claranet", "created-by:terraform", "${var.memory_utilization_forecast_extra_tags}"] +} + +# +# Failover Unavailable +# +resource "datadog_monitor" "failover_unavailable" { + name = "[${var.environment}] Cloud SQL Failover Unavailable" + message = "${coalesce(var.failover_unavailable_message, var.message)}" + + type = "metric alert" + + query = <` | no | +| replication_lag_message | Custom message for the Replication Lag monitor | string | `` | no | +| replication_lag_silenced | Groups to mute for GCP Cloud SQL Replication Lag monitor | map | `` | no | +| replication_lag_threshold_critical | Seconds behind the master (critical threshold) | string | `180` | no | +| replication_lag_threshold_warning | Seconds behind the master (warning threshold) | string | `90` | no | +| replication_lag_time_aggregator | Time aggregator for the Replication Lag monitor | string | `min` | no | +| replication_lag_timeframe | Timeframe for the Replication Lag monitor | string | `last_10m` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| replication_lag_id | id for monitor replication_lag | + +## Related documentation + +* [GCP Metrics for CloudSQL](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-cloudsql) +* [Datadog Useful monitors for GCP CloudSQL](https://www.datadoghq.com/blog/monitor-google-cloud-sql/) +* [Max connections depends on the type of the instance](https://cloud.google.com/sql/docs/quotas#fixed-limits) +* [Monitoring Replication Lag](https://cloud.google.com/sql/docs/mysql/high-availability#replication-lag-monitor) +* [Monitoring MySQL Performance 
Metrics](https://www.datadoghq.com/blog/monitoring-mysql-performance-metrics) diff --git a/cloud/gcp/cloud-sql/mysql/inputs.tf b/cloud/gcp/cloud-sql/mysql/inputs.tf new file mode 100644 index 0000000..0b3c567 --- /dev/null +++ b/cloud/gcp/cloud-sql/mysql/inputs.tf @@ -0,0 +1,72 @@ +# +# Datadog global variables +# +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags" { + description = "Tags used for filtering" + default = "*" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +variable "new_host_delay" { + description = "Delay in seconds for the new host evaluation" + default = 300 +} + +# +# Replication Lag +# + +variable "replication_lag_message" { + description = "Custom message for the Replication Lag monitor" + type = "string" + default = "" +} + +variable "replication_lag_time_aggregator" { + description = "Time aggregator for the Replication Lag monitor" + type = "string" + default = "min" +} + +variable "replication_lag_timeframe" { + description = "Timeframe for the Replication Lag monitor" + type = "string" + default = "last_10m" +} + +variable "replication_lag_threshold_warning" { + description = "Seconds behind the master (warning threshold)" + type = "string" + default = 90 +} + +variable "replication_lag_threshold_critical" { + description = "Seconds behind the master (critical threshold)" + type = "string" + default = 180 +} + +variable "replication_lag_silenced" { + description = "Groups to mute for GCP Cloud SQL Replication Lag monitor" + type = "map" + default = {} +} + +variable "replication_lag_extra_tags" { + description = "Extra tags for GCP Cloud SQL Replication Lag monitor" + type = "list" + default = [] +} diff --git a/cloud/gcp/cloud-sql/mysql/monitors-cloudsql-mysql.tf b/cloud/gcp/cloud-sql/mysql/monitors-cloudsql-mysql.tf new 
file mode 100644 index 0000000..0e0fd1c --- /dev/null +++ b/cloud/gcp/cloud-sql/mysql/monitors-cloudsql-mysql.tf @@ -0,0 +1,36 @@ +# +# Replication Lag +# +resource "datadog_monitor" "replication_lag" { + name = "[${var.environment}] Cloud SQL MySQL Replication Lag {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}" + message = "${coalesce(var.replication_lag_message, var.message)}" + + type = "metric alert" + + query = < ${var.replication_lag_threshold_critical} +EOF + + thresholds { + critical = "${var.replication_lag_threshold_critical}" + warning = "${var.replication_lag_threshold_warning}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = true + renotify_interval = 0 + + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + silenced = "${var.replication_lag_silenced}" + + tags = ["env:${var.environment}", "type:cloud", "provider:gcp", "resource:cloud-sql", "team:claranet", "created-by:terraform", "engine:mysql", "${var.replication_lag_extra_tags}"] +} diff --git a/cloud/gcp/cloud-sql/mysql/outputs.tf b/cloud/gcp/cloud-sql/mysql/outputs.tf new file mode 100644 index 0000000..e8e24fd --- /dev/null +++ b/cloud/gcp/cloud-sql/mysql/outputs.tf @@ -0,0 +1,4 @@ +output "replication_lag_id" { + description = "id for monitor replication_lag" + value = "${datadog_monitor.replication_lag.*.id}" +}