From 70a45ed9f68cd1bb3ed1ea380f1f62583053ad90 Mon Sep 17 00:00:00 2001 From: Rafael Romero Carmona Date: Thu, 14 Jun 2018 10:55:03 +0200 Subject: [PATCH] MON-224 Standard and recommended monitors with their inputs and readme --- cloud/gcp/cloud-sql/instance/README.md | 32 ++++ cloud/gcp/cloud-sql/instance/inputs.tf | 161 ++++++++++++++++++ .../instance/monitors-cloud-sql-instance.tf | 142 +++++++++++++++ 3 files changed, 335 insertions(+) create mode 100644 cloud/gcp/cloud-sql/instance/README.md create mode 100644 cloud/gcp/cloud-sql/instance/inputs.tf create mode 100644 cloud/gcp/cloud-sql/instance/monitors-cloud-sql-instance.tf diff --git a/cloud/gcp/cloud-sql/instance/README.md b/cloud/gcp/cloud-sql/instance/README.md new file mode 100644 index 0000000..805c302 --- /dev/null +++ b/cloud/gcp/cloud-sql/instance/README.md @@ -0,0 +1,32 @@ + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cpu_message | Custom message for the CPU Utilization monitor | string | `` | no | +| cpu_silenced | Groups to mute for GCP Cloud SQL CPU Utilization monitor | map | `` | no | +| cpu_tags | Tags to add to the CPU Utilization monitors | map | `` | no | +| cpu_threshold_critical | CPU Utilization in fraction (critical threshold) | string | `0.9` | no | +| cpu_threshold_warning | CPU Utilization in fraction (warning threshold) | string | `0.85` | no | +| cpu_timeframe | Timeframe for the CPU Utilization monitor | string | `last_2h` | no | +| database_id | ID of the Cloud SQL Database Instance | string | - | yes | +| delay | Delay in seconds for the metric evaluation | string | `900` | no | +| disk_message | Custom message for the Disk Utilization monitor | string | `` | no | +| disk_silenced | Groups to mute for GCP Cloud SQL Disk Utilization monitor | map | `` | no | +| disk_tags | Tags to add to the Disk Utilization monitors | map | `` | no | +| disk_threshold_critical | Disk Utilization in fraction (critical 
threshold) | string | `0.9` | no | +| disk_threshold_warning | Disk Utilization in fraction (warning threshold) | string | `0.8` | no | +| disk_timeframe | Timeframe for the Disk Utilization monitor | string | `last_5m` | no | +| environment | Architecture environment | string | - | yes | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| network_connections_hard_limit | Max number of network connections | string | - | yes | +| network_connections_message | Custom message for the Network Connections monitor | string | `` | no | +| network_connections_silenced | Groups to mute for GCP Cloud SQL Network Connections monitor | map | `` | no | +| network_connections_tags | Tags to add to the Network Connections monitors | map | `` | no | +| network_connections_threshold_critical | Fraction of network connections (critical threshold) | string | `0.9` | no | +| network_connections_threshold_warning | Fraction of network connections (warning threshold) | string | `0.8` | no | +| network_connections_timeframe | Timeframe for the Network Connections monitor | string | `last_5m` | no | +| project_id | ID of the GCP Project | string | - | yes | + diff --git a/cloud/gcp/cloud-sql/instance/inputs.tf b/cloud/gcp/cloud-sql/instance/inputs.tf new file mode 100644 index 0000000..6cb511b --- /dev/null +++ b/cloud/gcp/cloud-sql/instance/inputs.tf @@ -0,0 +1,161 @@ +# +# Datadog global variables +# +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable 
"message" { + description = "Message sent when a monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +# +# Filter variables +# +variable "project_id" { + type = "string" + description = "ID of the GCP Project" +} + +variable "database_id" { + type = "string" + description = "ID of the Cloud SQL Database Instance" +} + +# +# CPU +# +variable "cpu_message" { + description = "Custom message for the CPU Utilization monitor" + type = "string" + default = "" +} + +variable "cpu_timeframe" { + description = "Timeframe for the CPU Utilization monitor" + type = "string" + default = "last_2h" +} + +variable "cpu_threshold_warning" { + description = "CPU Utilization in fraction (warning threshold)" + type = "string" + default = 0.85 +} + +variable "cpu_threshold_critical" { + description = "CPU Utilization in fraction (critical threshold)" + type = "string" + default = 0.9 +} + +variable "cpu_silenced" { + description = "Groups to mute for GCP Cloud SQL CPU Utilization monitor" + type = "map" + default = {} +} + +variable "cpu_tags" { + description = "Tags to add to the CPU Utilization monitors" + type = "map" + default = {} +} + +# +# DISK +# +variable "disk_message" { + description = "Custom message for the Disk Utilization monitor" + type = "string" + default = "" +} + +variable "disk_timeframe" { + description = "Timeframe for the Disk Utilization monitor" + type = "string" + default = "last_5m" +} + +variable "disk_threshold_warning" { + description = "Disk Utilization in fraction (warning threshold)" + type = "string" + default = 0.8 +} + +variable "disk_threshold_critical" { + description = "Disk Utilization in fraction (critical threshold)" + type = "string" + default = 0.9 +} + +variable "disk_silenced" { + description = "Groups to mute for GCP Cloud SQL Disk Utilization monitor" + type = "map" + default = {} +} + +variable "disk_tags" { + description = "Tags to add to the Disk Utilization 
monitors" + type = "map" + default = {} +} + +# +# Network Connections +# +variable "network_connections_message" { + description = "Custom message for the Network Connections monitor" + type = "string" + default = "" +} + +variable "network_connections_timeframe" { + description = "Timeframe for the Network Connections monitor" + type = "string" + default = "last_5m" +} + +variable "network_connections_hard_limit" { + description = "Max number of network connections" + type = "string" +} + +variable "network_connections_threshold_warning" { + description = "Fraction of network connections (warning threshold)" + type = "string" + default = 0.8 +} + +variable "network_connections_threshold_critical" { + description = "Fraction of network connections (critical threshold)" + type = "string" + default = 0.9 +} + +variable "network_connections_silenced" { + description = "Groups to mute for GCP Cloud SQL Network Connections monitor" + type = "map" + default = {} +} + +variable "network_connections_tags" { + description = "Tags to add to the Network Connections monitors" + type = "map" + default = {} +} diff --git a/cloud/gcp/cloud-sql/instance/monitors-cloud-sql-instance.tf b/cloud/gcp/cloud-sql/instance/monitors-cloud-sql-instance.tf new file mode 100644 index 0000000..73a51fc --- /dev/null +++ b/cloud/gcp/cloud-sql/instance/monitors-cloud-sql-instance.tf @@ -0,0 +1,142 @@ +# +# FILTER +# +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
+ format("project_id:%s", var.project_id) : + "${var.filter_tags_custom}"}" + } +} + +# +# CPU Utilization +# +resource "datadog_monitor" "cpu_utilization" { + name = "[${var.environment}] Cloud SQL CPU utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.cpu_message, var.message)}" + + type = "metric alert" + + query = < ${var.cpu_threshold_critical} +EOF + + thresholds { + warning = "${var.cpu_threshold_warning}" + critical = "${var.cpu_threshold_critical}" + } + + include_tags = true + notify_no_data = true + require_full_window = false + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + silenced = "${var.cpu_silenced}" + + tags = [ + "team:gcp", + "provider:gcp", + "env:${var.environment}", + "resource:cloud-sql", + "${var.cpu_tags}", + ] +} + +# +# Disk Utilization +# +resource "datadog_monitor" "disk_utilization" { + name = "[${var.environment}] Cloud SQL Disk utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.disk_message, var.message)}" + + type = "metric alert" + + query = < ${var.disk_threshold_critical} +EOF + + thresholds { + warning = "${var.disk_threshold_warning}" + critical = "${var.disk_threshold_critical}" + } + + include_tags = true + notify_no_data = true + require_full_window = false + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + silenced = "${var.disk_silenced}" + + tags = [ + "team:gcp", + "provider:gcp", + "env:${var.environment}", + "resource:cloud-sql", + "${var.disk_tags}", + ] +} + +# +# Network Connections +# +resource 
"datadog_monitor" "network_connections" { + name = "[${var.environment}] Cloud SQL Network Connections (hard limit: ${var.network_connections_hard_limit}) {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.network_connections_message, var.message)}" + + type = "metric alert" + + query = < ${var.network_connections_threshold_critical} +EOF + + thresholds { + warning = "${var.network_connections_threshold_warning}" + critical = "${var.network_connections_threshold_critical}" + } + + include_tags = true + notify_no_data = true + require_full_window = false + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + silenced = "${var.network_connections_silenced}" + + tags = [ + "team:gcp", + "provider:gcp", + "env:${var.environment}", + "resource:cloud-sql", + "${var.network_connections_tags}", + ] +}