From 819dc38a0721c2cc45bfe82a51c2a87f2aedb55c Mon Sep 17 00:00:00 2001
From: Maxime de Roucy
Date: Thu, 28 Mar 2019 16:22:59 +0100
Subject: [PATCH] MON-424 cloud/gcp/gce/instance: init (cpu utilization + disk OPS throttled)

---
NOTE(review): this copy of the patch arrived with its line breaks collapsed.
The layout below is a reconstruction: indentation, blank lines and the hunk
line counts should be checked against the original commit before `git am`.

NOTE(review): the `From:` header carries no e-mail address, and each of the
three `query = <` assignments in monitors-gce-instance.tf is missing everything
between the heredoc opener and the threshold expression. This looks like
angle-bracket content stripped during extraction; restore it from the commit
rather than guessing the metric query.

NOTE(review): the cpu_utilization monitor hard-codes `notify_no_data = true`,
whereas the two disk monitors read a `*_notify_no_data` variable, and the
README lists no such input for CPU. Confirm the asymmetry is intended.

NOTE(review): the subject mentions only "disk OPS throttled", but the patch
also adds a Disk Throttled Bps monitor.

 README.md                                     |   2 +
 cloud/gcp/gce/instance/README.md              |  70 +++++++
 cloud/gcp/gce/instance/inputs.tf              | 194 ++++++++++++++++++
 .../gcp/gce/instance/monitors-gce-instance.tf | 122 +++++++++++
 cloud/gcp/gce/instance/outputs.tf             |  14 ++
 5 files changed, 402 insertions(+)
 create mode 100644 cloud/gcp/gce/instance/README.md
 create mode 100644 cloud/gcp/gce/instance/inputs.tf
 create mode 100644 cloud/gcp/gce/instance/monitors-gce-instance.tf
 create mode 100644 cloud/gcp/gce/instance/outputs.tf

diff --git a/README.md b/README.md
index 7d5d678..f6bc55d 100644
--- a/README.md
+++ b/README.md
@@ -116,6 +116,8 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
     - [cloud-sql](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/cloud-sql/)
       - [common](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/cloud-sql/common/)
       - [mysql](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/cloud-sql/mysql/)
+    - [gce](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/gce/)
+      - [instance](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/gce/instance/)
     - [lb](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/lb/)
     - [pubsub](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/pubsub/)
 - [common](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/common/)
diff --git a/cloud/gcp/gce/instance/README.md b/cloud/gcp/gce/instance/README.md
new file mode 100644
index 0000000..42ea323
--- /dev/null
+++ b/cloud/gcp/gce/instance/README.md
@@ -0,0 +1,70 @@
+# CLOUD GCP GCE INSTANCE DataDog monitors
+
+## How to use this module
+
+```
+module "datadog-monitors-cloud-gcp-gce-instance" {
+  source = "git::ssh://git@git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors.git//cloud/gcp/gce/instance?ref={revision}"
+
+  environment = "${var.environment}"
+  message = "${module.datadog-message-alerting.alerting-message}"
+}
+
+```
+
+## Purpose
+
+Creates DataDog monitors with the following checks:
+
+- Compute Engine instance CPU Utilization
+- Compute Engine instance Disk Throttled Bps
+- Compute Engine instance Disk Throttled OPS
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|:----:|:-----:|:-----:|
+| cpu\_utilization\_enabled | Flag to enable CPU Utilization monitor | string | `"true"` | no |
+| cpu\_utilization\_extra\_tags | Extra tags for CPU Utilization monitor | list | `[]` | no |
+| cpu\_utilization\_message | Custom message for the CPU Utilization monitor | string | `""` | no |
+| cpu\_utilization\_silenced | Groups to mute for CPU Utilization monitor | map | `{}` | no |
+| cpu\_utilization\_threshold\_critical | CPU Utilization in percentage (critical threshold) | string | `"90"` | no |
+| cpu\_utilization\_threshold\_warning | CPU Utilization in percentage (warning threshold) | string | `"80"` | no |
+| cpu\_utilization\_time\_aggregator | Time aggregator for the CPU Utilization monitor | string | `"avg"` | no |
+| cpu\_utilization\_timeframe | Timeframe for the CPU Utilization monitor | string | `"last_15m"` | no |
+| disk\_throttled\_bps\_enabled | Flag to enable Disk Throttled Bps monitor | string | `"true"` | no |
+| disk\_throttled\_bps\_extra\_tags | Extra tags for Disk Throttled Bps monitor | list | `[]` | no |
+| disk\_throttled\_bps\_message | Custom message for the Disk Throttled Bps monitor | string | `""` | no |
+| disk\_throttled\_bps\_notify\_no\_data | Flag to enable notification for no data on Disk Throttled Bps monitor | string | `"false"` | no |
+| disk\_throttled\_bps\_silenced | Groups to mute for Disk Throttled Bps monitor | map | `{}` | no |
+| disk\_throttled\_bps\_threshold\_critical | Disk Throttled Bps in percentage (critical threshold) | string | `"50"` | no |
+| disk\_throttled\_bps\_threshold\_warning | Disk Throttled Bps in percentage (warning threshold) | string | `"30"` | no |
+| disk\_throttled\_bps\_time\_aggregator | Time aggregator for the Disk Throttled Bps monitor | string | `"min"` | no |
+| disk\_throttled\_bps\_timeframe | Timeframe for the Disk Throttled Bps monitor | string | `"last_15m"` | no |
+| disk\_throttled\_ops\_enabled | Flag to enable Disk Throttled OPS monitor | string | `"true"` | no |
+| disk\_throttled\_ops\_extra\_tags | Extra tags for Disk Throttled OPS monitor | list | `[]` | no |
+| disk\_throttled\_ops\_message | Custom message for the Disk Throttled OPS monitor | string | `""` | no |
+| disk\_throttled\_ops\_notify\_no\_data | Flag to enable notification for no data on Disk Throttled OPS monitor | string | `"false"` | no |
+| disk\_throttled\_ops\_silenced | Groups to mute for Disk Throttled OPS monitor | map | `{}` | no |
+| disk\_throttled\_ops\_threshold\_critical | Disk Throttled OPS in percentage (critical threshold) | string | `"50"` | no |
+| disk\_throttled\_ops\_threshold\_warning | Disk Throttled OPS in percentage (warning threshold) | string | `"30"` | no |
+| disk\_throttled\_ops\_time\_aggregator | Time aggregator for the Disk Throttled OPS monitor | string | `"min"` | no |
+| disk\_throttled\_ops\_timeframe | Timeframe for the Disk Throttled OPS monitor | string | `"last_15m"` | no |
+| environment | Architecture environment | string | n/a | yes |
+| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
+| filter\_tags | Tags used for filtering | string | `"*"` | no |
+| message | Message sent when a monitor is triggered | string | n/a | yes |
+| new\_host\_delay | Delay in seconds for the new host evaluation | string | `"300"` | no |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| cpu\_utilization\_id | id for monitor cpu_utilization |
+| disk\_throttled\_bps\_id | id for monitor disk_throttled_bps |
+| disk\_throttled\_ops\_id | id for monitor disk_throttled_ops |
+
+## Related documentation
+
+* [Datadog GCE Instance metrics](https://www.datadoghq.com/blog/monitoring-google-compute-engine-performance/#instance-metrics)
+* [GCP Maximum OPS and Bps by device type and size](https://cloud.google.com/compute/docs/disks/)
diff --git a/cloud/gcp/gce/instance/inputs.tf b/cloud/gcp/gce/instance/inputs.tf
new file mode 100644
index 0000000..300a5b1
--- /dev/null
+++ b/cloud/gcp/gce/instance/inputs.tf
@@ -0,0 +1,194 @@
+#
+# Datadog global variables
+#
+variable "environment" {
+  description = "Architecture environment"
+  type        = "string"
+}
+
+variable "filter_tags" {
+  description = "Tags used for filtering"
+  default     = "*"
+}
+
+variable "message" {
+  description = "Message sent when a monitor is triggered"
+}
+
+variable "evaluation_delay" {
+  description = "Delay in seconds for the metric evaluation"
+  default     = 900
+}
+
+variable "new_host_delay" {
+  description = "Delay in seconds for the new host evaluation"
+  default     = 300
+}
+
+#
+# CPU
+#
+
+variable "cpu_utilization_message" {
+  description = "Custom message for the CPU Utilization monitor"
+  type        = "string"
+  default     = ""
+}
+
+variable "cpu_utilization_time_aggregator" {
+  description = "Time aggregator for the CPU Utilization monitor"
+  type        = "string"
+  default     = "avg"
+}
+
+variable "cpu_utilization_timeframe" {
+  description = "Timeframe for the CPU Utilization monitor"
+  type        = "string"
+  default     = "last_15m"
+}
+
+variable "cpu_utilization_threshold_warning" {
+  description = "CPU Utilization in percentage (warning threshold)"
+  type        = "string"
+  default     = 80
+}
+
+variable "cpu_utilization_threshold_critical" {
+  description = "CPU Utilization in percentage (critical threshold)"
+  type        = "string"
+  default     = 90
+}
+
+variable "cpu_utilization_silenced" {
+  description = "Groups to mute for CPU Utilization monitor"
+  type        = "map"
+  default     = {}
+}
+
+variable "cpu_utilization_enabled" {
+  description = "Flag to enable CPU Utilization monitor"
+  type        = "string"
+  default     = "true"
+}
+
+variable "cpu_utilization_extra_tags" {
+  description = "Extra tags for CPU Utilization monitor"
+  type        = "list"
+  default     = []
+}
+
+#
+# Disk Throttled Bps
+#
+
+variable "disk_throttled_bps_message" {
+  description = "Custom message for the Disk Throttled Bps monitor"
+  type        = "string"
+  default     = ""
+}
+
+variable "disk_throttled_bps_time_aggregator" {
+  description = "Time aggregator for the Disk Throttled Bps monitor"
+  type        = "string"
+  default     = "min"
+}
+
+variable "disk_throttled_bps_timeframe" {
+  description = "Timeframe for the Disk Throttled Bps monitor"
+  type        = "string"
+  default     = "last_15m"
+}
+
+variable "disk_throttled_bps_threshold_warning" {
+  description = "Disk Throttled Bps in percentage (warning threshold)"
+  type        = "string"
+  default     = 30
+}
+
+variable "disk_throttled_bps_threshold_critical" {
+  description = "Disk Throttled Bps in percentage (critical threshold)"
+  type        = "string"
+  default     = 50
+}
+
+variable "disk_throttled_bps_silenced" {
+  description = "Groups to mute for Disk Throttled Bps monitor"
+  type        = "map"
+  default     = {}
+}
+
+variable "disk_throttled_bps_enabled" {
+  description = "Flag to enable Disk Throttled Bps monitor"
+  type        = "string"
+  default     = "true"
+}
+
+variable "disk_throttled_bps_extra_tags" {
+  description = "Extra tags for Disk Throttled Bps monitor"
+  type        = "list"
+  default     = []
+}
+
+variable "disk_throttled_bps_notify_no_data" {
+  description = "Flag to enable notification for no data on Disk Throttled Bps monitor"
+  type        = "string"
+  default     = "false"
+}
+
+#
+# Disk Throttled OPS
+#
+
+variable "disk_throttled_ops_message" {
+  description = "Custom message for the Disk Throttled OPS monitor"
+  type        = "string"
+  default     = ""
+}
+
+variable "disk_throttled_ops_time_aggregator" {
+  description = "Time aggregator for the Disk Throttled OPS monitor"
+  type        = "string"
+  default     = "min"
+}
+
+variable "disk_throttled_ops_timeframe" {
+  description = "Timeframe for the Disk Throttled OPS monitor"
+  type        = "string"
+  default     = "last_15m"
+}
+
+variable "disk_throttled_ops_threshold_warning" {
+  description = "Disk Throttled OPS in percentage (warning threshold)"
+  type        = "string"
+  default     = 30
+}
+
+variable "disk_throttled_ops_threshold_critical" {
+  description = "Disk Throttled OPS in percentage (critical threshold)"
+  type        = "string"
+  default     = 50
+}
+
+variable "disk_throttled_ops_silenced" {
+  description = "Groups to mute for Disk Throttled OPS monitor"
+  type        = "map"
+  default     = {}
+}
+
+variable "disk_throttled_ops_enabled" {
+  description = "Flag to enable Disk Throttled OPS monitor"
+  type        = "string"
+  default     = "true"
+}
+
+variable "disk_throttled_ops_extra_tags" {
+  description = "Extra tags for Disk Throttled OPS monitor"
+  type        = "list"
+  default     = []
+}
+
+variable "disk_throttled_ops_notify_no_data" {
+  description = "Flag to enable notification for no data on Disk Throttled OPS monitor"
+  type        = "string"
+  default     = "false"
+}
diff --git a/cloud/gcp/gce/instance/monitors-gce-instance.tf b/cloud/gcp/gce/instance/monitors-gce-instance.tf
new file mode 100644
index 0000000..9b4ccc1
--- /dev/null
+++ b/cloud/gcp/gce/instance/monitors-gce-instance.tf
@@ -0,0 +1,122 @@
+#
+# CPU Utilization
+#
+resource "datadog_monitor" "cpu_utilization" {
+  count   = "${var.cpu_utilization_enabled ? 1 : 0}"
+  name    = "[${var.environment}] Compute Engine instance CPU Utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  message = "${coalesce(var.cpu_utilization_message, var.message)}"
+
+  type = "metric alert"
+
+  query = < ${var.cpu_utilization_threshold_critical}
+EOF
+
+  thresholds {
+    warning  = "${var.cpu_utilization_threshold_warning}"
+    critical = "${var.cpu_utilization_threshold_critical}"
+  }
+
+  notify_audit        = false
+  locked              = false
+  timeout_h           = 0
+  include_tags        = true
+  require_full_window = false
+  notify_no_data      = true
+  renotify_interval   = 0
+
+  evaluation_delay = "${var.evaluation_delay}"
+  new_host_delay   = "${var.new_host_delay}"
+
+  silenced = "${var.cpu_utilization_silenced}"
+
+  tags = ["env:${var.environment}", "type:cloud", "provider:gcp", "resource:gce-instance", "team:claranet", "created-by:terraform", "${var.cpu_utilization_extra_tags}"]
+}
+
+#
+# Disk Throttled Bps
+#
+resource "datadog_monitor" "disk_throttled_bps" {
+  count   = "${var.disk_throttled_bps_enabled ? 1 : 0}"
+  name    = "[${var.environment}] Compute Engine instance Disk Throttled Bps {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  message = "${coalesce(var.disk_throttled_bps_message, var.message)}"
+
+  type = "metric alert"
+
+  query = < ${var.disk_throttled_bps_threshold_critical}
+EOF
+
+  thresholds {
+    warning  = "${var.disk_throttled_bps_threshold_warning}"
+    critical = "${var.disk_throttled_bps_threshold_critical}"
+  }
+
+  notify_audit        = false
+  locked              = false
+  timeout_h           = 0
+  include_tags        = true
+  require_full_window = false
+  notify_no_data      = "${var.disk_throttled_bps_notify_no_data}"
+  renotify_interval   = 0
+
+  evaluation_delay = "${var.evaluation_delay}"
+  new_host_delay   = "${var.new_host_delay}"
+
+  silenced = "${var.disk_throttled_bps_silenced}"
+
+  tags = ["env:${var.environment}", "type:cloud", "provider:gcp", "resource:gce-instance", "team:claranet", "created-by:terraform", "${var.disk_throttled_bps_extra_tags}"]
+}
+
+#
+# Disk Throttled OPS
+#
+resource "datadog_monitor" "disk_throttled_ops" {
+  count   = "${var.disk_throttled_ops_enabled ? 1 : 0}"
+  name    = "[${var.environment}] Compute Engine instance Disk Throttled OPS {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  message = "${coalesce(var.disk_throttled_ops_message, var.message)}"
+
+  type = "metric alert"
+
+  query = < ${var.disk_throttled_ops_threshold_critical}
+EOF
+
+  thresholds {
+    warning  = "${var.disk_throttled_ops_threshold_warning}"
+    critical = "${var.disk_throttled_ops_threshold_critical}"
+  }
+
+  notify_audit        = false
+  locked              = false
+  timeout_h           = 0
+  include_tags        = true
+  require_full_window = false
+  notify_no_data      = "${var.disk_throttled_ops_notify_no_data}"
+  renotify_interval   = 0
+
+  evaluation_delay = "${var.evaluation_delay}"
+  new_host_delay   = "${var.new_host_delay}"
+
+  silenced = "${var.disk_throttled_ops_silenced}"
+
+  tags = ["env:${var.environment}", "type:cloud", "provider:gcp", "resource:gce-instance", "team:claranet", "created-by:terraform", "${var.disk_throttled_ops_extra_tags}"]
+}
diff --git a/cloud/gcp/gce/instance/outputs.tf b/cloud/gcp/gce/instance/outputs.tf
new file mode 100644
index 0000000..04a5c63
--- /dev/null
+++ b/cloud/gcp/gce/instance/outputs.tf
@@ -0,0 +1,14 @@
+output "cpu_utilization_id" {
+  description = "id for monitor cpu_utilization"
+  value       = "${datadog_monitor.cpu_utilization.*.id}"
+}
+
+output "disk_throttled_bps_id" {
+  description = "id for monitor disk_throttled_bps"
+  value       = "${datadog_monitor.disk_throttled_bps.*.id}"
+}
+
+output "disk_throttled_ops_id" {
+  description = "id for monitor disk_throttled_ops"
+  value       = "${datadog_monitor.disk_throttled_ops.*.id}"
+}