From 19402713c5fc3e63db51c67ab2950db59857152a Mon Sep 17 00:00:00 2001 From: Rafael Romero Carmona Date: Wed, 1 Aug 2018 12:38:10 +0200 Subject: [PATCH] MON-227 First version of the monitors --- README.md | 1 + cloud/gcp/lb/README.md | 81 ++++++++++-- cloud/gcp/lb/inputs.tf | 253 ++++++++++++++++++++++++++++++++++++ cloud/gcp/lb/monitors-lb.tf | 229 ++++++++++++++++++++++++++++++++ cloud/gcp/lb/outputs.tf | 24 ++++ 5 files changed, 575 insertions(+), 13 deletions(-) create mode 100644 cloud/gcp/lb/inputs.tf create mode 100644 cloud/gcp/lb/monitors-lb.tf create mode 100644 cloud/gcp/lb/outputs.tf diff --git a/README.md b/README.md index 1cc513d..a3a0282 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [cloud-sql](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/cloud-sql/) - [common](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/cloud-sql/common/) - [mysql](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/cloud-sql/mysql/) + - [lb](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/lb/) - [pubsub](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/pubsub/) - [common](https://bitbucket.org/morea/terraform.feature.datadog/src/master/common/) - [alerting-message](https://bitbucket.org/morea/terraform.feature.datadog/src/master/common/alerting-message/) diff --git a/cloud/gcp/lb/README.md b/cloud/gcp/lb/README.md index c1ea99b..e2aaaa0 100644 --- a/cloud/gcp/lb/README.md +++ b/cloud/gcp/lb/README.md @@ -1,28 +1,83 @@ -How to use this module ----------------------- +# CLOUD GCP LB DataDog monitors + +## How to use this module ``` -module "datadog-monitors-gcp-memorystore" { - source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/gcp/memorystore?ref={revision}" +module "datadog-monitors-cloud-gcp-lb" { + source = 
"git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/gcp/lb?ref={revision}" - project_id = "${var.gcp_project_id}" environment = "${var.environment}" message = "${module.datadog-message-alerting.alerting-message}" } ``` -Purpose -------- -Creates DataDog monitors with the following checks : +## Purpose -* +Creates DataDog monitors with the following checks: -Inputs ------- +- GCP LB 4xx errors +- GCP LB 5xx errors +- GCP LB latency +- GCP LB backend latency +- GCP LB Requests count increased abruptly -Related documentation ------------- +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| backend_latency_extra_tags | Extra tags for GCP LB Backend Latency monitor | list | `` | no | +| backend_latency_message | Custom message for the GCP LB Backend Latency monitor | string | `` | no | +| backend_latency_silenced | Groups to mute for GCP LB Backend Latency monitor | map | `` | no | +| backend_latency_threshold_critical | Latency in seconds (critical threshold) | string | `4000` | no | +| backend_latency_threshold_warning | Latency in seconds (warning threshold) | string | `2000` | no | +| backend_latency_time_aggregator | Time aggregator for the GCP LB Backend Latency monitor | string | `min` | no | +| backend_latency_timeframe | Timeframe for the GCP LB Backend Latency monitor | string | `last_10m` | no | +| delay | Delay in seconds for the metric evaluation | string | `900` | no | +| environment | Architecture environment | string | - | yes | +| error_rate_4xx_extra_tags | Extra tags for GCP LB 4XX Errors monitor | list | `` | no | +| error_rate_4xx_message | Custom message for the GCP LB 4XX Errors monitor | string | `` | no | +| error_rate_4xx_silenced | Groups to mute for GCP LB 4XX Errors monitor | map | `` | no | +| error_rate_4xx_threshold_critical | Error rate in percentage (critical threshold) | string | `50` | no | +| error_rate_4xx_time_aggregator | Time aggregator for the GCP LB 4XX 
Errors monitor | string | `sum` | no | +| error_rate_4xx_timeframe | Timeframe for the GCP LB 4XX Errors monitor | string | `last_5m` | no | +| error_rate_5xx_extra_tags | Extra tags for GCP LB 5XX Errors monitor | list | `` | no | +| error_rate_5xx_message | Custom message for the GCP LB 5XX Errors monitor | string | `` | no | +| error_rate_5xx_silenced | Groups to mute for GCP LB 5XX Errors monitor | map | `` | no | +| error_rate_5xx_threshold_critical | Error rate in percentage (critical threshold) | string | `50` | no | +| error_rate_5xx_time_aggregator | Time aggregator for the GCP LB 5XX Errors monitor | string | `sum` | no | +| error_rate_5xx_timeframe | Timeframe for the GCP LB 5XX Errors monitor | string | `last_5m` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| latency_extra_tags | Extra tags for GCP LB Latency monitor | list | `` | no | +| latency_message | Custom message for the GCP LB Latency monitor | string | `` | no | +| latency_silenced | Groups to mute for GCP LB Latency monitor | map | `` | no | +| latency_threshold_critical | Latency in seconds (critical threshold) | string | `5000` | no | +| latency_threshold_warning | Latency in seconds (warning threshold) | string | `3000` | no | +| latency_time_aggregator | Time aggregator for the GCP LB Latency monitor | string | `min` | no | +| latency_timeframe | Timeframe for the GCP LB Latency monitor | string | `last_10m` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| project_id | ID of the GCP Project | string | - | yes | +| request_count_extra_tags | Extra tags for GCP LB Request Count monitor | list | `` | no | +| request_count_message | Custom message for the GCP LB Request Count monitor | string | `` | no | +| request_count_silenced | Groups to mute for GCP LB Request Count monitor | map | `` | no | +| 
request_count_threshold_critical | Deviation in percentage (critical threshold) | string | `500` | no | +| request_count_threshold_warning | Deviation in percentage (warning threshold) | string | `250` | no | +| request_count_time_aggregator | Time aggregator for the GCP LB Request Count monitor | string | `sum` | no | +| request_count_timeframe | Timeframe for the GCP LB Request Count monitor | string | `last_5m` | no | +| request_count_timeshift | Timeshift for the GCP LB Request Count monitor | string | `last_5m` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| backend_latency_id | id for monitor backend_latency | +| error_rate_4xx_id | id for monitor error_rate_4xx | +| error_rate_5xx_id | id for monitor error_rate_5xx | +| latency_id | id for monitor latency | +| request_count_id | id for monitor request_count | + +## Related documentation * [GCP LB Metrics](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-loadbalancing) * [Datadog GCP integration](https://docs.datadoghq.com/integrations/google_cloud_platform/) diff --git a/cloud/gcp/lb/inputs.tf b/cloud/gcp/lb/inputs.tf new file mode 100644 index 0000000..8581b67 --- /dev/null +++ b/cloud/gcp/lb/inputs.tf @@ -0,0 +1,253 @@ +# +# Datadog global variables +# +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +# +# Filter variables +# +variable "project_id" { + type = "string" + description = "ID of the GCP Project" +} + +# +# 4XX Errors +# +variable "error_rate_4xx_message" { + description = "Custom 
message for the GCP LB 4XX Errors monitor" + type = "string" + default = "" +} + +variable "error_rate_4xx_time_aggregator" { + description = "Timeframe for the GCP LB 4XX Errors monitor" + type = "string" + default = "sum" +} + +variable "error_rate_4xx_timeframe" { + description = "Timeframe for the GCP LB 4XX Errors monitor" + type = "string" + default = "last_5m" +} + +variable "error_rate_4xx_threshold_critical" { + description = "Rate error in percentage (critical threshold)" + type = "string" + default = 50 +} + +variable "error_rate_4xx_silenced" { + description = "Groups to mute for GCP LB 4XX Errors monitor" + type = "map" + default = {} +} + +variable "error_rate_4xx_extra_tags" { + description = "Extra tags for GCP LB 4XX Errors monitor" + type = "list" + default = [] +} + +# +# 5XX Errors +# +variable "error_rate_5xx_message" { + description = "Custom message for the GCP LB 5XX Errors monitor" + type = "string" + default = "" +} + +variable "error_rate_5xx_time_aggregator" { + description = "Timeframe for the GCP LB 5XX Errors monitor" + type = "string" + default = "sum" +} + +variable "error_rate_5xx_timeframe" { + description = "Timeframe for the GCP LB 5XX Errors monitor" + type = "string" + default = "last_5m" +} + +variable "error_rate_5xx_threshold_critical" { + description = "Rate error in percentage (critical threshold)" + type = "string" + default = 50 +} + +variable "error_rate_5xx_silenced" { + description = "Groups to mute for GCP LB 5XX Errors monitor" + type = "map" + default = {} +} + +variable "error_rate_5xx_extra_tags" { + description = "Extra tags for GCP LB 5XX Errors monitor" + type = "list" + default = [] +} + +# +# Latency +# +variable "latency_message" { + description = "Custom message for the GCP LB Latency monitor" + type = "string" + default = "" +} + +variable "latency_time_aggregator" { + description = "Timeframe for the GCP LB Latency monitor" + type = "string" + default = "min" +} + +variable "latency_timeframe" { + 
description = "Timeframe for the GCP LB Latency monitor" + type = "string" + default = "last_10m" +} + +variable "latency_threshold_warning" { + description = "Latency in seconds (warning threshold)" + type = "string" + default = 3000 +} + +variable "latency_threshold_critical" { + description = "Latency in seconds (critical threshold)" + type = "string" + default = 5000 +} + +variable "latency_silenced" { + description = "Groups to mute for GCP LB Latency monitor" + type = "map" + default = {} +} + +variable "latency_extra_tags" { + description = "Extra tags for GCP LB Latency monitor" + type = "list" + default = [] +} + +# +# Latency Backend +# +variable "backend_latency_message" { + description = "Custom message for the GCP LB Backend Latency monitor" + type = "string" + default = "" +} + +variable "backend_latency_time_aggregator" { + description = "Timeframe for the GCP LB Backend Latency monitor" + type = "string" + default = "min" +} + +variable "backend_latency_timeframe" { + description = "Timeframe for the GCP LB Backend Latency monitor" + type = "string" + default = "last_10m" +} + +variable "backend_latency_threshold_warning" { + description = "Latency in seconds (warning threshold)" + type = "string" + default = 2000 +} + +variable "backend_latency_threshold_critical" { + description = "Latency in seconds (critical threshold)" + type = "string" + default = 4000 +} + +variable "backend_latency_silenced" { + description = "Groups to mute for GCP LB Backend Latency monitor" + type = "map" + default = {} +} + +variable "backend_latency_extra_tags" { + description = "Extra tags for GCP LB Backend Latency monitor" + type = "list" + default = [] +} + +# +# Request Count +# +variable "request_count_message" { + description = "Custom message for the GCP LB Request Count monitor" + type = "string" + default = "" +} + +variable "request_count_time_aggregator" { + description = "Timeframe for the GCP LB Request Count monitor" + type = "string" + default = "sum" +} 
+ +variable "request_count_timeframe" { + description = "Timeframe for the GCP LB Request Count monitor" + type = "string" + default = "last_5m" +} + +variable "request_count_timeshift" { + description = "Timeshift for the GCP LB Request Count monitor" + type = "string" + default = "last_5m" +} + +variable "request_count_threshold_warning" { + description = "Desviation in percentage (warning threshold)" + type = "string" + default = 250 +} + +variable "request_count_threshold_critical" { + description = "Desviation in percentage (critical threshold)" + type = "string" + default = 500 +} + +variable "request_count_silenced" { + description = "Groups to mute for GCP LB Request Count monitor" + type = "map" + default = {} +} + +variable "request_count_extra_tags" { + description = "Extra tags for GCP LB Request Count monitor" + type = "list" + default = [] +} diff --git a/cloud/gcp/lb/monitors-lb.tf b/cloud/gcp/lb/monitors-lb.tf new file mode 100644 index 0000000..b16fb6b --- /dev/null +++ b/cloud/gcp/lb/monitors-lb.tf @@ -0,0 +1,229 @@ +# +# FILTER +# +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
+ format("project_id:%s", var.project_id) : + "${var.filter_tags_custom}"}" + } +} + +# +# 4XX Errors +# +resource "datadog_monitor" "error_rate_4xx" { + name = "[${var.environment}] GCP LB 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.error_rate_4xx_message, var.message)}" + + type = "metric alert" + + query = < ${var.error_rate_4xx_threshold_critical} +EOF + + thresholds { + critical = "${var.error_rate_4xx_threshold_critical}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.error_rate_4xx_silenced}" + + tags = [ + "team:gcp", + "provider:gcp", + "resource:lb", + "env:${var.environment}", + "created_by:terraform", + "${var.error_rate_4xx_extra_tags}", + ] +} + +# +# 5XX Errors +# +resource "datadog_monitor" "error_rate_5xx" { + name = "[${var.environment}] GCP LB 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.error_rate_5xx_message, var.message)}" + + type = "metric alert" + + query = < ${var.error_rate_5xx_threshold_critical} +EOF + + thresholds { + critical = "${var.error_rate_5xx_threshold_critical}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.error_rate_5xx_silenced}" + + tags = [ + "team:gcp", + "provider:gcp", + "resource:lb", + "env:${var.environment}", + "created_by:terraform", + "${var.error_rate_5xx_extra_tags}", + ] +} + +# +# Latency +# +resource 
"datadog_monitor" "latency" { + name = "[${var.environment}] GCP LB latency {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}" + message = "${coalesce(var.latency_message, var.message)}" + + type = "query alert" + + query = < ${var.latency_threshold_critical} +EOF + + thresholds { + warning = "${var.latency_threshold_warning}" + critical = "${var.latency_threshold_critical}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.latency_silenced}" + + tags = [ + "team:gcp", + "provider:gcp", + "resource:lb", + "env:${var.environment}", + "created_by:terraform", + "${var.latency_extra_tags}", + ] +} + +# +# Backend Latency +# +resource "datadog_monitor" "backend_latency" { + name = "[${var.environment}] GCP LB backend latency {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}" + message = "${coalesce(var.backend_latency_message, var.message)}" + + type = "metric alert" + + query = < ${var.backend_latency_threshold_critical} +EOF + + thresholds { + warning = "${var.backend_latency_threshold_warning}" + critical = "${var.backend_latency_threshold_critical}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.backend_latency_silenced}" + + tags = [ + "team:gcp", + "provider:gcp", + "resource:lb", + "env:${var.environment}", + "created_by:terraform", + "${var.backend_latency_extra_tags}", + ] +} + +# +# Request Count +# +resource "datadog_monitor" "request_count" { + name = 
"[${var.environment}] GCP LB Requests count increased abruptly" + message = "${coalesce(var.request_count_message, var.message)}" + + type = "query alert" + + query = < ${var.request_count_threshold_critical} +EOF + + thresholds { + warning = "${var.request_count_threshold_warning}" + critical = "${var.request_count_threshold_critical}" + } + + notify_audit = false + locked = false + timeout_h = 0 + include_tags = true + require_full_window = false + notify_no_data = false + renotify_interval = 0 + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + silenced = "${var.request_count_silenced}" + + tags = [ + "team:gcp", + "provider:gcp", + "resource:lb", + "env:${var.environment}", + "created_by:terraform", + "${var.request_count_extra_tags}", + ] +} diff --git a/cloud/gcp/lb/outputs.tf b/cloud/gcp/lb/outputs.tf new file mode 100644 index 0000000..f1d368e --- /dev/null +++ b/cloud/gcp/lb/outputs.tf @@ -0,0 +1,24 @@ +output "error_rate_4xx_id" { + description = "id for monitor error_rate_4xx" + value = "${datadog_monitor.error_rate_4xx.id}" +} + +output "error_rate_5xx_id" { + description = "id for monitor error_rate_5xx" + value = "${datadog_monitor.error_rate_5xx.id}" +} + +output "latency_id" { + description = "id for monitor latency" + value = "${datadog_monitor.latency.id}" +} + +output "backend_latency_id" { + description = "id for monitor backend_latency" + value = "${datadog_monitor.backend_latency.id}" +} + +output "request_count_id" { + description = "id for monitor request_count" + value = "${datadog_monitor.request_count.id}" +}