MON-227 First version of the monitors

This commit is contained in:
Rafael Romero Carmona 2018-08-01 12:38:10 +02:00 committed by Quentin Manfroi
parent 4af00f8ed0
commit 19402713c5
5 changed files with 575 additions and 13 deletions

View File

@ -96,6 +96,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
- [cloud-sql](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/cloud-sql/)
- [common](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/cloud-sql/common/)
- [mysql](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/cloud-sql/mysql/)
- [lb](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/lb/)
- [pubsub](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/gcp/pubsub/)
- [common](https://bitbucket.org/morea/terraform.feature.datadog/src/master/common/)
- [alerting-message](https://bitbucket.org/morea/terraform.feature.datadog/src/master/common/alerting-message/)

View File

@ -1,28 +1,83 @@
How to use this module
----------------------
# CLOUD GCP LB DataDog monitors
## How to use this module
```
module "datadog-monitors-gcp-memorystore" {
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/gcp/memorystore?ref={revision}"
module "datadog-monitors-cloud-gcp-lb" {
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/gcp/lb?ref={revision}"
project_id = "${var.gcp_project_id}"
environment = "${var.environment}"
message = "${module.datadog-message-alerting.alerting-message}"
}
```
Purpose
-------
Creates DataDog monitors with the following checks :
## Purpose
*
Creates DataDog monitors with the following checks:
Inputs
------
- GCP LB 4xx errors
- GCP LB 5xx errors
- GCP LB latency
- GCP LB backend latency
- GCP LB Requests count increased abruptly
Related documentation
------------
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| backend_latency_extra_tags | Extra tags for GCP LB Backend Latency monitor | list | `<list>` | no |
| backend_latency_message | Custom message for the GCP LB Backend Latency monitor | string | `` | no |
| backend_latency_silenced | Groups to mute for GCP LB Backend Latency monitor | map | `<map>` | no |
| backend_latency_threshold_critical | Latency in milliseconds (critical threshold) | string | `4000` | no |
| backend_latency_threshold_warning | Latency in milliseconds (warning threshold) | string | `2000` | no |
| backend_latency_time_aggregator | Time aggregator for the GCP LB Backend Latency monitor | string | `min` | no |
| backend_latency_timeframe | Timeframe for the GCP LB Backend Latency monitor | string | `last_10m` | no |
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
| environment | Architecture environment | string | - | yes |
| error_rate_4xx_extra_tags | Extra tags for GCP LB 4XX Errors monitor | list | `<list>` | no |
| error_rate_4xx_message | Custom message for the GCP LB 4XX Errors monitor | string | `` | no |
| error_rate_4xx_silenced | Groups to mute for GCP LB 4XX Errors monitor | map | `<map>` | no |
| error_rate_4xx_threshold_critical | Rate error in percentage (critical threshold) | string | `50` | no |
| error_rate_4xx_time_aggregator | Time aggregator for the GCP LB 4XX Errors monitor | string | `sum` | no |
| error_rate_4xx_timeframe | Timeframe for the GCP LB 4XX Errors monitor | string | `last_5m` | no |
| error_rate_5xx_extra_tags | Extra tags for GCP LB 5XX Errors monitor | list | `<list>` | no |
| error_rate_5xx_message | Custom message for the GCP LB 5XX Errors monitor | string | `` | no |
| error_rate_5xx_silenced | Groups to mute for GCP LB 5XX Errors monitor | map | `<map>` | no |
| error_rate_5xx_threshold_critical | Rate error in percentage (critical threshold) | string | `50` | no |
| error_rate_5xx_time_aggregator | Time aggregator for the GCP LB 5XX Errors monitor | string | `sum` | no |
| error_rate_5xx_timeframe | Timeframe for the GCP LB 5XX Errors monitor | string | `last_5m` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| latency_extra_tags | Extra tags for GCP LB Latency monitor | list | `<list>` | no |
| latency_message | Custom message for the GCP LB Latency monitor | string | `` | no |
| latency_silenced | Groups to mute for GCP LB Latency monitor | map | `<map>` | no |
| latency_threshold_critical | Latency in milliseconds (critical threshold) | string | `5000` | no |
| latency_threshold_warning | Latency in milliseconds (warning threshold) | string | `3000` | no |
| latency_time_aggregator | Time aggregator for the GCP LB Latency monitor | string | `min` | no |
| latency_timeframe | Timeframe for the GCP LB Latency monitor | string | `last_10m` | no |
| message | Message sent when a monitor is triggered | string | - | yes |
| project_id | ID of the GCP Project | string | - | yes |
| request_count_extra_tags | Extra tags for GCP LB Request Count monitor | list | `<list>` | no |
| request_count_message | Custom message for the GCP LB Request Count monitor | string | `` | no |
| request_count_silenced | Groups to mute for GCP LB Request Count monitor | map | `<map>` | no |
| request_count_threshold_critical | Deviation in percentage (critical threshold) | string | `500` | no |
| request_count_threshold_warning | Deviation in percentage (warning threshold) | string | `250` | no |
| request_count_time_aggregator | Time aggregator for the GCP LB Request Count monitor | string | `sum` | no |
| request_count_timeframe | Timeframe for the GCP LB Request Count monitor | string | `last_5m` | no |
| request_count_timeshift | Timeshift for the GCP LB Request Count monitor | string | `last_5m` | no |
## Outputs
| Name | Description |
|------|-------------|
| backend_latency_id | id for monitor backend_latency |
| error_rate_4xx_id | id for monitor error_rate_4xx |
| error_rate_5xx_id | id for monitor error_rate_5xx |
| latency_id | id for monitor latency |
| request_count_id | id for monitor request_count |
## Related documentation
* [GCP LB Metrics](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-loadbalancing)
* [Datadog GCP integration](https://docs.datadoghq.com/integrations/google_cloud_platform/)

253
cloud/gcp/lb/inputs.tf Normal file
View File

@ -0,0 +1,253 @@
#
# Datadog global variables
#
# Environment name; interpolated into every monitor title and into the
# "env:" tag of each monitor.
variable "environment" {
  type        = "string"
  description = "Architecture environment"
}

# When set to "true" the monitors are scoped with "project_id:<project_id>";
# any other value switches the scope to filter_tags_custom.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

# Raw Datadog tag filter used only when filter_tags_use_defaults is not "true".
variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}

# Default notification text; each monitor uses it unless its own
# *_message variable is non-empty.
variable "message" {
  description = "Message sent when a monitor is triggered"
}

# Applied to both evaluation_delay and new_host_delay on every monitor.
variable "delay" {
  default     = 900
  description = "Delay in seconds for the metric evaluation"
}
#
# Filter variables
#
# GCP project whose load balancers are monitored; used to build the default
# "project_id:<id>" query filter.
variable "project_id" {
  description = "ID of the GCP Project"
  type        = "string"
}
#
# 4XX Errors
#
# Optional override of the notification text; the monitor falls back to
# var.message when this is left empty.
variable "error_rate_4xx_message" {
  description = "Custom message for the GCP LB 4XX Errors monitor"
  type        = "string"
  default     = ""
}

# Time aggregation function placed at the start of the monitor query.
variable "error_rate_4xx_time_aggregator" {
  description = "Time aggregator for the GCP LB 4XX Errors monitor"
  type        = "string"
  default     = "sum"
}

# Evaluation window passed to the time aggregator.
variable "error_rate_4xx_timeframe" {
  description = "Timeframe for the GCP LB 4XX Errors monitor"
  type        = "string"
  default     = "last_5m"
}

# Percentage of 4xx responses above which the monitor goes critical.
variable "error_rate_4xx_threshold_critical" {
  description = "Rate error in percentage (critical threshold)"
  type        = "string"
  default     = 50
}

# Passed as-is to the monitor's "silenced" attribute.
variable "error_rate_4xx_silenced" {
  description = "Groups to mute for GCP LB 4XX Errors monitor"
  type        = "map"
  default     = {}
}

# Appended to the monitor's fixed tag list.
variable "error_rate_4xx_extra_tags" {
  description = "Extra tags for GCP LB 4XX Errors monitor"
  type        = "list"
  default     = []
}
#
# 5XX Errors
#
# Optional override of the notification text; the monitor falls back to
# var.message when this is left empty.
variable "error_rate_5xx_message" {
  description = "Custom message for the GCP LB 5XX Errors monitor"
  type        = "string"
  default     = ""
}

# Time aggregation function placed at the start of the monitor query.
variable "error_rate_5xx_time_aggregator" {
  description = "Time aggregator for the GCP LB 5XX Errors monitor"
  type        = "string"
  default     = "sum"
}

# Evaluation window passed to the time aggregator.
variable "error_rate_5xx_timeframe" {
  description = "Timeframe for the GCP LB 5XX Errors monitor"
  type        = "string"
  default     = "last_5m"
}

# Percentage of 5xx responses above which the monitor goes critical.
variable "error_rate_5xx_threshold_critical" {
  description = "Rate error in percentage (critical threshold)"
  type        = "string"
  default     = 50
}

# Passed as-is to the monitor's "silenced" attribute.
variable "error_rate_5xx_silenced" {
  description = "Groups to mute for GCP LB 5XX Errors monitor"
  type        = "map"
  default     = {}
}

# Appended to the monitor's fixed tag list.
variable "error_rate_5xx_extra_tags" {
  description = "Extra tags for GCP LB 5XX Errors monitor"
  type        = "list"
  default     = []
}
#
# Latency
#
# Optional override of the notification text; the monitor falls back to
# var.message when this is left empty.
variable "latency_message" {
  description = "Custom message for the GCP LB Latency monitor"
  type        = "string"
  default     = ""
}

# Time aggregation function placed at the start of the monitor query.
variable "latency_time_aggregator" {
  description = "Time aggregator for the GCP LB Latency monitor"
  type        = "string"
  default     = "min"
}

# Evaluation window passed to the time aggregator.
variable "latency_timeframe" {
  description = "Timeframe for the GCP LB Latency monitor"
  type        = "string"
  default     = "last_10m"
}

# The GCP total_latencies metric is documented in milliseconds, so the
# thresholds below are milliseconds (3000 = 3 s, 5000 = 5 s).
variable "latency_threshold_warning" {
  description = "Latency in milliseconds (warning threshold)"
  type        = "string"
  default     = 3000
}

variable "latency_threshold_critical" {
  description = "Latency in milliseconds (critical threshold)"
  type        = "string"
  default     = 5000
}

# Passed as-is to the monitor's "silenced" attribute.
variable "latency_silenced" {
  description = "Groups to mute for GCP LB Latency monitor"
  type        = "map"
  default     = {}
}

# Appended to the monitor's fixed tag list.
variable "latency_extra_tags" {
  description = "Extra tags for GCP LB Latency monitor"
  type        = "list"
  default     = []
}
#
# Latency Backend
#
# Optional override of the notification text; the monitor falls back to
# var.message when this is left empty.
variable "backend_latency_message" {
  description = "Custom message for the GCP LB Backend Latency monitor"
  type        = "string"
  default     = ""
}

# Time aggregation function placed at the start of the monitor query.
variable "backend_latency_time_aggregator" {
  description = "Time aggregator for the GCP LB Backend Latency monitor"
  type        = "string"
  default     = "min"
}

# Evaluation window passed to the time aggregator.
variable "backend_latency_timeframe" {
  description = "Timeframe for the GCP LB Backend Latency monitor"
  type        = "string"
  default     = "last_10m"
}

# The GCP backend_latencies metric is documented in milliseconds, so the
# thresholds below are milliseconds (2000 = 2 s, 4000 = 4 s).
variable "backend_latency_threshold_warning" {
  description = "Latency in milliseconds (warning threshold)"
  type        = "string"
  default     = 2000
}

variable "backend_latency_threshold_critical" {
  description = "Latency in milliseconds (critical threshold)"
  type        = "string"
  default     = 4000
}

# Passed as-is to the monitor's "silenced" attribute.
variable "backend_latency_silenced" {
  description = "Groups to mute for GCP LB Backend Latency monitor"
  type        = "map"
  default     = {}
}

# Appended to the monitor's fixed tag list.
variable "backend_latency_extra_tags" {
  description = "Extra tags for GCP LB Backend Latency monitor"
  type        = "list"
  default     = []
}
#
# Request Count
#
# Optional override of the notification text; the monitor falls back to
# var.message when this is left empty.
variable "request_count_message" {
  description = "Custom message for the GCP LB Request Count monitor"
  type        = "string"
  default     = ""
}

# Time aggregation function used inside the pct_change() monitor query.
variable "request_count_time_aggregator" {
  description = "Time aggregator for the GCP LB Request Count monitor"
  type        = "string"
  default     = "sum"
}

# Evaluation window passed to the time aggregator.
variable "request_count_timeframe" {
  description = "Timeframe for the GCP LB Request Count monitor"
  type        = "string"
  default     = "last_5m"
}

# Second argument of pct_change(): the shift used for the comparison window.
variable "request_count_timeshift" {
  description = "Timeshift for the GCP LB Request Count monitor"
  type        = "string"
  default     = "last_5m"
}

# Percentage change in request count that triggers a warning.
variable "request_count_threshold_warning" {
  description = "Deviation in percentage (warning threshold)"
  type        = "string"
  default     = 250
}

# Percentage change in request count that triggers a critical alert.
variable "request_count_threshold_critical" {
  description = "Deviation in percentage (critical threshold)"
  type        = "string"
  default     = 500
}

# Passed as-is to the monitor's "silenced" attribute.
variable "request_count_silenced" {
  description = "Groups to mute for GCP LB Request Count monitor"
  type        = "map"
  default     = {}
}

# Appended to the monitor's fixed tag list.
variable "request_count_extra_tags" {
  description = "Extra tags for GCP LB Request Count monitor"
  type        = "list"
  default     = []
}

229
cloud/gcp/lb/monitors-lb.tf Normal file
View File

@ -0,0 +1,229 @@
#
# FILTER
#
# Renders the tag filter embedded in the monitor queries:
#   - "project_id:<var.project_id>" when filter_tags_use_defaults is "true"
#   - var.filter_tags_custom otherwise
# "$${filter}" is an escaped template placeholder, resolved from vars below.
data "template_file" "filter" {
  template = "$${filter}"

  vars {
    filter = "${var.filter_tags_use_defaults == "true" ? format("project_id:%s", var.project_id) : var.filter_tags_custom}"
  }
}
#
# 4XX Errors
#
# Monitors the percentage of 4xx responses per backend_target_name.
# The query divides the count of requests tagged response_code_class:400 by
# the total request count and multiplies by 100.
# NOTE(review): the "+ 5" added to the denominator presumably damps the ratio
# when traffic is very low - confirm the intent.
resource "datadog_monitor" "error_rate_4xx" {
  type    = "metric alert"
  name    = "[${var.environment}] GCP LB 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  message = "${coalesce(var.error_rate_4xx_message, var.message)}"

  query = <<EOF
${var.error_rate_4xx_time_aggregator}(${var.error_rate_4xx_timeframe}):
avg:gcp.loadbalancing.https.request_count{${data.template_file.filter.rendered},response_code_class:400} by {backend_target_name}.as_count().fill(zero)
/
(avg:gcp.loadbalancing.https.request_count{${data.template_file.filter.rendered}} by {backend_target_name}.as_count().fill(zero) + 5 ) * 100
> ${var.error_rate_4xx_threshold_critical}
EOF

  # Only a critical threshold is configured for this monitor.
  thresholds {
    critical = "${var.error_rate_4xx_threshold_critical}"
  }

  # Both delays share the module-wide delay variable.
  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  require_full_window = false
  notify_no_data      = false
  notify_audit        = false
  renotify_interval   = 0
  timeout_h           = 0
  include_tags        = true
  locked              = false

  silenced = "${var.error_rate_4xx_silenced}"

  tags = [
    "team:gcp",
    "provider:gcp",
    "resource:lb",
    "env:${var.environment}",
    "created_by:terraform",
    "${var.error_rate_4xx_extra_tags}",
  ]
}
#
# 5XX Errors
#
# Monitors the percentage of 5xx responses per backend_target_name.
# The query divides the count of requests tagged response_code_class:500 by
# the total request count and multiplies by 100.
# Fix: the numerator previously filtered on response_code_class:400 (copied
# from the 4xx monitor), so this monitor was measuring 4xx errors.
# NOTE(review): the "+ 5" added to the denominator presumably damps the ratio
# when traffic is very low - confirm the intent.
resource "datadog_monitor" "error_rate_5xx" {
  name    = "[${var.environment}] GCP LB 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  message = "${coalesce(var.error_rate_5xx_message, var.message)}"
  type    = "metric alert"

  query = <<EOF
${var.error_rate_5xx_time_aggregator}(${var.error_rate_5xx_timeframe}):
avg:gcp.loadbalancing.https.request_count{${data.template_file.filter.rendered},response_code_class:500} by {backend_target_name}.as_count().fill(zero)
/
(avg:gcp.loadbalancing.https.request_count{${data.template_file.filter.rendered}} by {backend_target_name}.as_count().fill(zero) + 5 ) * 100
> ${var.error_rate_5xx_threshold_critical}
EOF

  # Only a critical threshold is configured for this monitor.
  thresholds {
    critical = "${var.error_rate_5xx_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  # Both delays share the module-wide delay variable.
  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  silenced = "${var.error_rate_5xx_silenced}"

  tags = [
    "team:gcp",
    "provider:gcp",
    "resource:lb",
    "env:${var.environment}",
    "created_by:terraform",
    "${var.error_rate_5xx_extra_tags}",
  ]
}
#
# Latency
#
# Monitors total load balancer latency per backend_target_name.
# Fix: the title now prints thresholds and values with an "ms" suffix. The GCP
# total_latencies metric is documented in milliseconds and the default
# thresholds (3000/5000) are milliseconds, so the previous "s" suffix
# displayed e.g. "5000s".
resource "datadog_monitor" "latency" {
  name    = "[${var.environment}] GCP LB latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
  message = "${coalesce(var.latency_message, var.message)}"
  type    = "query alert"

  query = <<EOF
${var.latency_time_aggregator}(${var.latency_timeframe}):
min:gcp.loadbalancing.https.total_latencies.avg{${data.template_file.filter.rendered}} by {backend_target_name}
> ${var.latency_threshold_critical}
EOF

  thresholds {
    warning  = "${var.latency_threshold_warning}"
    critical = "${var.latency_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  # Both delays share the module-wide delay variable.
  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  silenced = "${var.latency_silenced}"

  tags = [
    "team:gcp",
    "provider:gcp",
    "resource:lb",
    "env:${var.environment}",
    "created_by:terraform",
    "${var.latency_extra_tags}",
  ]
}
#
# Backend Latency
#
# Monitors backend latency per backend_target_name.
# Fix: the title now prints thresholds and values with an "ms" suffix. The GCP
# backend_latencies metric is documented in milliseconds and the default
# thresholds (2000/4000) are milliseconds, so the previous "s" suffix
# displayed e.g. "4000s".
resource "datadog_monitor" "backend_latency" {
  name    = "[${var.environment}] GCP LB backend latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
  message = "${coalesce(var.backend_latency_message, var.message)}"
  type    = "metric alert"

  query = <<EOF
${var.backend_latency_time_aggregator}(${var.backend_latency_timeframe}):
min:gcp.loadbalancing.https.backend_latencies.avg{${data.template_file.filter.rendered}} by {backend_target_name}
> ${var.backend_latency_threshold_critical}
EOF

  thresholds {
    warning  = "${var.backend_latency_threshold_warning}"
    critical = "${var.backend_latency_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  # Both delays share the module-wide delay variable.
  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  silenced = "${var.backend_latency_silenced}"

  tags = [
    "team:gcp",
    "provider:gcp",
    "resource:lb",
    "env:${var.environment}",
    "created_by:terraform",
    "${var.backend_latency_extra_tags}",
  ]
}
#
# Request Count
#
# Change monitor on the request count per backend_target_name: the query
# applies pct_change() with the configured time aggregator, timeframe and
# timeshift, and compares the result with the critical threshold.
resource "datadog_monitor" "request_count" {
  type    = "query alert"
  name    = "[${var.environment}] GCP LB Requests count increased abruptly"
  message = "${coalesce(var.request_count_message, var.message)}"

  query = <<EOF
pct_change(${var.request_count_time_aggregator}(${var.request_count_timeframe}),${var.request_count_timeshift}):
avg:gcp.loadbalancing.https.request_count{${data.template_file.filter.rendered}} by {backend_target_name}.as_count().fill(zero)
> ${var.request_count_threshold_critical}
EOF

  thresholds {
    critical = "${var.request_count_threshold_critical}"
    warning  = "${var.request_count_threshold_warning}"
  }

  # Both delays share the module-wide delay variable.
  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  require_full_window = false
  notify_no_data      = false
  notify_audit        = false
  renotify_interval   = 0
  timeout_h           = 0
  include_tags        = true
  locked              = false

  silenced = "${var.request_count_silenced}"

  tags = [
    "team:gcp",
    "provider:gcp",
    "resource:lb",
    "env:${var.environment}",
    "created_by:terraform",
    "${var.request_count_extra_tags}",
  ]
}

24
cloud/gcp/lb/outputs.tf Normal file
View File

@ -0,0 +1,24 @@
# Exposes the id of each monitor created by this module, one output per
# datadog_monitor resource.
output "error_rate_4xx_id" {
  value       = "${datadog_monitor.error_rate_4xx.id}"
  description = "id for monitor error_rate_4xx"
}

output "error_rate_5xx_id" {
  value       = "${datadog_monitor.error_rate_5xx.id}"
  description = "id for monitor error_rate_5xx"
}

output "latency_id" {
  value       = "${datadog_monitor.latency.id}"
  description = "id for monitor latency"
}

output "backend_latency_id" {
  value       = "${datadog_monitor.backend_latency.id}"
  description = "id for monitor backend_latency"
}

output "request_count_id" {
  value       = "${datadog_monitor.request_count.id}"
  description = "id for monitor request_count"
}