From 980f9cbd7fe85f1fa3729c3000f2a33f1934f2c3 Mon Sep 17 00:00:00 2001 From: Boris Rousseau Date: Mon, 22 Jan 2018 11:11:55 +0100 Subject: [PATCH 1/5] MON-48: monitoring for api gateway --- cloud/aws/apigateway/README.md | 46 +++++++++++++++ cloud/aws/apigateway/inputs.tf | 60 +++++++++++++++++++ cloud/aws/apigateway/monitors-api.tf | 87 ++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+) create mode 100644 cloud/aws/apigateway/README.md create mode 100644 cloud/aws/apigateway/inputs.tf create mode 100644 cloud/aws/apigateway/monitors-api.tf diff --git a/cloud/aws/apigateway/README.md b/cloud/aws/apigateway/README.md new file mode 100644 index 0000000..1dab021 --- /dev/null +++ b/cloud/aws/apigateway/README.md @@ -0,0 +1,46 @@ +AWS API Gateway DataDog monitors +========================================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-aws-api-gateway" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/aws/apigateway?ref={revision}" + + environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" +} + +``` + +Purpose +------- +Creates DataDog monitors with the following checks : + +* API Gateway too much 5xx errors +* API Gateway too much 4xx errors +* API Gateway latency to high + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `900` | no | +| environment | Architecture environment | string | - | yes | +| filter_tags | Tags used for custom filtering | string | `*` | no | +| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | +| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | +| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | +| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| latency_threshold_critical | Alerting threshold in miliseconds | string | `800` | no | +| latency_threshold_warning | Warning threshold in miliseconds | string | `400` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/amazon_api_gateway/](https://docs.datadoghq.com/integrations/amazon_api_gateway/) + +AWS API Gateway metrics documentation: [https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/api-gateway-metrics-dimensions.html](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/api-gateway-metrics-dimensions.html) diff --git a/cloud/aws/apigateway/inputs.tf b/cloud/aws/apigateway/inputs.tf new file mode 100644 index 0000000..98b5067 --- /dev/null +++ b/cloud/aws/apigateway/inputs.tf @@ -0,0 +1,60 @@ +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags" { + description = "Tags used for filtering" + default = "*" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +################################### +### LATENCY VARIABLES ### +################################### + +variable "latency_threshold_critical" { + default = 800 + description = "Alerting threshold in milliseconds" +} + +variable "latency_threshold_warning" { + default = 400 + description = "Warning threshold in milliseconds" +} + +################################# +### HTTP 5xx status pages ### +################################# + +variable "http_5xx_requests_threshold_critical" { + default = 20 + description = "Maximum critical acceptable percent of 5xx errors" +} + +variable "http_5xx_requests_threshold_warning" { + default = 10 + description = "Maximum warning acceptable percent of 5xx errors" +} + +################################# +### HTTP 4xx status pages ### +################################# + +variable "http_4xx_requests_threshold_critical" { + default = 30 + description = "Maximum critical acceptable percent of 4xx errors" +} + +variable "http_4xx_requests_threshold_warning" { + default = 15 + description = "Maximum warning acceptable percent of 4xx errors" +} diff --git a/cloud/aws/apigateway/monitors-api.tf b/cloud/aws/apigateway/monitors-api.tf new file mode 100644 index 0000000..c0cfc79 --- /dev/null +++ b/cloud/aws/apigateway/monitors-api.tf @@ -0,0 +1,87 @@ +# Monitoring Api Gateway latency +resource "datadog_monitor" "API_Gateway_latency" { + name = "[${var.environment}] API Gateway latency > ${var.latency_threshold_critical}" + type = "metric alert" + message = "${var.message}" + query = < ${var.latency_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + warning = "${var.latency_threshold_warning}" + critical = "${var.latency_threshold_critical}" + } + + notify_no_data = true # Will notify when no data is received + renotify_interval = 0 + require_full_window = false + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "resource:apigateway", "team:aws", "provider:aws"] +} + +# Monitoring API Gateway 5xx errors percent +resource "datadog_monitor" "API_http_5xx_errors_count" { + name = "[${var.environment}] API Gateway HTTP 5xx errors > ${var.http_5xx_requests_threshold_critical}%" + type = "metric alert" + message = "${var.message}" + + query = < ${var.http_5xx_requests_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + warning = "${var.http_5xx_requests_threshold_warning}" + critical = "${var.http_5xx_requests_threshold_critical}" + } + + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = ["env:${var.environment}", "resource:apigateway", "team:aws", "provider:aws"] +} + +# Monitoring API Gateway 4xx errors percent +resource "datadog_monitor" "API_http_4xx_errors_count" { + name = "[${var.environment}] API Gateway HTTP 4xx errors > ${var.http_4xx_requests_threshold_critical}%" + type = "metric alert" + message = "${var.message}" + + query = < ${var.http_4xx_requests_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + warning = "${var.http_4xx_requests_threshold_warning}" + critical = "${var.http_4xx_requests_threshold_critical}" + } + + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = ["env:${var.environment}", "resource:apigateway", "team:aws", "provider:aws"] +} From a11b7598e844cb4b74838a0502099c736fee4e96 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Fri, 23 Feb 2018 10:31:05 +0100 Subject: [PATCH 2/5] MON-48 - Updated with new best practice --- cloud/aws/apigateway/monitors-api.tf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cloud/aws/apigateway/monitors-api.tf b/cloud/aws/apigateway/monitors-api.tf index c0cfc79..1e66ad2 100644 --- a/cloud/aws/apigateway/monitors-api.tf +++ b/cloud/aws/apigateway/monitors-api.tf @@ -1,8 +1,9 @@ # Monitoring Api Gateway latency resource "datadog_monitor" "API_Gateway_latency" { - name = "[${var.environment}] API Gateway latency > ${var.latency_threshold_critical}" + name = "[${var.environment}] API Gateway latency {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" type = "metric alert" message = "${var.message}" + query = < Date: Mon, 26 Feb 2018 09:53:05 +0100 Subject: [PATCH 3/5] MON-48: Added artificial_requests_count to mitigate false positive in case of low trafic MON-48: format issue MON-48: format issue MON-48: changed variable name MON-48: changed variable name MON-48: renamed to artificial_requests_count --- cloud/aws/apigateway/README.md | 3 ++- cloud/aws/apigateway/inputs.tf | 7 ++++++- cloud/aws/apigateway/monitors-api.tf | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cloud/aws/apigateway/README.md b/cloud/aws/apigateway/README.md index 1dab021..439aae4 100644 --- a/cloud/aws/apigateway/README.md +++ b/cloud/aws/apigateway/README.md @@ -28,7 +28,7 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `900` | no | -| environment | Architecture environment | string | - | yes | +| environment | Environment | string | - | yes | | filter_tags | Tags used for custom filtering | string | `*` | no | | http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | | http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | @@ -37,6 +37,7 @@ Inputs | message | Message sent when a monitor is triggered | string | - | yes | | latency_threshold_critical | Alerting threshold in miliseconds | string | `800` | no | | latency_threshold_warning | Warning threshold in miliseconds | string | `400` | no | +| artificial_requests_count | Number of false requests used to mitigate false positive in case of low trafic | string | `0` | no | Related documentation --------------------- diff --git a/cloud/aws/apigateway/inputs.tf b/cloud/aws/apigateway/inputs.tf index 98b5067..a38f791 100644 --- a/cloud/aws/apigateway/inputs.tf +++ b/cloud/aws/apigateway/inputs.tf @@ -1,5 +1,5 @@ variable "environment" { - description = "Architecture environment" + description = "Environment" type = "string" } @@ -58,3 +58,8 @@ variable "http_4xx_requests_threshold_warning" { default = 15 description = "Maximum warning acceptable percent of 4xx errors" } + +variable "artificial_requests_count" { + default = 0 + description = "Number of false requests used to mitigate false positive in case of low trafic" +} diff --git a/cloud/aws/apigateway/monitors-api.tf b/cloud/aws/apigateway/monitors-api.tf index 1e66ad2..85fcef1 100644 --- a/cloud/aws/apigateway/monitors-api.tf +++ b/cloud/aws/apigateway/monitors-api.tf @@ -36,7 +36,7 @@ resource "datadog_monitor" "API_http_5xx_errors_count" { query = < ${var.http_5xx_requests_threshold_critical} EOF @@ -66,7 +66,7 @@ resource "datadog_monitor" "API_http_4xx_errors_count" { query = < ${var.http_4xx_requests_threshold_critical} EOF From 5797379a1bda69567205cafa2eba14fb19d684a2 Mon Sep 17 00:00:00 2001 From: Boris Rousseau Date: Fri, 2 Mar 2018 14:50:25 +0100 Subject: [PATCH 4/5] MON-48: set the artificial_requests_count default value to 5 --- cloud/aws/apigateway/inputs.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/aws/apigateway/inputs.tf b/cloud/aws/apigateway/inputs.tf index a38f791..780f754 100644 --- a/cloud/aws/apigateway/inputs.tf +++ b/cloud/aws/apigateway/inputs.tf @@ -60,6 +60,6 @@ variable "http_4xx_requests_threshold_warning" { } variable "artificial_requests_count" { - default = 0 + default = 5 description = "Number of false requests used to mitigate false positive in case of low trafic" } From 9fccecd6b2624b21f2b347d31404e34b46516688 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Fri, 2 Mar 2018 15:00:27 +0100 Subject: [PATCH 5/5] MON-48 - Use ms instead of percent for latency --- cloud/aws/apigateway/monitors-api.tf | 2 +- cloud/aws/elb/monitors-elb.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud/aws/apigateway/monitors-api.tf b/cloud/aws/apigateway/monitors-api.tf index 85fcef1..2ed22ad 100644 --- a/cloud/aws/apigateway/monitors-api.tf +++ b/cloud/aws/apigateway/monitors-api.tf @@ -1,6 +1,6 @@ # Monitoring Api Gateway latency resource "datadog_monitor" "API_Gateway_latency" { - name = "[${var.environment}] API Gateway latency {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + name = "[${var.environment}] API Gateway latency {{comparator}} {{#is_alert}}{{threshold}}ms{{/is_alert}}{{#is_warning}}{{warn_threshold}}ms{{/is_warning}} ({{value}}ms)" type = "metric alert" message = "${var.message}" diff --git a/cloud/aws/elb/monitors-elb.tf b/cloud/aws/elb/monitors-elb.tf index 29c7736..3d412ad 100644 --- a/cloud/aws/elb/monitors-elb.tf +++ b/cloud/aws/elb/monitors-elb.tf @@ -161,7 +161,7 @@ resource "datadog_monitor" "ELB_too_much_5xx_backend" { } resource "datadog_monitor" "ELB_backend_latency" { - name = "[${var.environment}] ELB latency too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + name = "[${var.environment}] ELB latency too high {{comparator}} {{#is_alert}}{{threshold}}s{{/is_alert}}{{#is_warning}}{{warn_threshold}}s{{/is_warning}} ({{value}}s)" message = "${var.message}" query = <