diff --git a/README.md b/README.md index e7bfce6..365eff3 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [aws](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/) - [alb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/alb/) - [apigateway](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/apigateway/) + - [beanstalk](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/beanstalk/) - [ecs](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/) - [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/common/) - [ec2-cluster](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/ec2-cluster/) diff --git a/cloud/aws/beanstalk/README.md b/cloud/aws/beanstalk/README.md new file mode 100644 index 0000000..0ff21cc --- /dev/null +++ b/cloud/aws/beanstalk/README.md @@ -0,0 +1,79 @@ +# CLOUD AWS BEANSTALK DataDog monitors + +## How to use this module + +``` +module "datadog-monitors-cloud-aws-beanstalk" { + source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/beanstalk?ref={revision}" + + environment = var.environment + message = module.datadog-message-alerting.alerting-message +} + +``` + +## Purpose + +Creates DataDog monitors with the following checks: + +- Beanstalk Application 5xx error rate +- Beanstalk Application latency p90 +- Beanstalk Environment health +- Beanstalk Instance root file system usage + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| 
application\_5xx\_error\_rate\_enabled | Flag to enable Beanstalk application 5xx error rate monitor | string | `"true"` | no | +| application\_5xx\_error\_rate\_extra\_tags | Extra tags for application 5xx error rate monitor | list(string) | `[]` | no | +| application\_5xx\_error\_rate\_message | Custom message for application 5xx error rate | string | `""` | no | +| application\_5xx\_error\_rate\_threshold\_critical | 5xx Error rate critical threshold in percent | string | `"5"` | no | +| application\_5xx\_error\_rate\_threshold\_warning | 5xx Error rate warning threshold in percent | string | `"3"` | no | +| application\_5xx\_error\_rate\_time\_aggregator | Monitor aggregator for beanstalk application 5xx error rate [available values: min, max, sum or avg] | string | `"sum"` | no | +| application\_5xx\_error\_rate\_timeframe | Monitor timeframe for beanstalk application 5xx error rate [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | +| application\_latency\_p90\_enabled | Flag to enable Beanstalk application latency P90 monitor | string | `"true"` | no | +| application\_latency\_p90\_extra\_tags | Extra tags for application latency P90 monitor | list(string) | `[]` | no | +| application\_latency\_p90\_message | Custom message for application latency P90 monitor | string | `""` | no | +| application\_latency\_p90\_threshold\_critical | P90 Latency critical threshold in seconds | string | `"0.5"` | no | +| application\_latency\_p90\_threshold\_warning | P90 Latency warning threshold in seconds | string | `"0.3"` | no | +| application\_latency\_p90\_time\_aggregator | Monitor aggregator for beanstalk application latency P90 [available values: min, max or avg] | string | `"min"` | no | +| application\_latency\_p90\_timeframe | Monitor timeframe for beanstalk application latency P90 [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | 
+| environment | Architecture Environment | string | n/a | yes | +| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no | +| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no | +| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | +| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | +| health\_enabled | Flag to enable Beanstalk Health monitor | string | `"true"` | no | +| health\_extra\_tags | Extra tags for health monitor | list(string) | `[]` | no | +| health\_message | Custom message for health monitor | string | `""` | no | +| health\_threshold\_critical | Health critical threshold (see the `aws.elasticbeanstalk.environment_health` values in the Datadog documentation) | string | `"20"` | no | +| health\_threshold\_warning | Health warning threshold (see the `aws.elasticbeanstalk.environment_health` values in the Datadog documentation) | string | `"15"` | no | +| health\_time\_aggregator | Monitor aggregator for beanstalk health [available values: min, max or avg] | string | `"min"` | no | +| health\_timeframe | Monitor timeframe for beanstalk health [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_10m"` | no | +| message | Message sent when an alert is triggered | string | n/a | yes | +| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | +| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | +| root\_filesystem\_usage\_aggregator | Monitor aggregator for beanstalk instance file system usage [available values: min, max or avg] | string | `"max"` | no | +| root\_filesystem\_usage\_enabled | Flag to enable Beanstalk instance file system usage monitor | string | `"true"` | no | +| 
root\_filesystem\_usage\_extra\_tags | Extra tags for file system usage monitor | list(string) | `[]` | no | +| root\_filesystem\_usage\_message | Custom message for application file system usage | string | `""` | no | +| root\_filesystem\_usage\_threshold\_critical | File system usage critical threshold in percent | string | `"90"` | no | +| root\_filesystem\_usage\_threshold\_warning | File system usage warning threshold in percent | string | `"80"` | no | +| root\_filesystem\_usage\_timeframe | Monitor timeframe for beanstalk instance file system usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| root\_filesystem\_usage\_timeout\_h | File system usage auto-resolving state (in hours) | string | `"0"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| application\_5xx\_error\_rate\_id | id for monitor application_5xx_error_rate | +| application\_latency\_p90\_id | id for monitor application_latency_p90 | +| health\_id | id for monitor health | +| root\_filesystem\_usage\_id | id for monitor root_filesystem_usage | + +## Related documentation + +Datadog documentation: [https://docs.datadoghq.com/integrations/amazon_elasticbeanstalk/](https://docs.datadoghq.com/integrations/amazon_elasticbeanstalk/) + +AWS Beanstalk Environment monitoring : [https://docs.aws.amazon.com/elasticbeanstalk/latest/dg/environments-health.html](https://docs.aws.amazon.com/elasticbeanstalk/latest/dg/environments-health.html) diff --git a/cloud/aws/beanstalk/inputs.tf b/cloud/aws/beanstalk/inputs.tf new file mode 100644 index 0000000..cfd5c7d --- /dev/null +++ b/cloud/aws/beanstalk/inputs.tf @@ -0,0 +1,208 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = string +} + +# Global DataDog +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +variable "new_host_delay" { + description = 
"Delay in seconds before monitor new resource" + default = 300 +} + +variable "prefix_slug" { + description = "Prefix string to prepend between brackets on every monitors names" + default = "" +} + +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "filter_tags_custom_excluded" { + description = "Tags excluded for custom filtering when filter_tags_use_defaults is false" + default = "" +} + +# AWS Beanstalk monitor variables + +variable "health_enabled" { + description = "Flag to enable Beanstalk Health monitor" + type = string + default = "true" +} + +variable "health_message" { + description = "Custom message for health monitor" + default = "" +} + +variable "health_time_aggregator" { + description = "Monitor aggregator for beanstalk health [available values: min, max or avg]" + type = string + default = "min" +} + +variable "health_timeframe" { + description = "Monitor timeframe for beanstalk health [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_10m" +} + +variable "health_threshold_critical" { + description = "Health critical threshold (see the `aws.elasticbeanstalk.environment_health` values in the Datadog documentation)" + default = 20 +} + +variable "health_threshold_warning" { + description = "Health warning threshold (see the `aws.elasticbeanstalk.environment_health` values in the Datadog documentation)" + default = 15 +} + +variable "health_extra_tags" { + description = "Extra tags for health monitor" + type = list(string) + default = [] +} + +variable "application_latency_p90_enabled" { + description = "Flag to enable Beanstalk application latency P90 monitor" + type = string + 
default = "true" +} + +variable "application_latency_p90_message" { + description = "Custom message for application latency P90 monitor" + default = "" +} + +variable "application_latency_p90_time_aggregator" { + description = "Monitor aggregator for beanstalk application latency P90 [available values: min, max or avg]" + type = string + default = "min" +} + +variable "application_latency_p90_timeframe" { + description = "Monitor timeframe for beanstalk application latency P90 [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_15m" +} + +variable "application_latency_p90_threshold_critical" { + description = "P90 Latency critical threshold in seconds" + default = 0.5 +} + +variable "application_latency_p90_threshold_warning" { + description = "P90 Latency warning threshold in seconds" + type = string + default = 0.3 +} + +variable "application_latency_p90_extra_tags" { + description = "Extra tags for application latency P90 monitor" + type = list(string) + default = [] +} + +variable "application_5xx_error_rate_enabled" { + description = "Flag to enable Beanstalk application 5xx error rate monitor" + type = string + default = "true" +} + +variable "application_5xx_error_rate_message" { + description = "Custom message for application 5xx error rate" + default = "" +} + +variable "application_5xx_error_rate_time_aggregator" { + description = "Monitor aggregator for beanstalk application 5xx error rate [available values: min, max, sum or avg]" + type = string + default = "sum" +} + +variable "application_5xx_error_rate_timeframe" { + description = "Monitor timeframe for beanstalk application 5xx error rate [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_15m" +} + +variable "application_5xx_error_rate_threshold_critical" { + description = "5xx Error rate critical threshold in percent" + default = 5 +} + +variable 
"application_5xx_error_rate_threshold_warning" { + description = "5xx Error rate warning threshold in percent" + type = string + default = 3 +} + +variable "application_5xx_error_rate_extra_tags" { + description = "Extra tags for application 5xx error rate monitor" + type = list(string) + default = [] +} + +variable "root_filesystem_usage_enabled" { + description = "Flag to enable Beanstalk instance file system usage monitor" + type = string + default = "true" +} + +variable "root_filesystem_usage_message" { + description = "Custom message for application file system usage" + default = "" +} + +variable "root_filesystem_usage_aggregator" { + description = "Monitor aggregator for beanstalk instance file system usage [available values: min, max or avg]" + type = string + default = "max" +} + +variable "root_filesystem_usage_timeframe" { + description = "Monitor timeframe for beanstalk instance file system usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "root_filesystem_usage_threshold_critical" { + description = "File system usage critical threshold in percent" + type = string + default = 90 +} + +variable "root_filesystem_usage_threshold_warning" { + description = "File system usage warning threshold in percent" + type = string + default = 80 +} + +variable "root_filesystem_usage_timeout_h" { + description = "File system usage auto-resolving state (in hours)" + default = 0 +} + +variable "root_filesystem_usage_extra_tags" { + description = "Extra tags for file system usage monitor" + type = list(string) + default = [] +} + diff --git a/cloud/aws/beanstalk/modules.tf b/cloud/aws/beanstalk/modules.tf new file mode 100644 index 0000000..560dd86 --- /dev/null +++ b/cloud/aws/beanstalk/modules.tf @@ -0,0 +1,34 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = var.environment + resource = "aws_beanstalk" + filter_tags_use_defaults = 
var.filter_tags_use_defaults + filter_tags_custom = var.filter_tags_custom + filter_tags_custom_excluded = var.filter_tags_custom_excluded +} + +# With AWS beanstalk some metrics are sent per host and per beanstalk env. +# This is particularly the case for all the ApplicationLatency metrics and +# the ApplicationRequests (not for the health and the cpu/disk metrics). +# The best way to find this out is to go on the monitoring configuration page +# of your beanstalk environment. +# +# In order to differentiate those metrics we need to do some exclusion to +# find out which values have been sent for the hosts and which ones for +# the environment itself. +# Some automatic tags are added on the instances by AWS, this seems to be +# the only way to filter at the moment. +# +# This filter excludes the metrics sent for the hosts. +module "filter-tags-no-host" { + source = "../../../common/filter-tags" + + environment = var.environment + resource = "aws_beanstalk" + filter_tags_use_defaults = var.filter_tags_use_defaults + filter_tags_custom = var.filter_tags_custom + filter_tags_custom_excluded = var.filter_tags_custom_excluded + extra_tags_excluded = ["aws_cloudformation_logical-id:awsebautoscalinggroup"] +} + diff --git a/cloud/aws/beanstalk/monitors-beanstalk.tf b/cloud/aws/beanstalk/monitors-beanstalk.tf new file mode 100644 index 0000000..a9e3232 --- /dev/null +++ b/cloud/aws/beanstalk/monitors-beanstalk.tf @@ -0,0 +1,125 @@ +### Beanstalk environment health ### +resource "datadog_monitor" "health" { + count = var.health_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Beanstalk Environment health {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}} : either degraded or severe){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}} : warning){{/is_warning}}" + message = coalesce(var.health_message, var.message) + type = "metric alert" + + query = <= ${var.health_threshold_critical} +EOQ + + thresholds = { + critical = var.health_threshold_critical + warning = var.health_threshold_warning + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = true + notify_audit = false + timeout_h = 0 + include_tags = true + require_full_window = false + locked = false + + tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:beanstalk", "team:claranet", "created-by:terraform"], var.health_extra_tags) + + lifecycle { + ignore_changes = [silenced] + } +} + +resource "datadog_monitor" "application_latency_p90" { + count = var.application_latency_p90_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Beanstalk Application latency p90 {{#is_alert}}{{{comparator}}} {{threshold}}sec ({{value}}sec){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}sec ({{value}}sec){{/is_warning}}" + message = coalesce(var.application_latency_p90_message, var.message) + type = "metric alert" + + query = <= ${var.application_latency_p90_threshold_critical} +EOQ + + thresholds = { + critical = var.application_latency_p90_threshold_critical + warning = var.application_latency_p90_threshold_warning + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + require_full_window = false + locked = false + + tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:beanstalk", "team:claranet", "created-by:terraform"], var.application_latency_p90_extra_tags) + + lifecycle { + ignore_changes = [silenced] + } +} + +resource "datadog_monitor" "application_5xx_error_rate" { + count = var.application_5xx_error_rate_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Beanstalk Application 5xx error rate {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.application_5xx_error_rate_message, var.message) + type = "query alert" + + query = < ${var.application_5xx_error_rate_threshold_critical} +EOQ + + thresholds = { + critical = var.application_5xx_error_rate_threshold_critical + warning = var.application_5xx_error_rate_threshold_warning + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + require_full_window = false + locked = false + + tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:beanstalk", "team:claranet", "created-by:terraform"], var.application_5xx_error_rate_extra_tags) + + lifecycle { + ignore_changes = [silenced] + } +} + +resource "datadog_monitor" "root_filesystem_usage" { + count = var.root_filesystem_usage_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Beanstalk Instance root file system usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.root_filesystem_usage_message, var.message) + type = "metric alert" + + query = < ${var.root_filesystem_usage_threshold_critical} +EOQ + + thresholds = { + critical = var.root_filesystem_usage_threshold_critical + warning = var.root_filesystem_usage_threshold_warning + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + notify_audit = false + timeout_h = var.root_filesystem_usage_timeout_h + include_tags = true + require_full_window = false + locked = false + + tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:beanstalk", "team:claranet", "created-by:terraform"], var.root_filesystem_usage_extra_tags) + + lifecycle { + ignore_changes = [silenced] + } +} + diff --git a/cloud/aws/beanstalk/outputs.tf b/cloud/aws/beanstalk/outputs.tf new file mode 100644 index 0000000..ad6d3ed --- /dev/null +++ b/cloud/aws/beanstalk/outputs.tf @@ -0,0 +1,20 @@ +output "application_5xx_error_rate_id" { + description = "id for monitor application_5xx_error_rate" + value = datadog_monitor.application_5xx_error_rate.*.id +} + +output "application_latency_p90_id" { + description = "id for monitor application_latency_p90" + value = datadog_monitor.application_latency_p90.*.id +} + +output "health_id" { + description = "id for monitor health" + value = datadog_monitor.health.*.id +} + +output "root_filesystem_usage_id" { + description = "id for monitor root_filesystem_usage" + value = datadog_monitor.root_filesystem_usage.*.id +} +