Merge branch 'MON-525_monitors-beanstalk' into 'master'

MON-525 "Monitors beanstalk"

Closes MON-525

See merge request claranet/pt-monitoring/projects/datadog/terraform/monitors!136
This commit is contained in:
Quentin Manfroi 2019-11-21 15:39:49 +01:00
commit a4143a334f
6 changed files with 467 additions and 0 deletions

View File

@ -140,6 +140,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
- [aws](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/) - [aws](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/)
- [alb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/alb/) - [alb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/alb/)
- [apigateway](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/apigateway/) - [apigateway](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/apigateway/)
- [beanstalk](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/beanstalk/)
- [ecs](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/) - [ecs](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/)
- [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/common/) - [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/common/)
- [ec2-cluster](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/ec2-cluster/) - [ec2-cluster](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/ec2-cluster/)

View File

@ -0,0 +1,79 @@
# CLOUD AWS BEANSTALK DataDog monitors
## How to use this module
```
module "datadog-monitors-cloud-aws-beanstalk" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/beanstalk?ref={revision}"
environment = var.environment
message = module.datadog-message-alerting.alerting-message
}
```
## Purpose
Creates DataDog monitors with the following checks:
- Beanstalk Application 5xx error rate
- Beanstalk Application latency p90
- Beanstalk Environment health
- Beanstalk Instance root file system usage
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| application\_5xx\_error\_rate\_enabled | Flag to enable Beanstalk application 5xx error rate monitor | string | `"true"` | no |
| application\_5xx\_error\_rate\_extra\_tags | Extra tags for application 5xx error rate monitor | list(string) | `[]` | no |
| application\_5xx\_error\_rate\_message | Custom message for application 5xx error rate | string | `""` | no |
| application\_5xx\_error\_rate\_threshold\_critical | 5xx Error rate critical threshold in percent | string | `"5"` | no |
| application\_5xx\_error\_rate\_threshold\_warning | 5xx Error rate warning threshold in percent | string | `"3"` | no |
| application\_5xx\_error\_rate\_time\_aggregator | Monitor aggregator for beanstalk application 5xx error rate [available values: min, max, avg or sum] | string | `"sum"` | no |
| application\_5xx\_error\_rate\_timeframe | Monitor timeframe for beanstalk application 5xx error rate [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| application\_latency\_p90\_enabled | Flag to enable Beanstalk application latency P90 monitor | string | `"true"` | no |
| application\_latency\_p90\_extra\_tags | Extra tags for application latency P90 monitor | list(string) | `[]` | no |
| application\_latency\_p90\_message | Custom message for application latency P90 monitor | string | `""` | no |
| application\_latency\_p90\_threshold\_critical | P90 Latency critical threshold in seconds | string | `"0.5"` | no |
| application\_latency\_p90\_threshold\_warning | P90 Latency warning threshold in seconds | string | `"0.3"` | no |
| application\_latency\_p90\_time\_aggregator | Monitor aggregator for beanstalk application latency P90 [available values: min, max or avg] | string | `"min"` | no |
| application\_latency\_p90\_timeframe | Monitor timeframe for beanstalk application latency P90 [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| environment | Architecture Environment | string | n/a | yes |
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| health\_enabled | Flag to enable Beanstalk Health monitor | string | `"true"` | no |
| health\_extra\_tags | Extra tags for health monitor | list(string) | `[]` | no |
| health\_message | Custom message for health monitor | string | `""` | no |
| health\_threshold\_critical | Health critical threshold (see the `aws.elasticbeanstalk.environment_health` values in the Datadog documentation) | string | `"20"` | no |
| health\_threshold\_warning | Health warning threshold (see the `aws.elasticbeanstalk.environment_health` values in the Datadog documentation) | string | `"15"` | no |
| health\_time\_aggregator | Monitor aggregator for beanstalk health [available values: min, max or avg] | string | `"min"` | no |
| health\_timeframe | Monitor timeframe for beanstalk health [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_10m"` | no |
| message | Message sent when an alert is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| root\_filesystem\_usage\_aggregator | Monitor aggregator for beanstalk instance file system usage [available values: min, max or avg] | string | `"max"` | no |
| root\_filesystem\_usage\_enabled | Flag to enable Beanstalk instance file system usage monitor | string | `"true"` | no |
| root\_filesystem\_usage\_extra\_tags | Extra tags for file system usage monitor | list(string) | `[]` | no |
| root\_filesystem\_usage\_message | Custom message for application file system usage | string | `""` | no |
| root\_filesystem\_usage\_threshold\_critical | File system usage critical threshold in percent | string | `"90"` | no |
| root\_filesystem\_usage\_threshold\_warning | File system usage warning threshold in percent | string | `"80"` | no |
| root\_filesystem\_usage\_timeframe | Monitor timeframe for beanstalk instance file system usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| root\_filesystem\_usage\_timeout\_h | File system usage auto-resolving state (in hours) | string | `"0"` | no |
## Outputs
| Name | Description |
|------|-------------|
| application\_5xx\_error\_rate\_id | id for monitor application_5xx_error_rate |
| application\_latency\_p90\_id | id for monitor application_latency_p90 |
| health\_id | id for monitor health |
| root\_filesystem\_usage\_id | id for monitor root_filesystem_usage |
## Related documentation
Datadog documentation: [https://docs.datadoghq.com/integrations/amazon_elasticbeanstalk/](https://docs.datadoghq.com/integrations/amazon_elasticbeanstalk/)
AWS Beanstalk Environment monitoring : [https://docs.aws.amazon.com/elasticbeanstalk/latest/dg/environments-health.html](https://docs.aws.amazon.com/elasticbeanstalk/latest/dg/environments-health.html)

View File

@ -0,0 +1,208 @@
# -----------------------------------------------------------------------------
# Global Terraform
# -----------------------------------------------------------------------------

# Required: no default is declared, so callers must always set it.
variable "environment" {
  type        = string
  description = "Architecture Environment"
}

# -----------------------------------------------------------------------------
# Global DataDog
# -----------------------------------------------------------------------------

# Forwarded as-is to the `evaluation_delay` argument of every monitor.
variable "evaluation_delay" {
  default     = 900
  description = "Delay in seconds for the metric evaluation"
}

# Forwarded as-is to the `new_host_delay` argument of every monitor.
variable "new_host_delay" {
  default     = 300
  description = "Delay in seconds before monitor new resource"
}

# An empty string means "no prefix": monitor names then start directly with
# the "[<environment>]" tag.
variable "prefix_slug" {
  default     = ""
  description = "Prefix string to prepend between brackets on every monitors names"
}

# Required: fallback notification message used by a monitor whenever its own
# `*_message` variable is left empty.
variable "message" {
  description = "Message sent when an alert is triggered"
}

# The three variables below are passed untouched to the common `filter-tags`
# module, which turns them into the tag filter of each monitor query.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}

variable "filter_tags_custom_excluded" {
  default     = ""
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
}
# -----------------------------------------------------------------------------
# AWS Beanstalk monitor variables
# -----------------------------------------------------------------------------

#
# Environment health monitor
#

# The monitor is only created when this is exactly the string "true".
variable "health_enabled" {
  description = "Flag to enable Beanstalk Health monitor"
  type        = string
  default     = "true"
}

# When left empty the monitor falls back to the global `message` variable.
variable "health_message" {
  description = "Custom message for health monitor"
  default     = ""
}

variable "health_time_aggregator" {
  description = "Monitor aggregator for beanstalk health [available values: min, max or avg]"
  type        = string
  default     = "min"
}

variable "health_timeframe" {
  description = "Monitor timeframe for beanstalk health [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = string
  default     = "last_10m"
}

# Both thresholds are compared with `>=` against the
# `aws.elasticbeanstalk.environment_health` metric. The monitor name labels
# the critical level as "either degraded or severe" and the warning level as
# "warning".
variable "health_threshold_critical" {
  description = "Health critical threshold (see the `aws.elasticbeanstalk.environment_health` values in the Datadog documentation)"
  default     = 20
}

# Fix: this description previously read "Health critical threshold" (copied
# from the variable above) although it documents the warning level.
variable "health_threshold_warning" {
  description = "Health warning threshold (see the `aws.elasticbeanstalk.environment_health` values in the Datadog documentation)"
  default     = 15
}

# Appended to the fixed set of tags attached to the monitor.
variable "health_extra_tags" {
  description = "Extra tags for health monitor"
  type        = list(string)
  default     = []
}
#
# Application latency P90 monitor
#

# The monitor is only created when this is exactly the string "true".
variable "application_latency_p90_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Beanstalk application latency P90 monitor"
}

# When left empty the monitor falls back to the global `message` variable.
variable "application_latency_p90_message" {
  default     = ""
  description = "Custom message for application latency P90 monitor"
}

variable "application_latency_p90_time_aggregator" {
  type        = string
  default     = "min"
  description = "Monitor aggregator for beanstalk application latency P90 [available values: min, max or avg]"
}

variable "application_latency_p90_timeframe" {
  type        = string
  default     = "last_15m"
  description = "Monitor timeframe for beanstalk application latency P90 [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# Thresholds are expressed in seconds and compared with `>=` in the query.
variable "application_latency_p90_threshold_critical" {
  default     = 0.5
  description = "P90 Latency critical threshold in seconds"
}

variable "application_latency_p90_threshold_warning" {
  type        = string
  default     = 0.3
  description = "P90 Latency warning threshold in seconds"
}

# Appended to the fixed set of tags attached to the monitor.
variable "application_latency_p90_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for application latency P90 monitor"
}
#
# Application 5xx error rate monitor
#

# The monitor is only created when this is exactly the string "true".
# Fix: the description previously contained the typo "error ratemonitor".
variable "application_5xx_error_rate_enabled" {
  description = "Flag to enable Beanstalk application 5xx error rate monitor"
  type        = string
  default     = "true"
}

# When left empty the monitor falls back to the global `message` variable.
variable "application_5xx_error_rate_message" {
  description = "Custom message for application 5xx error rate"
  default     = ""
}

# Fix: the description previously listed only "min, max or avg" although the
# default declared here is "sum"; the documented value set now includes it.
variable "application_5xx_error_rate_time_aggregator" {
  description = "Monitor aggregator for beanstalk application 5xx error rate [available values: min, max, avg or sum]"
  type        = string
  default     = "sum"
}

variable "application_5xx_error_rate_timeframe" {
  description = "Monitor timeframe for beanstalk application 5xx error rate [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = string
  default     = "last_15m"
}

# Thresholds are percentages: the query multiplies the 5xx / total request
# ratio by 100 before comparing it with `>`.
variable "application_5xx_error_rate_threshold_critical" {
  description = "5xx Error rate critical threshold in percent"
  default     = 5
}

variable "application_5xx_error_rate_threshold_warning" {
  description = "5xx Error rate warning threshold in percent"
  type        = string
  default     = 3
}

# Appended to the fixed set of tags attached to the monitor.
variable "application_5xx_error_rate_extra_tags" {
  description = "Extra tags for application 5xx error rate monitor"
  type        = list(string)
  default     = []
}
#
# Instance root file system usage monitor
#

# The monitor is only created when this is exactly the string "true".
variable "root_filesystem_usage_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Beanstalk instance file system usage monitor"
}

# When left empty the monitor falls back to the global `message` variable.
variable "root_filesystem_usage_message" {
  default     = ""
  description = "Custom message for application file system usage"
}

# Used as the time aggregator of the monitor query. Note the name has no
# `_time_` infix, unlike the equivalent variables of the other monitors.
variable "root_filesystem_usage_aggregator" {
  type        = string
  default     = "max"
  description = "Monitor aggregator for beanstalk instance file system usage [available values: min, max or avg]"
}

variable "root_filesystem_usage_timeframe" {
  type        = string
  default     = "last_5m"
  description = "Monitor timeframe for beanstalk instance file system usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# Thresholds are percentages and compared with `>` in the query.
variable "root_filesystem_usage_threshold_critical" {
  type        = string
  default     = 90
  description = "File system usage critical threshold in percent"
}

variable "root_filesystem_usage_threshold_warning" {
  type        = string
  default     = 80
  description = "File system usage warning threshold in percent"
}

# Passed to the monitor's `timeout_h` argument.
variable "root_filesystem_usage_timeout_h" {
  default     = 0
  description = "File system usage auto-resolving state (in hours)"
}

# Appended to the fixed set of tags attached to the monitor.
variable "root_filesystem_usage_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for file system usage monitor"
}

View File

@ -0,0 +1,34 @@
# Tag filter for the monitors that read metrics without any host/environment
# distinction. Its `query_alert` output is interpolated into the queries of
# the environment health and root file system usage monitors.
module "filter-tags" {
  source = "../../../common/filter-tags"

  resource    = "aws_beanstalk"
  environment = var.environment

  filter_tags_use_defaults    = var.filter_tags_use_defaults
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
}
# With AWS Beanstalk, some metrics are sent both per host and per Beanstalk
# environment. This is notably the case for all the ApplicationLatency metrics
# and for ApplicationRequests (but not for the health or the cpu/disk
# metrics). The easiest way to check this is the monitoring configuration page
# of your Beanstalk environment.
#
# To tell those series apart we have to exclude some of them, so that we know
# which values were sent for the hosts and which ones were sent for the
# environment itself. AWS automatically adds some tags on the instances, and
# this seems to be the only way to filter at the moment.
#
# This filter excludes the metrics sent for the hosts. Its `query_alert`
# output is used by the application latency and 5xx error rate monitors.
module "filter-tags-no-host" {
  source = "../../../common/filter-tags"

  resource    = "aws_beanstalk"
  environment = var.environment

  filter_tags_use_defaults    = var.filter_tags_use_defaults
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded

  # Tag carried by the series to drop (the per-host ones).
  extra_tags_excluded = ["aws_cloudformation_logical-id:awsebautoscalinggroup"]
}

View File

@ -0,0 +1,125 @@
### Beanstalk environment health ###
# Alerts when `aws.elasticbeanstalk.environment_health`, grouped by region and
# Beanstalk environment, reaches the configured thresholds. Created only when
# `health_enabled` is "true". This is the only monitor of the module that also
# notifies on missing data.
resource "datadog_monitor" "health" {
  count = var.health_enabled == "true" ? 1 : 0

  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Beanstalk Environment health {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}} : either degraded or severe){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}} : warning){{/is_warning}}"
  type = "metric alert"

  # The monitor-specific message wins; the global one is the fallback.
  message = coalesce(var.health_message, var.message)

  # Heredoc body is kept flush-left so the rendered query string is unchanged.
  query = <<EOQ
${var.health_time_aggregator}(${var.health_timeframe}):min:aws.elasticbeanstalk.environment_health${module.filter-tags.query_alert} by {region,elasticbeanstalk_environment-name} >= ${var.health_threshold_critical}
EOQ

  thresholds = {
    critical = var.health_threshold_critical
    warning  = var.health_threshold_warning
  }

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  notify_no_data      = true
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:beanstalk", "team:claranet", "created-by:terraform"], var.health_extra_tags)

  # Changes to `silenced` made outside Terraform must not produce a diff.
  lifecycle {
    ignore_changes = [silenced]
  }
}
### Beanstalk application latency P90 ###
# Alerts when `aws.elasticbeanstalk.application_latency_p_9_0`, grouped by
# region and Beanstalk environment, reaches the configured thresholds (in
# seconds). Uses the "no host" filter so that only the series sent for the
# environment are evaluated. Created only when
# `application_latency_p90_enabled` is "true".
resource "datadog_monitor" "application_latency_p90" {
  count = var.application_latency_p90_enabled == "true" ? 1 : 0

  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Beanstalk Application latency p90 {{#is_alert}}{{{comparator}}} {{threshold}}sec ({{value}}sec){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}sec ({{value}}sec){{/is_warning}}"
  type = "metric alert"

  # The monitor-specific message wins; the global one is the fallback.
  message = coalesce(var.application_latency_p90_message, var.message)

  # Heredoc body is kept flush-left so the rendered query string is unchanged.
  query = <<EOQ
${var.application_latency_p90_time_aggregator}(${var.application_latency_p90_timeframe}):min:aws.elasticbeanstalk.application_latency_p_9_0${module.filter-tags-no-host.query_alert} by {region,elasticbeanstalk_environment-name} >= ${var.application_latency_p90_threshold_critical}
EOQ

  thresholds = {
    critical = var.application_latency_p90_threshold_critical
    warning  = var.application_latency_p90_threshold_warning
  }

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  notify_no_data      = false
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:beanstalk", "team:claranet", "created-by:terraform"], var.application_latency_p90_extra_tags)

  # Changes to `silenced` made outside Terraform must not produce a diff.
  lifecycle {
    ignore_changes = [silenced]
  }
}
### Beanstalk application 5xx error rate ###
# Alerts when the ratio of 5xx requests to total requests, as a percentage,
# exceeds the configured thresholds. Both metrics are taken as rates, grouped
# by region and Beanstalk environment, and restricted by the "no host" filter
# so that only the series sent for the environment are evaluated. Created only
# when `application_5xx_error_rate_enabled` is "true".
resource "datadog_monitor" "application_5xx_error_rate" {
  count = var.application_5xx_error_rate_enabled == "true" ? 1 : 0

  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Beanstalk Application 5xx error rate {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  type = "query alert"

  # The monitor-specific message wins; the global one is the fallback.
  message = coalesce(var.application_5xx_error_rate_message, var.message)

  # Heredoc body is kept flush-left so the rendered query string is unchanged.
  query = <<EOQ
${var.application_5xx_error_rate_time_aggregator}(${var.application_5xx_error_rate_timeframe}):sum:aws.elasticbeanstalk.application_requests_5xx${module.filter-tags-no-host.query_alert} by {region,elasticbeanstalk_environment-name}.as_rate() / sum:aws.elasticbeanstalk.application_requests_total${module.filter-tags-no-host.query_alert} by {region,elasticbeanstalk_environment-name}.as_rate() * 100 > ${var.application_5xx_error_rate_threshold_critical}
EOQ

  thresholds = {
    critical = var.application_5xx_error_rate_threshold_critical
    warning  = var.application_5xx_error_rate_threshold_warning
  }

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  notify_no_data      = false
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:beanstalk", "team:claranet", "created-by:terraform"], var.application_5xx_error_rate_extra_tags)

  # Changes to `silenced` made outside Terraform must not produce a diff.
  lifecycle {
    ignore_changes = [silenced]
  }
}
### Beanstalk instance root file system usage ###
# Alerts when `aws.elasticbeanstalk.root_filesystem_util` exceeds the
# configured thresholds (in percent). The series are grouped by region,
# Beanstalk environment and host. Created only when
# `root_filesystem_usage_enabled` is "true". Unlike the other monitors of this
# module, `timeout_h` is configurable here.
resource "datadog_monitor" "root_filesystem_usage" {
  count = var.root_filesystem_usage_enabled == "true" ? 1 : 0

  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Beanstalk Instance root file system usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  type = "metric alert"

  # The monitor-specific message wins; the global one is the fallback.
  message = coalesce(var.root_filesystem_usage_message, var.message)

  # Heredoc body is kept flush-left so the rendered query string is unchanged.
  query = <<EOQ
${var.root_filesystem_usage_aggregator}(${var.root_filesystem_usage_timeframe}):min:aws.elasticbeanstalk.root_filesystem_util${module.filter-tags.query_alert} by {region,elasticbeanstalk_environment-name,host} > ${var.root_filesystem_usage_threshold_critical}
EOQ

  thresholds = {
    critical = var.root_filesystem_usage_threshold_critical
    warning  = var.root_filesystem_usage_threshold_warning
  }

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  notify_no_data      = false
  notify_audit        = false
  timeout_h           = var.root_filesystem_usage_timeout_h
  include_tags        = true
  require_full_window = false
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:beanstalk", "team:claranet", "created-by:terraform"], var.root_filesystem_usage_extra_tags)

  # Changes to `silenced` made outside Terraform must not produce a diff.
  lifecycle {
    ignore_changes = [silenced]
  }
}

View File

@ -0,0 +1,20 @@
# Every monitor is declared with `count`, so each output is a list of ids:
# one element when the monitor is enabled, empty when it is disabled.

output "application_5xx_error_rate_id" {
  value       = datadog_monitor.application_5xx_error_rate[*].id
  description = "id for monitor application_5xx_error_rate"
}

output "application_latency_p90_id" {
  value       = datadog_monitor.application_latency_p90[*].id
  description = "id for monitor application_latency_p90"
}

output "health_id" {
  value       = datadog_monitor.health[*].id
  description = "id for monitor health"
}

output "root_filesystem_usage_id" {
  value       = datadog_monitor.root_filesystem_usage[*].id
  description = "id for monitor root_filesystem_usage"
}