Add monitors related to ECS (common to fargate / EC2 cluster, and dedicated to EC2 cluster)

This commit is contained in:
David Drugeon Hamon 2019-10-18 18:34:53 +02:00 committed by Quentin Manfroi
parent 793d34e627
commit f65adb8ff5
11 changed files with 739 additions and 0 deletions

View File

@ -140,6 +140,9 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
- [aws](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/)
- [alb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/alb/)
- [apigateway](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/apigateway/)
- [ecs](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/)
- [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/common/)
- [ec2-cluster](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/ec2-cluster/)
- [elasticache](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticache/)
- [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticache/common/)
- [memcached](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticache/memcached/)

View File

@ -0,0 +1,67 @@
# CLOUD AWS ECS COMMON DataDog monitors
## How to use this module
```
module "datadog-monitors-cloud-aws-ecs-common" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/ecs/common?ref={revision}"
environment = var.environment
message = module.datadog-message-alerting.alerting-message
}
```
## Purpose
Creates DataDog monitors with the following checks:
- ECS Service CPU Utilization High (disabled by default)
- ECS Service Memory Utilization High (disabled by default)
- ECS Service not healthy enough
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| environment | Architecture environment | string | n/a | yes |
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| filter\_tags | Tags used for filtering | string | `"*"` | no |
| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| message | Message sent when a monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| service\_cpu\_utilization\_enabled | Flag to enable Service CPU Utilization monitor | string | `"false"` | no |
| service\_cpu\_utilization\_extra\_tags | Extra tags for Service CPU Utilization monitor | list(string) | `[]` | no |
| service\_cpu\_utilization\_message | Custom message for the Service CPU Utilization monitor | string | `""` | no |
| service\_cpu\_utilization\_threshold\_critical | Critical threshold for the Service CPU Utilization monitor | string | `"90"` | no |
| service\_cpu\_utilization\_threshold\_warning | Warning threshold for the Service CPU Utilization monitor | string | `"80"` | no |
| service\_cpu\_utilization\_time\_aggregator | Monitor aggregator for Service CPU Utilization [available values: min, max or avg] | string | `"min"` | no |
| service\_cpu\_utilization\_timeframe | Timeframe for the Service CPU Utilization monitor | string | `"last_5m"` | no |
| service\_memory\_utilization\_enabled | Flag to enable Service Memory Utilization monitor | string | `"false"` | no |
| service\_memory\_utilization\_extra\_tags | Extra tags for Service Memory Utilization monitor | list(string) | `[]` | no |
| service\_memory\_utilization\_message | Custom message for the Service Memory Utilization monitor | string | `""` | no |
| service\_memory\_utilization\_threshold\_critical | Critical threshold for the Service Memory Utilization monitor | string | `"90"` | no |
| service\_memory\_utilization\_threshold\_warning | Warning threshold for the Service Memory Utilization monitor | string | `"85"` | no |
| service\_memory\_utilization\_time\_aggregator | Monitor aggregator for Service Memory Utilization [available values: min, max or avg] | string | `"min"` | no |
| service\_memory\_utilization\_timeframe | Timeframe for the Service Memory Utilization monitor | string | `"last_5m"` | no |
| service\_missing\_tasks\_enabled | Flag to enable Service Missing Tasks monitor | string | `"true"` | no |
| service\_missing\_tasks\_extra\_tags | Extra tags for Service Missing Tasks monitor | list(string) | `[]` | no |
| service\_missing\_tasks\_message | Custom message for the Service Missing Tasks monitor | string | `""` | no |
| service\_missing\_tasks\_threshold\_critical | Critical threshold for the Service Missing Tasks monitor | string | `"60"` | no |
| service\_missing\_tasks\_threshold\_warning | Warning threshold for the Service Missing Tasks monitor | string | `"80"` | no |
| service\_missing\_tasks\_time\_aggregator | Monitor aggregator for Service Missing Tasks [available values: min, max or avg] | string | `"min"` | no |
| service\_missing\_tasks\_timeframe | Timeframe for the Service Missing Tasks monitor | string | `"last_5m"` | no |
## Outputs
| Name | Description |
|------|-------------|
| service\_cpu\_utilization\_id | id for monitor service_cpu_utilization |
| service\_memory\_utilization\_id | id for monitor service_memory_utilization |
| service\_missing\_tasks\_id | id for monitor service_missing_tasks |
## Related documentation

View File

@ -0,0 +1,181 @@
#
# Datadog global variables
#
# Inputs shared by every monitor declared in this module.
#

# Environment name (required: no default is declared).
variable "environment" {
  type        = string
  description = "Architecture environment"
}

# Tag filter expression; defaults to the "*" wildcard.
variable "filter_tags" {
  default     = "*"
  description = "Tags used for filtering"
}

# Default notification message (required: no default is declared).
variable "message" {
  description = "Message sent when a monitor is triggered"
}

# Passed to each monitor's evaluation_delay argument.
variable "evaluation_delay" {
  default     = 900
  description = "Delay in seconds for the metric evaluation"
}

# Passed to each monitor's new_host_delay argument.
variable "new_host_delay" {
  default     = 300
  description = "Delay in seconds before monitor new resource"
}

# Optional prefix for monitor names; the empty string disables it.
variable "prefix_slug" {
  default     = ""
  description = "Prefix string to prepend between brackets on every monitors names"
}

# The three inputs below are forwarded to the shared filter-tags module.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}

variable "filter_tags_custom_excluded" {
  default     = ""
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
}
#
# Service CPU Utilization
#
# Inputs for the "ECS Service CPU Utilization High" monitor.
#

# String flag: the monitor is declared only when this equals "true".
variable "service_cpu_utilization_enabled" {
  type        = string
  default     = "false"
  description = "Flag to enable Service CPU Utilization monitor"
}

# Appended to the monitor's built-in tag list.
variable "service_cpu_utilization_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Service CPU Utilization monitor"
}

# When left empty, the module-wide message is used instead.
variable "service_cpu_utilization_message" {
  type        = string
  default     = ""
  description = "Custom message for the Service CPU Utilization monitor"
}

# Evaluation window placed inside the query's aggregator call.
variable "service_cpu_utilization_timeframe" {
  type        = string
  default     = "last_5m"
  description = "Timeframe for the Service CPU Utilization monitor"
}

# Aggregation function that prefixes the query.
variable "service_cpu_utilization_time_aggregator" {
  type        = string
  default     = "min"
  description = "Monitor aggregator for Service CPU Utilization [available values: min, max or avg]"
}

# Used both in the query comparison and as the critical threshold.
variable "service_cpu_utilization_threshold_critical" {
  type        = string
  default     = "90"
  description = "Critical threshold for the Service CPU Utilization monitor"
}

variable "service_cpu_utilization_threshold_warning" {
  type        = string
  default     = "80"
  description = "Warning threshold for the Service CPU Utilization monitor"
}
#
# Service Memory Utilization
#
# Inputs for the "ECS Service Memory Utilization High" monitor.
#

# String flag: the monitor is declared only when this equals "true".
variable "service_memory_utilization_enabled" {
  description = "Flag to enable Service Memory Utilization monitor"
  type        = string
  default     = "false"
}

# Appended to the monitor's built-in tag list.
variable "service_memory_utilization_extra_tags" {
  description = "Extra tags for Service Memory Utilization monitor"
  type        = list(string)
  default     = []
}

# When left empty, the module-wide message is used instead.
variable "service_memory_utilization_message" {
  description = "Custom message for the Service Memory Utilization monitor"
  type        = string
  default     = ""
}

# Evaluation window placed inside the query's aggregator call.
variable "service_memory_utilization_timeframe" {
  description = "Timeframe for the Service Memory Utilization monitor"
  type        = string
  default     = "last_5m"
}

# Aggregation function that prefixes the query.
variable "service_memory_utilization_time_aggregator" {
  description = "Monitor aggregator for Service Memory Utilization [available values: min, max or avg]"
  type        = string
  default     = "min"
}

# Thresholds are declared as strings, so the defaults are written as string
# literals (matching the CPU utilization thresholds) instead of relying on
# Terraform's implicit number-to-string conversion.
variable "service_memory_utilization_threshold_critical" {
  description = "Critical threshold for the Service Memory Utilization monitor"
  type        = string
  default     = "90"
}

variable "service_memory_utilization_threshold_warning" {
  description = "Warning threshold for the Service Memory Utilization monitor"
  type        = string
  default     = "85"
}
#
# Service Missing tasks
#
# Inputs for the "ECS Service not healthy enough" monitor.
#

# String flag: the monitor is declared only when this equals "true".
variable "service_missing_tasks_enabled" {
  description = "Flag to enable Service Missing Tasks monitor"
  type        = string
  default     = "true"
}

# Appended to the monitor's built-in tag list.
variable "service_missing_tasks_extra_tags" {
  description = "Extra tags for Service Missing Tasks monitor"
  type        = list(string)
  default     = []
}

# When left empty, the module-wide message is used instead.
variable "service_missing_tasks_message" {
  description = "Custom message for the Service Missing Tasks monitor"
  type        = string
  default     = ""
}

# Evaluation window placed inside the query's aggregator call.
variable "service_missing_tasks_timeframe" {
  description = "Timeframe for the Service Missing Tasks monitor"
  type        = string
  default     = "last_5m"
}

# Aggregation function that prefixes the query.
variable "service_missing_tasks_time_aggregator" {
  description = "Monitor aggregator for Service Missing Tasks [available values: min, max or avg]"
  type        = string
  default     = "min"
}

# The monitor alerts when the running/desired percentage drops BELOW these
# values, which is why the warning default is higher than the critical one.
# Defaults are string literals to agree with the declared type.
variable "service_missing_tasks_threshold_critical" {
  description = "Critical threshold for the Service Missing Tasks monitor"
  type        = string
  default     = "60"
}

variable "service_missing_tasks_threshold_warning" {
  description = "Warning threshold for the Service Missing Tasks monitor"
  type        = string
  default     = "80"
}

View File

@ -0,0 +1,10 @@
# Shared helper module that builds the tag filter fragments used by the
# monitor queries; it is scoped to the "aws_ecs" resource.
module "filter-tags" {
  source = "../../../../common/filter-tags"

  resource    = "aws_ecs"
  environment = var.environment

  # Caller-controlled filtering options, passed through unchanged.
  filter_tags_use_defaults    = var.filter_tags_use_defaults
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
}

View File

@ -0,0 +1,104 @@
# Monitors related to services

# Metric alert on aws.ecs.cpuutilization, grouped by region and service name.
resource "datadog_monitor" "service_cpu_utilization" {
  # Declared only when the flag is the literal string "true".
  count = var.service_cpu_utilization_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Service CPU Utilization High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.service_cpu_utilization_message, var.message)
  type    = "metric alert"

  # The tag scope comes from the shared filter-tags module; the comparison
  # value is the same variable as the critical threshold below.
  query = <<EOQ
${var.service_cpu_utilization_time_aggregator}(${var.service_cpu_utilization_timeframe}):
avg:aws.ecs.cpuutilization${module.filter-tags.query_alert} by {region,servicename}
> ${var.service_cpu_utilization_threshold_critical}
EOQ

  thresholds = {
    critical = var.service_cpu_utilization_threshold_critical
    warning  = var.service_cpu_utilization_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:service"], var.service_cpu_utilization_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
# Metric alert on aws.ecs.memory_utilization, grouped by region and service name.
resource "datadog_monitor" "service_memory_utilization" {
  # Declared only when the flag is the literal string "true".
  count = var.service_memory_utilization_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Service Memory Utilization High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.service_memory_utilization_message, var.message)
  type    = "metric alert"

  # The tag scope comes from the shared filter-tags module; the comparison
  # value is the same variable as the critical threshold below.
  query = <<EOQ
${var.service_memory_utilization_time_aggregator}(${var.service_memory_utilization_timeframe}):
avg:aws.ecs.memory_utilization${module.filter-tags.query_alert} by {region,servicename}
> ${var.service_memory_utilization_threshold_critical}
EOQ

  thresholds = {
    critical = var.service_memory_utilization_threshold_critical
    warning  = var.service_memory_utilization_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:service"], var.service_memory_utilization_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
# Metric alert on the percentage of running tasks versus desired tasks,
# grouped by region and service name. Triggers when the ratio drops below
# the thresholds.
resource "datadog_monitor" "service_missing_tasks" {
  # Declared only when the flag is the literal string "true".
  count = var.service_missing_tasks_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Service not healthy enough {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.service_missing_tasks_message, var.message)
  type    = "metric alert"

  # Both series are scoped with the shared filter-tags module output, exactly
  # like the CPU and memory monitors of this module, so that
  # filter_tags_use_defaults / filter_tags_custom / filter_tags_custom_excluded
  # also apply here.
  query = <<EOQ
${var.service_missing_tasks_time_aggregator}(${var.service_missing_tasks_timeframe}):
avg:aws.ecs.service.running${module.filter-tags.query_alert} by {region,servicename} / avg:aws.ecs.service.desired${module.filter-tags.query_alert} by {region,servicename}
* 100 < ${var.service_missing_tasks_threshold_critical}
EOQ

  # "Below" monitor: the warning value is expected to be higher than critical.
  thresholds = {
    critical = var.service_missing_tasks_threshold_critical
    warning  = var.service_missing_tasks_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:service"], var.service_missing_tasks_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}

View File

@ -0,0 +1,15 @@
# Each output is a list of ids because the monitors are declared with
# `count`; the list is empty when the corresponding monitor is disabled.

output "service_cpu_utilization_id" {
  value       = datadog_monitor.service_cpu_utilization[*].id
  description = "id for monitor service_cpu_utilization"
}

output "service_memory_utilization_id" {
  value       = datadog_monitor.service_memory_utilization[*].id
  description = "id for monitor service_memory_utilization"
}

output "service_missing_tasks_id" {
  value       = datadog_monitor.service_missing_tasks[*].id
  description = "id for monitor service_missing_tasks"
}

View File

@ -0,0 +1,64 @@
# CLOUD AWS ECS EC2-CLUSTER DataDog monitors
## How to use this module
```
module "datadog-monitors-cloud-aws-ecs-ec2-cluster" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/ecs/ec2-cluster?ref={revision}"
environment = var.environment
message = module.datadog-message-alerting.alerting-message
}
```
## Purpose
Creates DataDog monitors with the following checks:
- ECS Agent disconnected
- ECS Cluster CPU Utilization High (disabled by default)
- ECS Cluster Memory Reservation High (disabled by default)
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| agent\_status\_enabled | Flag to enable Agent Status monitor | string | `"true"` | no |
| agent\_status\_extra\_tags | Extra tags for Agent Status monitor | list(string) | `[]` | no |
| agent\_status\_message | Custom message for the Agent Status monitor | string | `""` | no |
| agent\_status\_no\_data\_timeframe | Agent status does not respond monitor no data timeframe | string | `"10"` | no |
| agent\_status\_threshold\_warning | Warning threshold for the Agent Status monitor | string | `"3"` | no |
| cluster\_cpu\_utilization\_enabled | Flag to enable Cluster CPU utilization monitor | string | `"false"` | no |
| cluster\_cpu\_utilization\_extra\_tags | Extra tags for Cluster CPU utilization monitor | list(string) | `[]` | no |
| cluster\_cpu\_utilization\_message | Custom message for the Cluster CPU Utilization monitor | string | `""` | no |
| cluster\_cpu\_utilization\_threshold\_critical | Critical threshold for the Cluster CPU Utilization monitor | string | `"90"` | no |
| cluster\_cpu\_utilization\_threshold\_warning | Warning threshold for the Cluster CPU Utilization monitor | string | `"85"` | no |
| cluster\_cpu\_utilization\_time\_aggregator | Monitor aggregator for Cluster CPU Utilization [available values: min, max or avg] | string | `"min"` | no |
| cluster\_cpu\_utilization\_timeframe | Timeframe for the Cluster CPU Utilization monitor | string | `"last_5m"` | no |
| cluster\_memory\_reservation\_enabled | Flag to enable Cluster memory reservation monitor | string | `"false"` | no |
| cluster\_memory\_reservation\_extra\_tags | Extra tags for Cluster Memory Reservation monitor | list(string) | `[]` | no |
| cluster\_memory\_reservation\_message | Custom message for the Cluster Memory Reservation monitor | string | `""` | no |
| cluster\_memory\_reservation\_threshold\_critical | Critical threshold for the Cluster Memory Reservation monitor | string | `"90"` | no |
| cluster\_memory\_reservation\_threshold\_warning | Warning threshold for the Cluster Memory Reservation monitor | string | `"85"` | no |
| cluster\_memory\_reservation\_time\_aggregator | Monitor aggregator for Cluster Memory Reservation [available values: min, max or avg] | string | `"min"` | no |
| cluster\_memory\_reservation\_timeframe | Timeframe for the Cluster Memory Reservation monitor | string | `"last_5m"` | no |
| environment | Architecture environment | string | n/a | yes |
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| message | Message sent when a monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
## Outputs
| Name | Description |
|------|-------------|
| cluster\_cpu\_utilization\_id | id for monitor cluster_cpu_utilization |
| cluster\_memory\_reservation\_id | id for monitor cluster_memory_reservation |
| ecs\_agent\_status\_id | id for monitor ecs_agent_status |
## Related documentation

View File

@ -0,0 +1,165 @@
#
# Datadog global variables
#
# Inputs shared by every monitor declared in this module.
#

# Environment name (required: no default is declared).
variable "environment" {
  type        = string
  description = "Architecture environment"
}

# Default notification message (required: no default is declared).
variable "message" {
  description = "Message sent when a monitor is triggered"
}

# Passed to each monitor's evaluation_delay argument.
variable "evaluation_delay" {
  default     = 900
  description = "Delay in seconds for the metric evaluation"
}

# Passed to each monitor's new_host_delay argument.
variable "new_host_delay" {
  default     = 300
  description = "Delay in seconds before monitor new resource"
}

# Optional prefix for monitor names; the empty string disables it.
variable "prefix_slug" {
  default     = ""
  description = "Prefix string to prepend between brackets on every monitors names"
}

# The three inputs below are forwarded to the shared filter-tags module.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}

variable "filter_tags_custom_excluded" {
  default     = ""
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
}
#
# Agent Status
#
# Inputs for the "ECS Agent disconnected" service check monitor.
#

# String flag: the monitor is declared only when this equals "true".
variable "agent_status_enabled" {
  description = "Flag to enable Agent Status monitor"
  type        = string
  default     = "true"
}

# Appended to the monitor's built-in tag list.
variable "agent_status_extra_tags" {
  description = "Extra tags for Agent Status monitor"
  type        = list(string)
  default     = []
}

# When left empty, the module-wide message is used instead.
variable "agent_status_message" {
  description = "Custom message for the Agent Status monitor"
  type        = string
  default     = ""
}

# Both inputs below are declared as strings, so their defaults are written as
# string literals instead of relying on implicit number-to-string conversion.
variable "agent_status_threshold_warning" {
  description = "Warning threshold for the Agent Status monitor"
  type        = string
  default     = "3"
}

# Passed to the monitor's no_data_timeframe argument.
variable "agent_status_no_data_timeframe" {
  description = "Agent status does not respond monitor no data timeframe"
  type        = string
  default     = "10"
}
#
# Cluster CPU Utilization
#
# Inputs for the "ECS Cluster CPU Utilization High" monitor.
#

# String flag: the monitor is declared only when this equals "true".
variable "cluster_cpu_utilization_enabled" {
  description = "Flag to enable Cluster CPU utilization monitor"
  type        = string
  default     = "false"
}

# Appended to the monitor's built-in tag list.
variable "cluster_cpu_utilization_extra_tags" {
  description = "Extra tags for Cluster CPU utilization monitor"
  type        = list(string)
  default     = []
}

# When left empty, the module-wide message is used instead.
variable "cluster_cpu_utilization_message" {
  description = "Custom message for the Cluster CPU Utilization monitor"
  type        = string
  default     = ""
}

# Aggregation function that prefixes the query.
variable "cluster_cpu_utilization_time_aggregator" {
  description = "Monitor aggregator for Cluster CPU Utilization [available values: min, max or avg]"
  type        = string
  default     = "min"
}

# Evaluation window placed inside the query's aggregator call.
variable "cluster_cpu_utilization_timeframe" {
  description = "Timeframe for the Cluster CPU Utilization monitor"
  type        = string
  default     = "last_5m"
}

# Thresholds are declared as strings, so the defaults are written as string
# literals instead of relying on implicit number-to-string conversion.
variable "cluster_cpu_utilization_threshold_critical" {
  description = "Critical threshold for the Cluster CPU Utilization monitor"
  type        = string
  default     = "90"
}

variable "cluster_cpu_utilization_threshold_warning" {
  description = "Warning threshold for the Cluster CPU Utilization monitor"
  type        = string
  default     = "85"
}
#
# Cluster Memory Reservation
#
# Inputs for the "ECS Cluster Memory Reservation High" monitor.
#

# String flag: the monitor is declared only when this equals "true".
variable "cluster_memory_reservation_enabled" {
  description = "Flag to enable Cluster memory reservation monitor"
  type        = string
  default     = "false"
}

# Appended to the monitor's built-in tag list.
variable "cluster_memory_reservation_extra_tags" {
  description = "Extra tags for Cluster Memory Reservation monitor"
  type        = list(string)
  default     = []
}

# When left empty, the module-wide message is used instead.
variable "cluster_memory_reservation_message" {
  description = "Custom message for the Cluster Memory Reservation monitor"
  type        = string
  default     = ""
}

# Aggregation function that prefixes the query.
variable "cluster_memory_reservation_time_aggregator" {
  description = "Monitor aggregator for Cluster Memory Reservation [available values: min, max or avg]"
  type        = string
  default     = "min"
}

# Evaluation window placed inside the query's aggregator call.
variable "cluster_memory_reservation_timeframe" {
  description = "Timeframe for the Cluster Memory Reservation monitor"
  type        = string
  default     = "last_5m"
}

# Thresholds are declared as strings, so the defaults are written as string
# literals instead of relying on implicit number-to-string conversion.
variable "cluster_memory_reservation_threshold_critical" {
  description = "Critical threshold for the Cluster Memory Reservation monitor"
  type        = string
  default     = "90"
}

variable "cluster_memory_reservation_threshold_warning" {
  description = "Warning threshold for the Cluster Memory Reservation monitor"
  type        = string
  default     = "85"
}

View File

@ -0,0 +1,10 @@
# Shared helper module that builds the tag filter fragments used by the
# monitor queries; it is scoped to the "aws_ecs" resource.
module "filter-tags" {
  source = "../../../../common/filter-tags"

  resource    = "aws_ecs"
  environment = var.environment

  # Caller-controlled filtering options, passed through unchanged.
  filter_tags_use_defaults    = var.filter_tags_use_defaults
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
}

View File

@ -0,0 +1,105 @@
# Monitors related to ECS Cluster

# Service check monitor on "aws.ecs.agent_connected", grouped by cluster and
# instance id. Unlike the metric monitors, it also notifies on missing data.
resource "datadog_monitor" "ecs_agent_status" {
  # Declared only when the flag is the literal string "true".
  count = var.agent_status_enabled == "true" ? 1 : 0
  # NOTE(review): the title reuses the percent-style threshold placeholders of
  # the metric monitors although this is a service check - confirm intended.
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Agent disconnected {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.agent_status_message, var.message)
  type    = "service check"

  # The scope comes from the shared filter-tags module; statuses are counted
  # over the last 6 check results.
  query = <<EOQ
"aws.ecs.agent_connected"${module.filter-tags.service_check}.by("cluster","instance_id").last(6).count_by_status()
EOQ

  # NOTE(review): the critical count is hard-coded while the warning one is
  # configurable - presumably both must stay within last(6); confirm.
  thresholds = {
    warning  = var.agent_status_threshold_warning
    critical = 5
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = true
  no_data_timeframe   = var.agent_status_no_data_timeframe
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:agent"], var.agent_status_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
# Metric alert on aws.ecs.cluster.cpuutilization, grouped by region and
# cluster name.
resource "datadog_monitor" "cluster_cpu_utilization" {
  # Declared only when the flag is the literal string "true".
  count = var.cluster_cpu_utilization_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Cluster CPU Utilization High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.cluster_cpu_utilization_message, var.message)
  type    = "metric alert"

  # The tag scope comes from the shared filter-tags module; the comparison
  # value is the same variable as the critical threshold below.
  query = <<EOQ
${var.cluster_cpu_utilization_time_aggregator}(${var.cluster_cpu_utilization_timeframe}):
avg:aws.ecs.cluster.cpuutilization${module.filter-tags.query_alert} by {region,clustername}
> ${var.cluster_cpu_utilization_threshold_critical}
EOQ

  thresholds = {
    critical = var.cluster_cpu_utilization_threshold_critical
    warning  = var.cluster_cpu_utilization_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:cluster"], var.cluster_cpu_utilization_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
# Metric alert on aws.ecs.cluster.memory_reservation, grouped by region and
# cluster name.
resource "datadog_monitor" "cluster_memory_reservation" {
  # Declared only when the flag is the literal string "true".
  count = var.cluster_memory_reservation_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Cluster Memory Reservation High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.cluster_memory_reservation_message, var.message)
  type    = "metric alert"

  # The tag scope comes from the shared filter-tags module; the comparison
  # value is the same variable as the critical threshold below.
  query = <<EOQ
${var.cluster_memory_reservation_time_aggregator}(${var.cluster_memory_reservation_timeframe}):
avg:aws.ecs.cluster.memory_reservation${module.filter-tags.query_alert} by {region,clustername}
> ${var.cluster_memory_reservation_threshold_critical}
EOQ

  thresholds = {
    critical = var.cluster_memory_reservation_threshold_critical
    warning  = var.cluster_memory_reservation_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:cluster"], var.cluster_memory_reservation_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}

View File

@ -0,0 +1,15 @@
# Each output is a list of ids because the monitors are declared with
# `count`; the list is empty when the corresponding monitor is disabled.

output "cluster_cpu_utilization_id" {
  value       = datadog_monitor.cluster_cpu_utilization[*].id
  description = "id for monitor cluster_cpu_utilization"
}

output "cluster_memory_reservation_id" {
  value       = datadog_monitor.cluster_memory_reservation[*].id
  description = "id for monitor cluster_memory_reservation"
}

output "ecs_agent_status_id" {
  value       = datadog_monitor.ecs_agent_status[*].id
  description = "id for monitor ecs_agent_status"
}