Add monitors related to ECS (common to fargate / EC2 cluster, and dedicated to EC2 cluster)
This commit is contained in:
parent
793d34e627
commit
f65adb8ff5
@ -140,6 +140,9 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
|
||||
- [aws](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/)
|
||||
- [alb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/alb/)
|
||||
- [apigateway](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/apigateway/)
|
||||
- [ecs](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/)
|
||||
- [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/common/)
|
||||
- [ec2-cluster](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/ec2-cluster/)
|
||||
- [elasticache](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticache/)
|
||||
- [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticache/common/)
|
||||
- [memcached](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticache/memcached/)
|
||||
|
||||
67
cloud/aws/ecs/common/README.md
Normal file
67
cloud/aws/ecs/common/README.md
Normal file
@ -0,0 +1,67 @@
|
||||
# CLOUD AWS ECS COMMON DataDog monitors
|
||||
|
||||
## How to use this module
|
||||
|
||||
```
|
||||
module "datadog-monitors-cloud-aws-ecs-common" {
|
||||
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/ecs/common?ref={revision}"
|
||||
|
||||
environment = var.environment
|
||||
message = module.datadog-message-alerting.alerting-message
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Purpose
|
||||
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
- ECS Service CPU Utilization High (disabled by default)
|
||||
- ECS Service Memory Utilization High (disabled by default)
|
||||
- ECS Service not healthy enough
|
||||
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| environment | Architecture environment | string | n/a | yes |
|
||||
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
|
||||
| filter\_tags | Tags used for filtering | string | `"*"` | no |
|
||||
| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
|
||||
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
|
||||
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
|
||||
| message | Message sent when a monitor is triggered | string | n/a | yes |
|
||||
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
|
||||
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
|
||||
| service\_cpu\_utilization\_enabled | Flag to enable Service CPU Utilization monitor | string | `"false"` | no |
|
||||
| service\_cpu\_utilization\_extra\_tags | Extra tags for Service CPU Utilization monitor | list(string) | `[]` | no |
|
||||
| service\_cpu\_utilization\_message | Custom message for the Service CPU Utilization monitor | string | `""` | no |
|
||||
| service\_cpu\_utilization\_threshold\_critical | Critical threshold for the Service CPU Utilization monitor | string | `"90"` | no |
|
||||
| service\_cpu\_utilization\_threshold\_warning | Warning threshold for the Service CPU Utilization monitor | string | `"80"` | no |
|
||||
| service\_cpu\_utilization\_time\_aggregator | Monitor aggregator for Service CPU Utilization [available values: min, max or avg] | string | `"min"` | no |
|
||||
| service\_cpu\_utilization\_timeframe | Timeframe for the Service CPU Utilization monitor | string | `"last_5m"` | no |
|
||||
| service\_memory\_utilization\_enabled | Flag to enable Service Memory Utilization monitor | string | `"false"` | no |
|
||||
| service\_memory\_utilization\_extra\_tags | Extra tags for Service Memory Utilization monitor | list(string) | `[]` | no |
|
||||
| service\_memory\_utilization\_message | Custom message for the Service Memory Utilization monitor | string | `""` | no |
|
||||
| service\_memory\_utilization\_threshold\_critical | Critical threshold for the Service Memory Utilization monitor | string | `"90"` | no |
|
||||
| service\_memory\_utilization\_threshold\_warning | Warning threshold for the Service Memory Utilization monitor | string | `"85"` | no |
|
||||
| service\_memory\_utilization\_time\_aggregator | Monitor aggregator for Service Memory Utilization [available values: min, max or avg] | string | `"min"` | no |
|
||||
| service\_memory\_utilization\_timeframe | Timeframe for the Service Memory Utilization monitor | string | `"last_5m"` | no |
|
||||
| service\_missing\_tasks\_enabled | Flag to enable Service Missing Tasks monitor | string | `"true"` | no |
|
||||
| service\_missing\_tasks\_extra\_tags | Extra tags for Service Missing Tasks monitor | list(string) | `[]` | no |
|
||||
| service\_missing\_tasks\_message | Custom message for the Service Missing Tasks monitor | string | `""` | no |
|
||||
| service\_missing\_tasks\_threshold\_critical | Critical threshold for the Service Missing Tasks monitor | string | `"60"` | no |
|
||||
| service\_missing\_tasks\_threshold\_warning | Warning threshold for the Service Missing Tasks monitor | string | `"80"` | no |
|
||||
| service\_missing\_tasks\_time\_aggregator | Monitor aggregator for Service Missing Tasks [available values: min, max or avg] | string | `"min"` | no |
|
||||
| service\_missing\_tasks\_timeframe | Timeframe for the Service Missing Tasks monitor | string | `"last_5m"` | no |
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
|------|-------------|
|
||||
| service\_cpu\_utilization\_id | id for monitor service_cpu_utilization |
|
||||
| service\_memory\_utilization\_id | id for monitor service_memory_utilization |
|
||||
| service\_missing\_tasks\_id | id for monitor service_missing_tasks |
|
||||
|
||||
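## Related documentation

- [Datadog Amazon ECS integration](https://docs.datadoghq.com/integrations/amazon_ecs/)
- [Amazon ECS CloudWatch metrics](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/cloudwatch-metrics.html)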
|
||||
|
||||
181
cloud/aws/ecs/common/inputs.tf
Normal file
181
cloud/aws/ecs/common/inputs.tf
Normal file
@ -0,0 +1,181 @@
|
||||
#
# Datadog global variables
#

# Required. Interpolated into every monitor name and into the "env:" tag,
# and forwarded to the filter-tags module.
variable "environment" {
  description = "Architecture environment"
  type        = string
}

# Raw tag filter; "*" matches everything.
variable "filter_tags" {
  description = "Tags used for filtering"
  default     = "*"
}

# Required. Default notification body; each monitor falls back to it when its
# own *_message variable is empty.
variable "message" {
  description = "Message sent when a monitor is triggered"
}

variable "evaluation_delay" {
  description = "Delay in seconds for the metric evaluation"
  default     = 900
}

variable "new_host_delay" {
  description = "Delay in seconds before monitor new resource"
  default     = 300
}

# When empty, no "[prefix]" is prepended to monitor names.
variable "prefix_slug" {
  description = "Prefix string to prepend between brackets on every monitors names"
  default     = ""
}

# The three variables below are passed through unchanged to the filter-tags module.
variable "filter_tags_use_defaults" {
  description = "Use default filter tags convention"
  default     = "true"
}

variable "filter_tags_custom" {
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
  default     = "*"
}

variable "filter_tags_custom_excluded" {
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
  default     = ""
}
|
||||
|
||||
#
# Service CPU Utilization
#

# The monitor is created only when this is exactly the string "true".
variable "service_cpu_utilization_enabled" {
  description = "Flag to enable Service CPU Utilization monitor"
  type        = string
  default     = "false"
}

# Appended to the module's fixed tag list for this monitor.
variable "service_cpu_utilization_extra_tags" {
  description = "Extra tags for Service CPU Utilization monitor"
  type        = list(string)
  default     = []
}

# Empty string means "use var.message".
variable "service_cpu_utilization_message" {
  description = "Custom message for the Service CPU Utilization monitor"
  type        = string
  default     = ""
}

variable "service_cpu_utilization_timeframe" {
  description = "Timeframe for the Service CPU Utilization monitor"
  type        = string
  default     = "last_5m"
}

variable "service_cpu_utilization_time_aggregator" {
  description = "Monitor aggregator for Service CPU Utilization [available values: min, max or avg]"
  type        = string
  default     = "min"
}

# Thresholds are percentages and are compared with ">" in the monitor query.
variable "service_cpu_utilization_threshold_critical" {
  description = "Critical threshold for the Service CPU Utilization monitor"
  type        = string
  default     = "90"
}

variable "service_cpu_utilization_threshold_warning" {
  description = "Warning threshold for the Service CPU Utilization monitor"
  type        = string
  default     = "80"
}
|
||||
|
||||
#
# Service Memory Utilization
#

# The monitor is created only when this is exactly the string "true".
variable "service_memory_utilization_enabled" {
  description = "Flag to enable Service Memory Utilization monitor"
  type        = string
  default     = "false"
}

# Appended to the module's fixed tag list for this monitor.
variable "service_memory_utilization_extra_tags" {
  description = "Extra tags for Service Memory Utilization monitor"
  type        = list(string)
  default     = []
}

# Empty string means "use var.message".
variable "service_memory_utilization_message" {
  description = "Custom message for the Service Memory Utilization monitor"
  type        = string
  default     = ""
}

variable "service_memory_utilization_timeframe" {
  description = "Timeframe for the Service Memory Utilization monitor"
  type        = string
  default     = "last_5m"
}

variable "service_memory_utilization_time_aggregator" {
  description = "Monitor aggregator for Service Memory Utilization [available values: min, max or avg]"
  type        = string
  default     = "min"
}

# Thresholds are percentages and are compared with ">" in the monitor query.
# Defaults are written as strings so they match the declared type and the
# sibling Service CPU Utilization variables (Terraform would otherwise convert
# the bare number implicitly).
variable "service_memory_utilization_threshold_critical" {
  description = "Critical threshold for the Service Memory Utilization monitor"
  type        = string
  default     = "90"
}

variable "service_memory_utilization_threshold_warning" {
  description = "Warning threshold for the Service Memory Utilization monitor"
  type        = string
  default     = "85"
}
|
||||
|
||||
#
# Service Missing tasks
#

# The monitor is created only when this is exactly the string "true".
variable "service_missing_tasks_enabled" {
  description = "Flag to enable Service Missing Tasks monitor"
  type        = string
  default     = "true"
}

# Appended to the module's fixed tag list for this monitor.
variable "service_missing_tasks_extra_tags" {
  description = "Extra tags for Service Missing Tasks monitor"
  type        = list(string)
  default     = []
}

# Empty string means "use var.message".
variable "service_missing_tasks_message" {
  description = "Custom message for the Service Missing Tasks monitor"
  type        = string
  default     = ""
}

variable "service_missing_tasks_timeframe" {
  description = "Timeframe for the Service Missing Tasks monitor"
  type        = string
  default     = "last_5m"
}

# NOTE(review): the monitor compares with "<", so "min" fires as soon as a
# single point in the window is below the threshold - confirm this is intended.
variable "service_missing_tasks_time_aggregator" {
  description = "Monitor aggregator for Service Missing Tasks [available values: min, max or avg]"
  type        = string
  default     = "min"
}

# Thresholds are the percentage of running vs. desired tasks and are compared
# with "<", hence critical is lower than warning.
# Defaults are written as strings so they match the declared type (Terraform
# would otherwise convert the bare number implicitly).
variable "service_missing_tasks_threshold_critical" {
  description = "Critical threshold for the Service Missing Tasks monitor"
  type        = string
  default     = "60"
}

variable "service_missing_tasks_threshold_warning" {
  description = "Warning threshold for the Service Missing Tasks monitor"
  type        = string
  default     = "80"
}
|
||||
10
cloud/aws/ecs/common/modules.tf
Normal file
10
cloud/aws/ecs/common/modules.tf
Normal file
@ -0,0 +1,10 @@
|
||||
# Shared helper module; its query_alert output is used as the tag filter in
# this module's monitor queries.
module "filter-tags" {
  source = "../../../../common/filter-tags"

  environment = var.environment
  resource    = "aws_ecs"

  # Caller-controlled filtering options, passed through unchanged.
  filter_tags_use_defaults    = var.filter_tags_use_defaults
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
}
|
||||
|
||||
104
cloud/aws/ecs/common/monitors-ecs-common.tf
Normal file
104
cloud/aws/ecs/common/monitors-ecs-common.tf
Normal file
@ -0,0 +1,104 @@
|
||||
# Monitors related to services
|
||||
# Metric alert on the CPU utilization of ECS services, grouped by region and
# service name. Disabled unless service_cpu_utilization_enabled is "true".
resource "datadog_monitor" "service_cpu_utilization" {
  count = var.service_cpu_utilization_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Service CPU Utilization High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # coalesce() skips empty strings, so the module-wide message is used when no
  # monitor-specific message is given.
  message = coalesce(var.service_cpu_utilization_message, var.message)
  type    = "metric alert"

  query = <<EOQ
    ${var.service_cpu_utilization_time_aggregator}(${var.service_cpu_utilization_timeframe}):
      avg:aws.ecs.cpuutilization${module.filter-tags.query_alert} by {region,servicename}
    > ${var.service_cpu_utilization_threshold_critical}
EOQ

  # The critical value must stay identical to the one embedded in the query.
  thresholds = {
    critical = var.service_cpu_utilization_threshold_critical
    warning  = var.service_cpu_utilization_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:service"], var.service_cpu_utilization_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form ("silenced") is deprecated
    # in Terraform 0.12+.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
# Metric alert on the memory utilization of ECS services, grouped by region
# and service name. Disabled unless service_memory_utilization_enabled is "true".
resource "datadog_monitor" "service_memory_utilization" {
  count = var.service_memory_utilization_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Service Memory Utilization High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # coalesce() skips empty strings, so the module-wide message is used when no
  # monitor-specific message is given.
  message = coalesce(var.service_memory_utilization_message, var.message)
  type    = "metric alert"

  query = <<EOQ
    ${var.service_memory_utilization_time_aggregator}(${var.service_memory_utilization_timeframe}):
      avg:aws.ecs.memory_utilization${module.filter-tags.query_alert} by {region,servicename}
    > ${var.service_memory_utilization_threshold_critical}
EOQ

  # The critical value must stay identical to the one embedded in the query.
  thresholds = {
    critical = var.service_memory_utilization_threshold_critical
    warning  = var.service_memory_utilization_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:service"], var.service_memory_utilization_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form ("silenced") is deprecated
    # in Terraform 0.12+.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
# Metric alert on the share of running vs. desired tasks of ECS services,
# grouped by region and service name. Fires when the percentage drops below
# the thresholds. Enabled unless service_missing_tasks_enabled is not "true".
resource "datadog_monitor" "service_missing_tasks" {
  count = var.service_missing_tasks_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Service not healthy enough {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # coalesce() skips empty strings, so the module-wide message is used when no
  # monitor-specific message is given.
  message = coalesce(var.service_missing_tasks_message, var.message)
  type    = "metric alert"

  # Both series are scoped with the filter-tags module output, like the other
  # monitors of this module, so filter_tags_use_defaults / filter_tags_custom*
  # are honoured here too (the raw var.filter_tags was used before and
  # bypassed them).
  query = <<EOQ
    ${var.service_missing_tasks_time_aggregator}(${var.service_missing_tasks_timeframe}):
      avg:aws.ecs.service.running${module.filter-tags.query_alert} by {region,servicename} / avg:aws.ecs.service.desired${module.filter-tags.query_alert} by {region,servicename}
    * 100 < ${var.service_missing_tasks_threshold_critical}
EOQ

  # The critical value must stay identical to the one embedded in the query.
  thresholds = {
    critical = var.service_missing_tasks_threshold_critical
    warning  = var.service_missing_tasks_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:service"], var.service_missing_tasks_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form ("silenced") is deprecated
    # in Terraform 0.12+.
    ignore_changes = [silenced]
  }
}
|
||||
15
cloud/aws/ecs/common/outputs.tf
Normal file
15
cloud/aws/ecs/common/outputs.tf
Normal file
@ -0,0 +1,15 @@
|
||||
# Each output is a list: the monitors are declared with "count", so the list
# is empty when the corresponding monitor is disabled.

output "service_cpu_utilization_id" {
  description = "id for monitor service_cpu_utilization"
  value       = datadog_monitor.service_cpu_utilization[*].id
}

output "service_memory_utilization_id" {
  description = "id for monitor service_memory_utilization"
  value       = datadog_monitor.service_memory_utilization[*].id
}

output "service_missing_tasks_id" {
  description = "id for monitor service_missing_tasks"
  value       = datadog_monitor.service_missing_tasks[*].id
}
|
||||
|
||||
64
cloud/aws/ecs/ec2-cluster/README.md
Normal file
64
cloud/aws/ecs/ec2-cluster/README.md
Normal file
@ -0,0 +1,64 @@
|
||||
# CLOUD AWS ECS EC2-CLUSTER DataDog monitors
|
||||
|
||||
## How to use this module
|
||||
|
||||
```
|
||||
module "datadog-monitors-cloud-aws-ecs-ec2-cluster" {
|
||||
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/ecs/ec2-cluster?ref={revision}"
|
||||
|
||||
environment = var.environment
|
||||
message = module.datadog-message-alerting.alerting-message
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Purpose
|
||||
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
- ECS Agent disconnected
|
||||
- ECS Cluster CPU Utilization High (disabled by default)
|
||||
- ECS Cluster Memory Reservation High (disabled by default)
|
||||
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| agent\_status\_enabled | Flag to enable Agent Status monitor | string | `"true"` | no |
|
||||
| agent\_status\_extra\_tags | Extra tags for Agent Status monitor | list(string) | `[]` | no |
|
||||
| agent\_status\_message | Custom message for the Agent Status monitor | string | `""` | no |
|
||||
| agent\_status\_no\_data\_timeframe | Agent status does not respond monitor no data timeframe | string | `"10"` | no |
|
||||
| agent\_status\_threshold\_warning | Warning threshold for the Agent Status monitor | string | `"3"` | no |
|
||||
| cluster\_cpu\_utilization\_enabled | Flag to enable Cluster CPU utilization monitor | string | `"false"` | no |
|
||||
| cluster\_cpu\_utilization\_extra\_tags | Extra tags for Cluster CPU utilization monitor | list(string) | `[]` | no |
|
||||
| cluster\_cpu\_utilization\_message | Custom message for the Cluster CPU Utilization monitor | string | `""` | no |
|
||||
| cluster\_cpu\_utilization\_threshold\_critical | Critical threshold for the Cluster CPU Utilization monitor | string | `"90"` | no |
|
||||
| cluster\_cpu\_utilization\_threshold\_warning | Warning threshold for the Cluster CPU Utilization monitor | string | `"85"` | no |
|
||||
| cluster\_cpu\_utilization\_time\_aggregator | Monitor aggregator for Cluster CPU Utilization [available values: min, max or avg] | string | `"min"` | no |
|
||||
| cluster\_cpu\_utilization\_timeframe | Timeframe for the Cluster CPU Utilization monitor | string | `"last_5m"` | no |
|
||||
| cluster\_memory\_reservation\_enabled | Flag to enable Cluster memory reservation monitor | string | `"false"` | no |
|
||||
| cluster\_memory\_reservation\_extra\_tags | Extra tags for Cluster Memory Reservation monitor | list(string) | `[]` | no |
|
||||
| cluster\_memory\_reservation\_message | Custom message for the Cluster Memory Reservation monitor | string | `""` | no |
|
||||
| cluster\_memory\_reservation\_threshold\_critical | Critical threshold for the Cluster Memory Reservation monitor | string | `"90"` | no |
|
||||
| cluster\_memory\_reservation\_threshold\_warning | Warning threshold for the Cluster Memory Reservation monitor | string | `"85"` | no |
|
||||
| cluster\_memory\_reservation\_time\_aggregator | Monitor aggregator for Cluster Memory Reservation [available values: min, max or avg] | string | `"min"` | no |
|
||||
| cluster\_memory\_reservation\_timeframe | Timeframe for the Cluster Memory Reservation monitor | string | `"last_5m"` | no |
|
||||
| environment | Architecture environment | string | n/a | yes |
|
||||
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
|
||||
| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
|
||||
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
|
||||
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
|
||||
| message | Message sent when a monitor is triggered | string | n/a | yes |
|
||||
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
|
||||
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
|------|-------------|
|
||||
| cluster\_cpu\_utilization\_id | id for monitor cluster_cpu_utilization |
|
||||
| cluster\_memory\_reservation\_id | id for monitor cluster_memory_reservation |
|
||||
| ecs\_agent\_status\_id | id for monitor ecs_agent_status |
|
||||
|
||||
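## Related documentation

- [Datadog Amazon ECS integration](https://docs.datadoghq.com/integrations/amazon_ecs/)
- [Amazon ECS CloudWatch metrics](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/cloudwatch-metrics.html)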
|
||||
|
||||
165
cloud/aws/ecs/ec2-cluster/inputs.tf
Normal file
165
cloud/aws/ecs/ec2-cluster/inputs.tf
Normal file
@ -0,0 +1,165 @@
|
||||
#
# Datadog global variables
#

# Required. Interpolated into every monitor name and into the "env:" tag,
# and forwarded to the filter-tags module.
variable "environment" {
  description = "Architecture environment"
  type        = string
}

# Required. Default notification body; each monitor falls back to it when its
# own *_message variable is empty.
variable "message" {
  description = "Message sent when a monitor is triggered"
}

variable "evaluation_delay" {
  description = "Delay in seconds for the metric evaluation"
  default     = 900
}

variable "new_host_delay" {
  description = "Delay in seconds before monitor new resource"
  default     = 300
}

# When empty, no "[prefix]" is prepended to monitor names.
variable "prefix_slug" {
  description = "Prefix string to prepend between brackets on every monitors names"
  default     = ""
}

# The three variables below are passed through unchanged to the filter-tags module.
variable "filter_tags_use_defaults" {
  description = "Use default filter tags convention"
  default     = "true"
}

variable "filter_tags_custom" {
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
  default     = "*"
}

variable "filter_tags_custom_excluded" {
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
  default     = ""
}
|
||||
|
||||
#
# Agent Status
#

# The monitor is created only when this is exactly the string "true".
variable "agent_status_enabled" {
  description = "Flag to enable Agent Status monitor"
  type        = string
  default     = "true"
}

# Appended to the module's fixed tag list for this monitor.
variable "agent_status_extra_tags" {
  description = "Extra tags for Agent Status monitor"
  type        = list(string)
  default     = []
}

# Empty string means "use var.message".
variable "agent_status_message" {
  description = "Custom message for the Agent Status monitor"
  type        = string
  default     = ""
}

# Warning threshold of the service check; the monitor hard-codes critical = 5
# over the last 6 check results.
# Defaults below are written as strings so they match the declared type
# (Terraform would otherwise convert the bare number implicitly).
variable "agent_status_threshold_warning" {
  description = "Warning threshold for the Agent Status monitor"
  type        = string
  default     = "3"
}

variable "agent_status_no_data_timeframe" {
  description = "Agent status does not respond monitor no data timeframe"
  type        = string
  default     = "10"
}
|
||||
|
||||
#
# Cluster CPU Utilization
#

# The monitor is created only when this is exactly the string "true".
variable "cluster_cpu_utilization_enabled" {
  description = "Flag to enable Cluster CPU utilization monitor"
  type        = string
  default     = "false"
}

# Appended to the module's fixed tag list for this monitor.
variable "cluster_cpu_utilization_extra_tags" {
  description = "Extra tags for Cluster CPU utilization monitor"
  type        = list(string)
  default     = []
}

# Empty string means "use var.message".
variable "cluster_cpu_utilization_message" {
  description = "Custom message for the Cluster CPU Utilization monitor"
  type        = string
  default     = ""
}

variable "cluster_cpu_utilization_time_aggregator" {
  description = "Monitor aggregator for Cluster CPU Utilization [available values: min, max or avg]"
  type        = string
  default     = "min"
}

variable "cluster_cpu_utilization_timeframe" {
  description = "Timeframe for the Cluster CPU Utilization monitor"
  type        = string
  default     = "last_5m"
}

# Thresholds are percentages and are compared with ">" in the monitor query.
# Defaults are written as strings so they match the declared type (Terraform
# would otherwise convert the bare number implicitly).
variable "cluster_cpu_utilization_threshold_critical" {
  description = "Critical threshold for the Cluster CPU Utilization monitor"
  type        = string
  default     = "90"
}

variable "cluster_cpu_utilization_threshold_warning" {
  description = "Warning threshold for the Cluster CPU Utilization monitor"
  type        = string
  default     = "85"
}
|
||||
|
||||
|
||||
#
# Cluster Memory Reservation
#

# The monitor is created only when this is exactly the string "true".
variable "cluster_memory_reservation_enabled" {
  description = "Flag to enable Cluster memory reservation monitor"
  type        = string
  default     = "false"
}

# Appended to the module's fixed tag list for this monitor.
variable "cluster_memory_reservation_extra_tags" {
  description = "Extra tags for Cluster Memory Reservation monitor"
  type        = list(string)
  default     = []
}

# Empty string means "use var.message".
variable "cluster_memory_reservation_message" {
  description = "Custom message for the Cluster Memory Reservation monitor"
  type        = string
  default     = ""
}

variable "cluster_memory_reservation_time_aggregator" {
  description = "Monitor aggregator for Cluster Memory Reservation [available values: min, max or avg]"
  type        = string
  default     = "min"
}

variable "cluster_memory_reservation_timeframe" {
  description = "Timeframe for the Cluster Memory Reservation monitor"
  type        = string
  default     = "last_5m"
}

# Thresholds are percentages and are compared with ">" in the monitor query.
# Defaults are written as strings so they match the declared type (Terraform
# would otherwise convert the bare number implicitly).
variable "cluster_memory_reservation_threshold_critical" {
  description = "Critical threshold for the Cluster Memory Reservation monitor"
  type        = string
  default     = "90"
}

variable "cluster_memory_reservation_threshold_warning" {
  description = "Warning threshold for the Cluster Memory Reservation monitor"
  type        = string
  default     = "85"
}
|
||||
10
cloud/aws/ecs/ec2-cluster/modules.tf
Normal file
10
cloud/aws/ecs/ec2-cluster/modules.tf
Normal file
@ -0,0 +1,10 @@
|
||||
# Shared helper module; its query_alert and service_check outputs are used as
# the tag filter in this module's monitor queries.
module "filter-tags" {
  source = "../../../../common/filter-tags"

  environment = var.environment
  resource    = "aws_ecs"

  # Caller-controlled filtering options, passed through unchanged.
  filter_tags_use_defaults    = var.filter_tags_use_defaults
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
}
|
||||
|
||||
105
cloud/aws/ecs/ec2-cluster/monitors-ecs-ec2-cluster.tf
Normal file
105
cloud/aws/ecs/ec2-cluster/monitors-ecs-ec2-cluster.tf
Normal file
@ -0,0 +1,105 @@
|
||||
# Monitors related to ECS Cluster
|
||||
# Service check on the ECS agent connection status, grouped by cluster and
# instance id. Enabled unless agent_status_enabled is not "true".
resource "datadog_monitor" "ecs_agent_status" {
  count = var.agent_status_enabled == "true" ? 1 : 0

  # Plain name: service-check thresholds are counts of check results, so the
  # "{{threshold}}% ({{value}}%)" templating used by the metric monitors does
  # not apply here.
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Agent disconnected"

  # coalesce() skips empty strings, so the module-wide message is used when no
  # monitor-specific message is given.
  message = coalesce(var.agent_status_message, var.message)
  type    = "service check"

  query = <<EOQ
    "aws.ecs.agent_connected"${module.filter-tags.service_check}.by("cluster","instance_id").last(6).count_by_status()
EOQ

  # Evaluated over the last 6 check results (see query).
  thresholds = {
    warning  = var.agent_status_threshold_warning
    critical = 5
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = true
  no_data_timeframe   = var.agent_status_no_data_timeframe
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:agent"], var.agent_status_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form ("silenced") is deprecated
    # in Terraform 0.12+.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
# Metric alert on the CPU utilization of ECS clusters, grouped by region and
# cluster name. Disabled unless cluster_cpu_utilization_enabled is "true".
resource "datadog_monitor" "cluster_cpu_utilization" {
  count = var.cluster_cpu_utilization_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Cluster CPU Utilization High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # coalesce() skips empty strings, so the module-wide message is used when no
  # monitor-specific message is given.
  message = coalesce(var.cluster_cpu_utilization_message, var.message)
  type    = "metric alert"

  query = <<EOQ
    ${var.cluster_cpu_utilization_time_aggregator}(${var.cluster_cpu_utilization_timeframe}):
      avg:aws.ecs.cluster.cpuutilization${module.filter-tags.query_alert} by {region,clustername}
    > ${var.cluster_cpu_utilization_threshold_critical}
EOQ

  # The critical value must stay identical to the one embedded in the query.
  thresholds = {
    critical = var.cluster_cpu_utilization_threshold_critical
    warning  = var.cluster_cpu_utilization_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:cluster"], var.cluster_cpu_utilization_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form ("silenced") is deprecated
    # in Terraform 0.12+.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
# Metric alert on the memory reservation of ECS clusters, grouped by region
# and cluster name. Disabled unless cluster_memory_reservation_enabled is "true".
resource "datadog_monitor" "cluster_memory_reservation" {
  count = var.cluster_memory_reservation_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Cluster Memory Reservation High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # coalesce() skips empty strings, so the module-wide message is used when no
  # monitor-specific message is given.
  message = coalesce(var.cluster_memory_reservation_message, var.message)
  type    = "metric alert"

  query = <<EOQ
    ${var.cluster_memory_reservation_time_aggregator}(${var.cluster_memory_reservation_timeframe}):
      avg:aws.ecs.cluster.memory_reservation${module.filter-tags.query_alert} by {region,clustername}
    > ${var.cluster_memory_reservation_threshold_critical}
EOQ

  # The critical value must stay identical to the one embedded in the query.
  thresholds = {
    critical = var.cluster_memory_reservation_threshold_critical
    warning  = var.cluster_memory_reservation_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:cluster"], var.cluster_memory_reservation_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form ("silenced") is deprecated
    # in Terraform 0.12+.
    ignore_changes = [silenced]
  }
}
|
||||
15
cloud/aws/ecs/ec2-cluster/outputs.tf
Normal file
15
cloud/aws/ecs/ec2-cluster/outputs.tf
Normal file
@ -0,0 +1,15 @@
|
||||
# Each output is a list: the monitors are declared with "count", so the list
# is empty when the corresponding monitor is disabled.

output "cluster_cpu_utilization_id" {
  description = "id for monitor cluster_cpu_utilization"
  value       = datadog_monitor.cluster_cpu_utilization[*].id
}

output "cluster_memory_reservation_id" {
  description = "id for monitor cluster_memory_reservation"
  value       = datadog_monitor.cluster_memory_reservation[*].id
}

output "ecs_agent_status_id" {
  description = "id for monitor ecs_agent_status"
  value       = datadog_monitor.ecs_agent_status[*].id
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user