Add monitors related to ECS (common to fargate / EC2 cluster, and dedicated to EC2 cluster)

2019-10-18 18:34:53 +02:00 · 2019-10-18 18:34:53 +02:00 · f65adb8ff5
commit f65adb8ff5
parent 793d34e627
11 changed files with 739 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -140,6 +140,9 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
 	- [aws](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/)
 		- [alb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/alb/)
 		- [apigateway](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/apigateway/)
+		- [ecs](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/)
+			- [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/common/)
+			- [ec2-cluster](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/ecs/ec2-cluster/)
 		- [elasticache](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticache/)
 			- [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticache/common/)
 			- [memcached](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticache/memcached/)
--- a/cloud/aws/ecs/common/README.md
+++ b/cloud/aws/ecs/common/README.md
@ -0,0 +1,67 @@
+# CLOUD AWS ECS COMMON DataDog monitors
+
+## How to use this module
+
+```
+module "datadog-monitors-cloud-aws-ecs-common" {
+  source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/ecs/common?ref={revision}"
+
+  environment = var.environment
+  message     = module.datadog-message-alerting.alerting-message
+}
+
+```
+
+## Purpose
+
+Creates DataDog monitors with the following checks:
+
+- ECS Service CPU Utilization High (disabled by default)
+- ECS Service Memory Utilization High (disabled by default)
+- ECS Service not healthy enough
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|:----:|:-----:|:-----:|
+| environment | Architecture environment | string | n/a | yes |
+| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
+| filter\_tags | Tags used for filtering | string | `"*"` | no |
+| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
+| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
+| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
+| message | Message sent when a monitor is triggered | string | n/a | yes |
+| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
+| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
+| service\_cpu\_utilization\_enabled | Flag to enable Service CPU Utilization monitor | string | `"false"` | no |
+| service\_cpu\_utilization\_extra\_tags | Extra tags for Service CPU Utilization monitor | list(string) | `[]` | no |
+| service\_cpu\_utilization\_message | Custom message for the Service CPU Utilization monitor | string | `""` | no |
+| service\_cpu\_utilization\_threshold\_critical | Critical threshold for the Service CPU Utilization monitor | string | `"90"` | no |
+| service\_cpu\_utilization\_threshold\_warning | Warning threshold for the Service CPU Utilization monitor | string | `"80"` | no |
+| service\_cpu\_utilization\_time\_aggregator | Monitor aggregator for Service CPU Utilization [available values: min, max or avg] | string | `"min"` | no |
+| service\_cpu\_utilization\_timeframe | Timeframe for the Service CPU Utilization monitor | string | `"last_5m"` | no |
+| service\_memory\_utilization\_enabled | Flag to enable Service Memory Utilization monitor | string | `"false"` | no |
+| service\_memory\_utilization\_extra\_tags | Extra tags for Service Memory Utilization monitor | list(string) | `[]` | no |
+| service\_memory\_utilization\_message | Custom message for the Service Memory Utilization monitor | string | `""` | no |
+| service\_memory\_utilization\_threshold\_critical | Critical threshold for the Service Memory Utilization monitor | string | `"90"` | no |
+| service\_memory\_utilization\_threshold\_warning | Warning threshold for the Service Memory Utilization monitor | string | `"85"` | no |
+| service\_memory\_utilization\_time\_aggregator | Monitor aggregator for Service Memory Utilization [available values: min, max or avg] | string | `"min"` | no |
+| service\_memory\_utilization\_timeframe | Timeframe for the Service Memory Utilization monitor | string | `"last_5m"` | no |
+| service\_missing\_tasks\_enabled | Flag to enable Service Missing Tasks monitor | string | `"true"` | no |
+| service\_missing\_tasks\_extra\_tags | Extra tags for Service Missing Tasks monitor | list(string) | `[]` | no |
+| service\_missing\_tasks\_message | Custom message for the Service Missing Tasks monitor | string | `""` | no |
+| service\_missing\_tasks\_threshold\_critical | Critical threshold for the Service Missing Tasks monitor | string | `"60"` | no |
+| service\_missing\_tasks\_threshold\_warning | Warning threshold for the Service Missing Tasks monitor | string | `"80"` | no |
+| service\_missing\_tasks\_time\_aggregator | Monitor aggregator for Service Missing Tasks [available values: min, max or avg] | string | `"min"` | no |
+| service\_missing\_tasks\_timeframe | Timeframe for the Service Missing Tasks monitor | string | `"last_5m"` | no |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| service\_cpu\_utilization\_id | id for monitor service_cpu_utilization |
+| service\_memory\_utilization\_id | id for monitor service_memory_utilization |
+| service\_missing\_tasks\_id | id for monitor service_missing_tasks |
+
+## Related documentation
+
--- a/cloud/aws/ecs/common/inputs.tf
+++ b/cloud/aws/ecs/common/inputs.tf
@ -0,0 +1,181 @@
+#
+# Datadog global variables
+#
+variable "environment" {
+  description = "Architecture environment"
+  type        = string # rendered as "[<environment>]" in monitor names and as the "env:" tag; also forwarded to the filter-tags module
+}
+
+variable "filter_tags" {
+  description = "Tags used for filtering"
+  default     = "*" # raw tag filter; the CPU and memory monitors filter through module.filter-tags and do not read this variable
+}
+
+variable "message" {
+  description = "Message sent when a monitor is triggered" # fallback used when a monitor's own *_message variable is empty
+}
+
+variable "evaluation_delay" {
+  description = "Delay in seconds for the metric evaluation"
+  default     = 900 # no type constraint declared; passed unchanged to every monitor's evaluation_delay
+}
+
+variable "new_host_delay" {
+  description = "Delay in seconds before monitor new resource"
+  default     = 300 # no type constraint declared; passed unchanged to every monitor's new_host_delay
+}
+
+variable "prefix_slug" {
+  description = "Prefix string to prepend between brackets on every monitors names"
+  default     = "" # an empty string disables the "[<prefix_slug>]" name prefix
+}
+
+variable "filter_tags_use_defaults" {
+  description = "Use default filter tags convention"
+  default     = "true" # forwarded as-is to the filter-tags module
+}
+
+variable "filter_tags_custom" {
+  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
+  default     = "*" # forwarded as-is to the filter-tags module
+}
+
+variable "filter_tags_custom_excluded" {
+  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
+  default     = "" # forwarded as-is to the filter-tags module
+}
+
+#
+# Service CPU Utilization
+#
+variable "service_cpu_utilization_enabled" {
+  description = "Flag to enable Service CPU Utilization monitor"
+  type        = string
+  default     = "false" # string flag: the monitor is created only when this equals "true"
+}
+
+variable "service_cpu_utilization_extra_tags" {
+  description = "Extra tags for Service CPU Utilization monitor"
+  type        = list(string)
+  default     = [] # appended to the monitor's built-in tags with concat()
+}
+
+variable "service_cpu_utilization_message" {
+  description = "Custom message for the Service CPU Utilization monitor"
+  type        = string
+  default     = "" # empty means the monitor falls back to var.message
+}
+
+variable "service_cpu_utilization_timeframe" {
+  description = "Timeframe for the Service CPU Utilization monitor"
+  type        = string
+  default     = "last_5m" # interpolated into the query as <time_aggregator>(<timeframe>)
+}
+
+variable "service_cpu_utilization_time_aggregator" {
+  description = "Monitor aggregator for Service CPU Utilization [available values: min, max or avg]"
+  type        = string
+  default     = "min" # interpolated into the query as <time_aggregator>(<timeframe>)
+}
+
+variable "service_cpu_utilization_threshold_critical" {
+  description = "Critical threshold for the Service CPU Utilization monitor"
+  type        = string
+  default     = "90" # used both as thresholds.critical and as the query's comparison value
+}
+
+variable "service_cpu_utilization_threshold_warning" {
+  description = "Warning threshold for the Service CPU Utilization monitor"
+  type        = string
+  default     = "80" # used as thresholds.warning
+}
+
+#
+# Service Memory Utilization
+#
+variable "service_memory_utilization_enabled" {
+  description = "Flag to enable Service Memory Utilization monitor"
+  type        = string
+  default     = "false" # string flag: the monitor is created only when this equals "true"
+}
+
+variable "service_memory_utilization_extra_tags" {
+  description = "Extra tags for Service Memory Utilization monitor"
+  type        = list(string)
+  default     = [] # appended to the monitor's built-in tags with concat()
+}
+
+variable "service_memory_utilization_message" {
+  description = "Custom message for the Service Memory Utilization monitor"
+  type        = string
+  default     = "" # empty means the monitor falls back to var.message
+}
+
+variable "service_memory_utilization_timeframe" {
+  description = "Timeframe for the Service Memory Utilization monitor"
+  type        = string
+  default     = "last_5m" # interpolated into the query as <time_aggregator>(<timeframe>)
+}
+
+variable "service_memory_utilization_time_aggregator" {
+  description = "Monitor aggregator for Service Memory Utilization [available values: min, max or avg]"
+  type        = string
+  default     = "min" # interpolated into the query as <time_aggregator>(<timeframe>)
+}
+
+variable "service_memory_utilization_threshold_critical" {
+  description = "Critical threshold for the Service Memory Utilization monitor"
+  type        = string
+  default     = "90" # quoted to match the declared string type; also the query's comparison value
+}
+
+variable "service_memory_utilization_threshold_warning" {
+  description = "Warning threshold for the Service Memory Utilization monitor"
+  type        = string
+  default     = "85" # quoted to match the declared string type; used as thresholds.warning
+}
+
+#
+# Service Missing tasks
+#
+variable "service_missing_tasks_enabled" {
+  description = "Flag to enable Service Missing Tasks monitor"
+  type        = string
+  default     = "true" # string flag: the monitor is created only when this equals "true"
+}
+
+variable "service_missing_tasks_extra_tags" {
+  description = "Extra tags for Service Missing Tasks monitor"
+  type        = list(string)
+  default     = [] # appended to the monitor's built-in tags with concat()
+}
+
+variable "service_missing_tasks_message" {
+  description = "Custom message for the Service Missing Tasks monitor"
+  type        = string
+  default     = "" # empty means the monitor falls back to var.message
+}
+
+variable "service_missing_tasks_timeframe" {
+  description = "Timeframe for the Service Missing Tasks monitor"
+  type        = string
+  default     = "last_5m" # interpolated into the query as <time_aggregator>(<timeframe>)
+}
+
+variable "service_missing_tasks_time_aggregator" {
+  description = "Monitor aggregator for Service Missing Tasks [available values: min, max or avg]"
+  type        = string
+  default     = "min" # NOTE(review): this monitor compares with "<"; confirm "min" (rather than "max") is the intended window aggregator
+}
+
+variable "service_missing_tasks_threshold_critical" {
+  description = "Critical threshold for the Service Missing Tasks monitor"
+  type        = string
+  default     = "60" # quoted to match the declared string type; below the warning value because the query alerts with "<"
+}
+
+variable "service_missing_tasks_threshold_warning" {
+  description = "Warning threshold for the Service Missing Tasks monitor"
+  type        = string
+  default     = "80" # quoted to match the declared string type; used as thresholds.warning
+}
--- a/cloud/aws/ecs/common/modules.tf
+++ b/cloud/aws/ecs/common/modules.tf
@ -0,0 +1,10 @@
+module "filter-tags" {
+  source = "../../../../common/filter-tags" # shared helper; its query_alert output is interpolated into the metric queries
+
+  resource                    = "aws_ecs"
+  environment                 = var.environment
+  filter_tags_custom          = var.filter_tags_custom
+  filter_tags_custom_excluded = var.filter_tags_custom_excluded
+  filter_tags_use_defaults    = var.filter_tags_use_defaults
+}
+
--- a/cloud/aws/ecs/common/monitors-ecs-common.tf
+++ b/cloud/aws/ecs/common/monitors-ecs-common.tf
@ -0,0 +1,104 @@
+# Monitors related to services
+resource "datadog_monitor" "service_cpu_utilization" {
+  count   = var.service_cpu_utilization_enabled == "true" ? 1 : 0 # opt-in: created only when the flag is the string "true"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Service CPU Utilization High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  message = coalesce(var.service_cpu_utilization_message, var.message) # custom message when set, otherwise the module-wide one
+  type    = "metric alert"
+
+  # One alert group per (region, servicename); the comparison value is the critical threshold.
+  query = <<EOQ
+    ${var.service_cpu_utilization_time_aggregator}(${var.service_cpu_utilization_timeframe}):
+      avg:aws.ecs.cpuutilization${module.filter-tags.query_alert} by {region,servicename}
+    > ${var.service_cpu_utilization_threshold_critical}
+EOQ
+
+  thresholds = {
+    critical = var.service_cpu_utilization_threshold_critical
+    warning  = var.service_cpu_utilization_threshold_warning
+  }
+
+  evaluation_delay    = var.evaluation_delay
+  new_host_delay      = var.new_host_delay
+  notify_no_data      = false
+  require_full_window = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+
+  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:service"], var.service_cpu_utilization_extra_tags)
+
+  lifecycle {
+    ignore_changes = [silenced] # bare reference; the quoted form is deprecated since Terraform 0.12
+  }
+}
+
+resource "datadog_monitor" "service_memory_utilization" {
+  count   = var.service_memory_utilization_enabled == "true" ? 1 : 0 # opt-in: created only when the flag is the string "true"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Service Memory Utilization High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  message = coalesce(var.service_memory_utilization_message, var.message) # custom message when set, otherwise the module-wide one
+  type    = "metric alert"
+
+  query = <<EOQ
+    ${var.service_memory_utilization_time_aggregator}(${var.service_memory_utilization_timeframe}):
+      avg:aws.ecs.memory_utilization${module.filter-tags.query_alert} by {region,servicename}
+    > ${var.service_memory_utilization_threshold_critical}
+EOQ
+
+  thresholds = {
+    critical = var.service_memory_utilization_threshold_critical # same value the query compares against
+    warning  = var.service_memory_utilization_threshold_warning
+  }
+
+  evaluation_delay    = var.evaluation_delay
+  new_host_delay      = var.new_host_delay
+  notify_no_data      = false
+  require_full_window = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+
+  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:service"], var.service_memory_utilization_extra_tags)
+
+  lifecycle {
+    ignore_changes = [silenced] # bare reference; the quoted form is deprecated since Terraform 0.12
+  }
+}
+
+resource "datadog_monitor" "service_missing_tasks" {
+  count   = var.service_missing_tasks_enabled == "true" ? 1 : 0 # created only when the flag is the string "true"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Service not healthy enough {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  message = coalesce(var.service_missing_tasks_message, var.message) # custom message when set, otherwise the module-wide one
+  type    = "metric alert"
+
+  # Running tasks as a percentage of desired tasks per (region, servicename), scoped by the shared filter-tags module like the other monitors.
+  query = <<EOQ
+  ${var.service_missing_tasks_time_aggregator}(${var.service_missing_tasks_timeframe}):
+    avg:aws.ecs.service.running${module.filter-tags.query_alert} by {region,servicename} / avg:aws.ecs.service.desired${module.filter-tags.query_alert} by {region,servicename}
+  * 100 < ${var.service_missing_tasks_threshold_critical}
+EOQ
+
+  thresholds = {
+    critical = var.service_missing_tasks_threshold_critical
+    warning  = var.service_missing_tasks_threshold_warning
+  }
+
+  evaluation_delay    = var.evaluation_delay
+  new_host_delay      = var.new_host_delay
+  notify_no_data      = false
+  require_full_window = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+
+  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:service"], var.service_missing_tasks_extra_tags)
+
+  lifecycle {
+    ignore_changes = [silenced] # bare reference; the quoted form is deprecated since Terraform 0.12
+  }
+}
--- a/cloud/aws/ecs/common/outputs.tf
+++ b/cloud/aws/ecs/common/outputs.tf
@ -0,0 +1,15 @@
+output "service_cpu_utilization_id" {
+  description = "id for monitor service_cpu_utilization"
+  value       = datadog_monitor.service_cpu_utilization[*].id # list of ids: empty when the monitor's count is 0
+}
+
+output "service_memory_utilization_id" {
+  description = "id for monitor service_memory_utilization"
+  value       = datadog_monitor.service_memory_utilization[*].id # list of ids: empty when the monitor's count is 0
+}
+
+output "service_missing_tasks_id" {
+  description = "id for monitor service_missing_tasks"
+  value       = datadog_monitor.service_missing_tasks[*].id # list of ids: empty when the monitor's count is 0
+}
+
--- a/cloud/aws/ecs/ec2-cluster/README.md
+++ b/cloud/aws/ecs/ec2-cluster/README.md
@ -0,0 +1,64 @@
+# CLOUD AWS ECS EC2-CLUSTER DataDog monitors
+
+## How to use this module
+
+```
+module "datadog-monitors-cloud-aws-ecs-ec2-cluster" {
+  source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/ecs/ec2-cluster?ref={revision}"
+
+  environment = var.environment
+  message     = module.datadog-message-alerting.alerting-message
+}
+
+```
+
+## Purpose
+
+Creates DataDog monitors with the following checks:
+
+- ECS Agent disconnected
+- ECS Cluster CPU Utilization High (disabled by default)
+- ECS Cluster Memory Reservation High (disabled by default)
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|:----:|:-----:|:-----:|
+| agent\_status\_enabled | Flag to enable Agent Status monitor | string | `"true"` | no |
+| agent\_status\_extra\_tags | Extra tags for Agent Status monitor | list(string) | `[]` | no |
+| agent\_status\_message | Custom message for the Agent Status monitor | string | `""` | no |
+| agent\_status\_no\_data\_timeframe | Agent status does not respond monitor no data timeframe | string | `"10"` | no |
+| agent\_status\_threshold\_warning | Warning threshold for the Agent Status monitor | string | `"3"` | no |
+| cluster\_cpu\_utilization\_enabled | Flag to enable Cluster CPU utilization monitor | string | `"false"` | no |
+| cluster\_cpu\_utilization\_extra\_tags | Extra tags for Cluster CPU utilization monitor | list(string) | `[]` | no |
+| cluster\_cpu\_utilization\_message | Custom message for the Cluster CPU Utilization monitor | string | `""` | no |
+| cluster\_cpu\_utilization\_threshold\_critical | Critical threshold for the Cluster CPU Utilization monitor | string | `"90"` | no |
+| cluster\_cpu\_utilization\_threshold\_warning | Warning threshold for the Cluster CPU Utilization monitor | string | `"85"` | no |
+| cluster\_cpu\_utilization\_time\_aggregator | Monitor aggregator for Cluster CPU Utilization [available values: min, max or avg] | string | `"min"` | no |
+| cluster\_cpu\_utilization\_timeframe | Timeframe for the Cluster CPU Utilization monitor | string | `"last_5m"` | no |
+| cluster\_memory\_reservation\_enabled | Flag to enable Cluster memory reservation monitor | string | `"false"` | no |
+| cluster\_memory\_reservation\_extra\_tags | Extra tags for Cluster Memory Reservation monitor | list(string) | `[]` | no |
+| cluster\_memory\_reservation\_message | Custom message for the Cluster Memory Reservation monitor | string | `""` | no |
+| cluster\_memory\_reservation\_threshold\_critical | Critical threshold for the Cluster Memory Reservation monitor | string | `"90"` | no |
+| cluster\_memory\_reservation\_threshold\_warning | Warning threshold for the Cluster Memory Reservation monitor | string | `"85"` | no |
+| cluster\_memory\_reservation\_time\_aggregator | Monitor aggregator for Cluster Memory Reservation [available values: min, max or avg] | string | `"min"` | no |
+| cluster\_memory\_reservation\_timeframe | Timeframe for the Cluster Memory Reservation monitor | string | `"last_5m"` | no |
+| environment | Architecture environment | string | n/a | yes |
+| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
+| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
+| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
+| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
+| message | Message sent when a monitor is triggered | string | n/a | yes |
+| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
+| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| cluster\_cpu\_utilization\_id | id for monitor cluster_cpu_utilization |
+| cluster\_memory\_reservation\_id | id for monitor cluster_memory_reservation |
+| ecs\_agent\_status\_id | id for monitor ecs_agent_status |
+
+## Related documentation
+
--- a/cloud/aws/ecs/ec2-cluster/inputs.tf
+++ b/cloud/aws/ecs/ec2-cluster/inputs.tf
@ -0,0 +1,165 @@
+#
+# Datadog global variables
+#
+variable "environment" {
+  description = "Architecture environment"
+  type        = string # rendered as "[<environment>]" in monitor names and as the "env:" tag; also forwarded to the filter-tags module
+}
+
+variable "message" {
+  description = "Message sent when a monitor is triggered" # fallback used when a monitor's own *_message variable is empty
+}
+
+variable "evaluation_delay" {
+  description = "Delay in seconds for the metric evaluation"
+  default     = 900 # no type constraint declared; passed unchanged to every monitor's evaluation_delay
+}
+
+variable "new_host_delay" {
+  description = "Delay in seconds before monitor new resource"
+  default     = 300 # no type constraint declared; passed unchanged to every monitor's new_host_delay
+}
+
+variable "prefix_slug" {
+  description = "Prefix string to prepend between brackets on every monitors names"
+  default     = "" # an empty string disables the "[<prefix_slug>]" name prefix
+}
+
+variable "filter_tags_use_defaults" {
+  description = "Use default filter tags convention"
+  default     = "true" # forwarded as-is to the filter-tags module
+}
+
+variable "filter_tags_custom" {
+  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
+  default     = "*" # forwarded as-is to the filter-tags module
+}
+
+variable "filter_tags_custom_excluded" {
+  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
+  default     = "" # forwarded as-is to the filter-tags module
+}
+
+#
+# Agent Status
+#
+variable "agent_status_enabled" {
+  description = "Flag to enable Agent Status monitor"
+  type        = string
+  default     = "true" # string flag: the monitor is created only when this equals "true"
+}
+
+variable "agent_status_extra_tags" {
+  description = "Extra tags for Agent Status monitor"
+  type        = list(string)
+  default     = [] # appended to the monitor's built-in tags with concat()
+}
+
+variable "agent_status_message" {
+  description = "Custom message for the Agent Status monitor"
+  type        = string
+  default     = "" # empty means the monitor falls back to var.message
+}
+
+variable "agent_status_threshold_warning" {
+  description = "Warning threshold for the Agent Status monitor"
+  type        = string
+  default     = "3" # quoted to match the declared string type; used as thresholds.warning
+}
+
+variable "agent_status_no_data_timeframe" {
+  description = "Agent status does not respond monitor no data timeframe"
+  type        = string
+  default     = "10" # quoted to match the declared string type; passed to the monitor's no_data_timeframe
+}
+
+#
+# Cluster CPU Utilization
+#
+variable "cluster_cpu_utilization_enabled" {
+  description = "Flag to enable Cluster CPU utilization monitor"
+  type        = string
+  default     = "false" # string flag: the monitor is created only when this equals "true"
+}
+
+variable "cluster_cpu_utilization_extra_tags" {
+  description = "Extra tags for Cluster CPU utilization monitor"
+  type        = list(string)
+  default     = [] # appended to the monitor's built-in tags with concat()
+}
+
+variable "cluster_cpu_utilization_message" {
+  description = "Custom message for the Cluster CPU Utilization monitor"
+  type        = string
+  default     = "" # empty means the monitor falls back to var.message
+}
+
+variable "cluster_cpu_utilization_time_aggregator" {
+  description = "Monitor aggregator for Cluster CPU Utilization [available values: min, max or avg]"
+  type        = string
+  default     = "min" # interpolated into the query as <time_aggregator>(<timeframe>)
+}
+
+variable "cluster_cpu_utilization_timeframe" {
+  description = "Timeframe for the Cluster CPU Utilization monitor"
+  type        = string
+  default     = "last_5m" # interpolated into the query as <time_aggregator>(<timeframe>)
+}
+
+variable "cluster_cpu_utilization_threshold_critical" {
+  description = "Critical threshold for the Cluster CPU Utilization monitor"
+  type        = string
+  default     = "90" # quoted to match the declared string type; also the query's comparison value
+}
+
+variable "cluster_cpu_utilization_threshold_warning" {
+  description = "Warning threshold for the Cluster CPU Utilization monitor"
+  type        = string
+  default     = "85" # quoted to match the declared string type; used as thresholds.warning
+}
+
+
+#
+# Cluster Memory Reservation
+#
+variable "cluster_memory_reservation_enabled" {
+  description = "Flag to enable Cluster memory reservation monitor"
+  type        = string
+  default     = "false" # string flag: the monitor is created only when this equals "true"
+}
+
+variable "cluster_memory_reservation_extra_tags" {
+  description = "Extra tags for Cluster Memory Reservation monitor"
+  type        = list(string)
+  default     = [] # appended to the monitor's built-in tags with concat()
+}
+
+variable "cluster_memory_reservation_message" {
+  description = "Custom message for the Cluster Memory Reservation monitor"
+  type        = string
+  default     = "" # empty means the monitor falls back to var.message
+}
+
+variable "cluster_memory_reservation_time_aggregator" {
+  description = "Monitor aggregator for Cluster Memory Reservation [available values: min, max or avg]"
+  type        = string
+  default     = "min" # interpolated into the query as <time_aggregator>(<timeframe>)
+}
+
+variable "cluster_memory_reservation_timeframe" {
+  description = "Timeframe for the Cluster Memory Reservation monitor"
+  type        = string
+  default     = "last_5m" # interpolated into the query as <time_aggregator>(<timeframe>)
+}
+
+variable "cluster_memory_reservation_threshold_critical" {
+  description = "Critical threshold for the Cluster Memory Reservation monitor"
+  type        = string
+  default     = "90" # quoted to match the declared string type; also the query's comparison value
+}
+
+variable "cluster_memory_reservation_threshold_warning" {
+  description = "Warning threshold for the Cluster Memory Reservation monitor"
+  type        = string
+  default     = "85" # quoted to match the declared string type; used as thresholds.warning
+}
--- a/cloud/aws/ecs/ec2-cluster/modules.tf
+++ b/cloud/aws/ecs/ec2-cluster/modules.tf
@ -0,0 +1,10 @@
+module "filter-tags" {
+  source = "../../../../common/filter-tags" # shared helper; its query_alert and service_check outputs are interpolated into the monitor queries
+
+  resource                    = "aws_ecs"
+  environment                 = var.environment
+  filter_tags_custom          = var.filter_tags_custom
+  filter_tags_custom_excluded = var.filter_tags_custom_excluded
+  filter_tags_use_defaults    = var.filter_tags_use_defaults
+}
+
--- a/cloud/aws/ecs/ec2-cluster/monitors-ecs-ec2-cluster.tf
+++ b/cloud/aws/ecs/ec2-cluster/monitors-ecs-ec2-cluster.tf
@ -0,0 +1,105 @@
+# Monitors related to ECS Cluster
+resource "datadog_monitor" "ecs_agent_status" {
+  count   = var.agent_status_enabled == "true" ? 1 : 0 # created only when the flag is the string "true"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Agent disconnected {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  message = coalesce(var.agent_status_message, var.message) # custom message when set, otherwise the module-wide one
+  type    = "service check" # NOTE(review): the name suffix reuses the metric monitors' "%" template; confirm it renders sensibly for a check count
+
+  # Status counts over the last 6 results of the check, grouped by cluster and instance_id.
+  query = <<EOQ
+    "aws.ecs.agent_connected"${module.filter-tags.service_check}.by("cluster","instance_id").last(6).count_by_status()
+EOQ
+
+  thresholds = {
+    warning  = var.agent_status_threshold_warning
+    critical = 5 # fixed value, not exposed as a variable
+  }
+
+  evaluation_delay    = var.evaluation_delay
+  new_host_delay      = var.new_host_delay
+  notify_no_data      = true
+  no_data_timeframe   = var.agent_status_no_data_timeframe
+  require_full_window = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+
+  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:agent"], var.agent_status_extra_tags)
+
+  lifecycle {
+    ignore_changes = [silenced] # bare reference; the quoted form is deprecated since Terraform 0.12
+  }
+}
+
+resource "datadog_monitor" "cluster_cpu_utilization" {
+  count   = var.cluster_cpu_utilization_enabled == "true" ? 1 : 0 # opt-in: created only when the flag is the string "true"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Cluster CPU Utilization High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  message = coalesce(var.cluster_cpu_utilization_message, var.message) # custom message when set, otherwise the module-wide one
+  type    = "metric alert"
+
+  # One alert group per (region, clustername); the comparison value is the critical threshold.
+  query = <<EOQ
+    ${var.cluster_cpu_utilization_time_aggregator}(${var.cluster_cpu_utilization_timeframe}):
+      avg:aws.ecs.cluster.cpuutilization${module.filter-tags.query_alert} by {region,clustername}
+    > ${var.cluster_cpu_utilization_threshold_critical}
+EOQ
+
+  thresholds = {
+    critical = var.cluster_cpu_utilization_threshold_critical
+    warning  = var.cluster_cpu_utilization_threshold_warning
+  }
+
+  evaluation_delay    = var.evaluation_delay
+  new_host_delay      = var.new_host_delay
+  notify_no_data      = false
+  require_full_window = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+
+  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:cluster"], var.cluster_cpu_utilization_extra_tags)
+
+  lifecycle {
+    # Bare attribute reference; the quoted form is deprecated since Terraform 0.12.
+    ignore_changes = [silenced]
+  }
+}
+
+resource "datadog_monitor" "cluster_memory_reservation" {
+  count   = var.cluster_memory_reservation_enabled == "true" ? 1 : 0 # opt-in: created only when the flag is the string "true"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ECS Cluster Memory Reservation High {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  message = coalesce(var.cluster_memory_reservation_message, var.message) # custom message when set, otherwise the module-wide one
+  type    = "metric alert"
+
+  # One alert group per (region, clustername); the comparison value is the critical threshold.
+  query = <<EOQ
+    ${var.cluster_memory_reservation_time_aggregator}(${var.cluster_memory_reservation_timeframe}):
+      avg:aws.ecs.cluster.memory_reservation${module.filter-tags.query_alert} by {region,clustername}
+    > ${var.cluster_memory_reservation_threshold_critical}
+EOQ
+
+  thresholds = {
+    critical = var.cluster_memory_reservation_threshold_critical
+    warning  = var.cluster_memory_reservation_threshold_warning
+  }
+
+  evaluation_delay    = var.evaluation_delay
+  new_host_delay      = var.new_host_delay
+  notify_no_data      = false
+  require_full_window = false
+  renotify_interval   = 0
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+
+  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:ecs", "team:claranet", "created-by:terraform", "category:cluster"], var.cluster_memory_reservation_extra_tags)
+
+  lifecycle {
+    ignore_changes = [silenced] # bare reference; the quoted form is deprecated since Terraform 0.12
+  }
+}
--- a/cloud/aws/ecs/ec2-cluster/outputs.tf
+++ b/cloud/aws/ecs/ec2-cluster/outputs.tf
@ -0,0 +1,15 @@
+output "cluster_cpu_utilization_id" {
+  description = "id for monitor cluster_cpu_utilization"
+  value       = datadog_monitor.cluster_cpu_utilization[*].id # list of ids: empty when the monitor's count is 0
+}
+
+output "cluster_memory_reservation_id" {
+  description = "id for monitor cluster_memory_reservation"
+  value       = datadog_monitor.cluster_memory_reservation[*].id # list of ids: empty when the monitor's count is 0
+}
+
+output "ecs_agent_status_id" {
+  description = "id for monitor ecs_agent_status"
+  value       = datadog_monitor.ecs_agent_status[*].id # list of ids: empty when the monitor's count is 0
+}
+