diff --git a/cloud/aws/elasticache/README.md b/cloud/aws/elasticache/README.md index 6f395a4..14963ea 100644 --- a/cloud/aws/elasticache/README.md +++ b/cloud/aws/elasticache/README.md @@ -10,6 +10,8 @@ module "datadog-monitors-aws-elasticcache" { message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" + filter_tags = "${var.filter_tags}" + resource = "${var.type_of_resource}" } ``` @@ -18,24 +20,22 @@ Purpose ------- Creates DataDog monitors with the following checks : -* CPU High +* Eviction Inputs ------ | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| cpu_aggregator | Monitor aggregator for Elasticache CPU high [available values: min, max, sum or avg] | string | `min` | no | -| cpu_message | Custom message for Elasticache CPU high monitor | string | `` | no | -| cpu_silenced | Groups to mute for Elasticache CPU high monitor | map | `` | no | -| cpu_threshold_critical | Elasticache CPU high critical threshold in percentage | string | `95` | no | -| cpu_threshold_warning | Elasticache CPU high warning threshold in percentage | string | `80` | no | -| cpu_timeframe | Monitor timeframe for Elasticache CPU high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | | delay | Delay in seconds for the metric evaluation | string | `900` | no | | environment | Architecture Environment | string | - | yes | -| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | -| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| eviction_aggregator | Monitor aggregator for Elasticache eviction [available values: min, max, sum or avg] | string | `min` | no | +| eviction_message | Custom message for Elasticache eviction monitor | string | `` | no | +| eviction_silenced | Groups to mute for Elasticache eviction monitor | map | `` | no | 
+| eviction_timeframe | Monitor timeframe for Elasticache eviction [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| filter_tags | Tags used for filtering | string | - | yes | | message | Message sent when an alert is triggered | string | - | yes | +| resource | Type of Elasticache used | string | - | yes | Related documentation --------------------- diff --git a/cloud/aws/elasticache/inputs.tf b/cloud/aws/elasticache/inputs.tf index 2e6aa71..5106e23 100644 --- a/cloud/aws/elasticache/inputs.tf +++ b/cloud/aws/elasticache/inputs.tf @@ -14,46 +14,34 @@ variable "message" { description = "Message sent when an alert is triggered" } -variable "filter_tags_use_defaults" { - description = "Use default filter tags convention" - default = "true" +variable "filter_tags" { + description = "Tags used for filtering" } -variable "filter_tags_custom" { - description = "Tags used for custom filtering when filter_tags_use_defaults is false" - default = "*" +variable "resource" { + description = "Type of Elasticache used" } # Elasticache specific -variable "cpu_silenced" { - description = "Groups to mute for Elasticache CPU high monitor" +variable "eviction_silenced" { + description = "Groups to mute for Elasticache eviction monitor" type = "map" default = {} } -variable "cpu_message" { - description = "Custom message for Elasticache CPU high monitor" +variable "eviction_message" { + description = "Custom message for Elasticache eviction monitor" type = "string" default = "" } -variable "cpu_aggregator" { - description = "Monitor aggregator for Elasticache CPU high [available values: min, max, sum or avg]" +variable "eviction_aggregator" { + description = "Monitor aggregator for Elasticache eviction [available values: min, max, sum or avg]" type = "string" default = "min" } -variable "cpu_timeframe" { - description = "Monitor timeframe for Elasticache CPU high [available values: `last_#m` (1, 5, 10, 15, or 30), 
`last_#h` (1, 2, or 4), or `last_1d`]" - default = "last_15m" -} - -variable "cpu_threshold_warning" { - description = "Elasticache CPU high warning threshold in percentage" - default = 80 -} - -variable "cpu_threshold_critical" { - description = "Elasticache CPU high critical threshold in percentage" - default = 95 +variable "eviction_timeframe" { + description = "Monitor timeframe for Elasticache eviction [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_5m" } diff --git a/cloud/aws/elasticache/monitors-elasticache.tf b/cloud/aws/elasticache/monitors-elasticache.tf index 798c80e..271d456 100644 --- a/cloud/aws/elasticache/monitors-elasticache.tf +++ b/cloud/aws/elasticache/monitors-elasticache.tf @@ -1,29 +1,16 @@ -data "template_file" "filter" { - template = "$${filter}" - - vars { - filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_ec:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" - } -} - -resource "datadog_monitor" "elasticache_cpu_high" { - name = "[${var.environment}] Elasticache CPU high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" - message = "${coalesce(var.cpu_message, var.message)}" +resource "datadog_monitor" "elasticache_eviction" { + name = "[${var.environment}] Elasticache ${var.resource} eviction {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}" + message = "${coalesce(var.eviction_message, var.message)}" type = "metric alert" query = < ${var.cpu_threshold_critical} + ${var.eviction_aggregator}(${var.eviction_timeframe}): ( + ${var.eviction_aggregator}:aws.elasticache.evictions{${var.filter_tags}} by {region,cluster} + ) > 0 EOF - thresholds { - warning = "${var.cpu_threshold_warning}" - critical = "${var.cpu_threshold_critical}" - } - - notify_no_data = true + notify_no_data = false evaluation_delay = 
"${var.delay}" renotify_interval = 0 notify_audit = false @@ -33,7 +20,7 @@ resource "datadog_monitor" "elasticache_cpu_high" { require_full_window = false new_host_delay = "${var.delay}" - silenced = "${var.cpu_silenced}" + silenced = "${var.eviction_silenced}" - tags = ["env:${var.environment}", "resource:elasticache", "team:aws", "provider:aws"] + tags = ["env:${var.environment}", "resource:${var.resource}", "team:aws", "provider:aws"] } diff --git a/cloud/aws/elasticache/redis/README.md b/cloud/aws/elasticache/redis/README.md index dd09c3e..888b448 100644 --- a/cloud/aws/elasticache/redis/README.md +++ b/cloud/aws/elasticache/redis/README.md @@ -8,8 +8,9 @@ How to use this module module "datadog-monitors-aws-elasticcache-redis" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/aws/elasticache/redis?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" + message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" + redis_size = "${var.size_of_redis}" } ``` @@ -18,28 +19,44 @@ Purpose ------- Creates DataDog monitors with the following checks : -* Cache Miss +* Cache Hit +* CPU High +* Swap Inputs ------ | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| cache_miss_aggregator | Monitor aggregator for Elasticache Redis cache miss [available values: min, max, sum or avg] | string | `min` | no | -| cache_miss_message | Custom message for Elasticache Redis cache miss monitor | string | `` | no | -| cache_miss_silenced | Groups to mute for Elasticache Redis cache miss monitor | map | `` | no | -| cache_miss_threshold_critical | Elasticache Redis cache miss critical threshold in percentage | string | `95` | no | -| cache_miss_threshold_warning | Elasticache Redis cache miss warning threshold in percentage | string | `80` | no | -| cache_miss_timeframe | Monitor timeframe for Elasticache Redis cache miss [available values: 
`last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| cache_hit_aggregator | Monitor aggregator for Elasticache Redis cache hit [available values: min, max, sum or avg] | string | `min` | no | +| cache_hit_message | Custom message for Elasticache Redis cache hit monitor | string | `` | no | +| cache_hit_silenced | Groups to mute for Elasticache Redis cache hit monitor | map | `` | no | +| cache_hit_threshold_critical | Elasticache Redis cache hit critical threshold in percentage | string | `10` | no | +| cache_hit_threshold_warning | Elasticache Redis cache hit warning threshold in percentage | string | `20` | no | +| cache_hit_timeframe | Monitor timeframe for Elasticache Redis cache hit [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| cpu_high_aggregator | Monitor aggregator for Elasticache Redis cpu high [available values: min, max, sum or avg] | string | `min` | no | +| cpu_high_message | Custom message for Elasticache Redis cpu high monitor | string | `` | no | +| cpu_high_silenced | Groups to mute for Elasticache Redis cpu high monitor | map | `` | no | +| cpu_high_threshold_critical | Elasticache Redis cpu high critical threshold in percentage | string | `90` | no | +| cpu_high_threshold_warning | Elasticache Redis cpu high warning threshold in percentage | string | `75` | no | +| cpu_high_timeframe | Monitor timeframe for Elasticache Redis cpu high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | | delay | Delay in seconds for the metric evaluation | string | `900` | no | | environment | Architecture Environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when 
an alert is triggered | string | - | yes | +| redis_size | Size of the Elasticache Redis instance | string | - | yes | +| swap_aggregator | Monitor aggregator for Elasticache Redis swap [available values: min, max, sum or avg] | string | `min` | no | +| swap_message | Custom message for Elasticache Redis swap monitor | string | `` | no | +| swap_silenced | Groups to mute for Elasticache Redis swap monitor | map | `` | no | +| swap_timeframe | Monitor timeframe for Elasticache Redis swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | Related documentation --------------------- DataDog documentation: [https://docs.datadoghq.com/integrations/amazon_elasticache/](https://docs.datadoghq.com/integrations/amazon_elasticache/) +And more here : +* [https://www.datadoghq.com/dashboards/elasticache-dashboard-redis/](https://www.datadoghq.com/dashboards/elasticache-dashboard-redis/) +* [https://www.datadoghq.com/blog/monitoring-elasticache-performance-metrics-with-redis-or-memcached/](https://www.datadoghq.com/blog/monitoring-elasticache-performance-metrics-with-redis-or-memcached/) AWS ElasticSearch Service Instance metrics documentation: [https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/elasticache-metricscollected.html](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/elasticache-metricscollected.html) diff --git a/cloud/aws/elasticache/redis/core.tf b/cloud/aws/elasticache/redis/core.tf new file mode 100644 index 0000000..988ba69 --- /dev/null +++ b/cloud/aws/elasticache/redis/core.tf @@ -0,0 +1,31 @@ +variable "core" { + type = "map" + + description = "Mapping between Redis size and vCPU." 
+ + default = { + cache.t2.micro = "1" + cache.t2.small = "1" + cache.t2.medium = "2" + cache.m3.medium = "1" + cache.m3.large = "2" + cache.m3.xlarge = "4" + cache.m3.2xlarge = "8" + cache.m4.large = "2" + cache.m4.xlarge = "4" + cache.m4.2xlarge = "8" + cache.m4.4xlarge = "16" + cache.m4.10xlarge = "40" + cache.r3.large = "2" + cache.r3.xlarge = "4" + cache.r3.2xlarge = "8" + cache.r3.4xlarge = "16" + cache.r3.8xlarge = "32" + cache.r4.large = "2" + cache.r4.xlarge = "4" + cache.r4.2xlarge = "8" + cache.r4.4xlarge = "16" + cache.r4.8xlarge = "32" + cache.r4.16xlarge = "64" + } +} diff --git a/cloud/aws/elasticache/redis/inputs.tf b/cloud/aws/elasticache/redis/inputs.tf index 4ca8823..5c963f2 100644 --- a/cloud/aws/elasticache/redis/inputs.tf +++ b/cloud/aws/elasticache/redis/inputs.tf @@ -25,35 +25,96 @@ variable "filter_tags_custom" { } # Redis specific -variable "cache_miss_silenced" { +variable "redis_size" { + description = "Size of the Elasticache Redis instance" + type = "string" +} + +variable "cache_hit_silenced" { - description = "Groups to mute for Elasticache Redis cache miss monitor" + description = "Groups to mute for Elasticache Redis cache hit monitor" type = "map" default = {} } -variable "cache_miss_message" { +variable "cache_hit_message" { - description = "Custom message for Elasticache Redis cache miss monitor" + description = "Custom message for Elasticache Redis cache hit monitor" type = "string" default = "" } -variable "cache_miss_aggregator" { +variable "cache_hit_aggregator" { - description = "Monitor aggregator for Elasticache Redis cache miss [available values: min, max, sum or avg]" + description = "Monitor aggregator for Elasticache Redis cache hit [available values: min, max, sum or avg]" type = "string" default = "min" } -variable "cache_miss_timeframe" { +variable "cache_hit_timeframe" { - description = "Monitor timeframe for Elasticache Redis cache miss [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + description = "Monitor timeframe for Elasticache Redis cache hit [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" default = "last_15m" } -variable "cache_miss_threshold_warning" { +variable "cache_hit_threshold_warning" { - description = "Elasticache Redis cache miss warning threshold in percentage" + description = "Elasticache Redis cache hit warning threshold in percentage" - default = 80 + default = 20 } -variable 
"cache_miss_threshold_critical" { +variable "cache_hit_threshold_critical" { - description = "Elasticache Redis cache miss critical threshold in percentage" + description = "Elasticache Redis cache hit critical threshold in percentage" - default = 95 + default = 10 +} + +variable "cpu_high_silenced" { + description = "Groups to mute for Elasticache Redis cpu high monitor" + type = "map" + default = {} +} + +variable "cpu_high_message" { + description = "Custom message for Elasticache Redis cpu high monitor" + type = "string" + default = "" +} + +variable "cpu_high_aggregator" { + description = "Monitor aggregator for Elasticache Redis cpu high [available values: min, max, sum or avg]" + type = "string" + default = "min" +} + +variable "cpu_high_timeframe" { + description = "Monitor timeframe for Elasticache Redis cpu high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_15m" +} + +variable "cpu_high_threshold_warning" { + description = "Elasticache Redis cpu high warning threshold in percentage" + default = 75 +} + +variable "cpu_high_threshold_critical" { + description = "Elasticache Redis cpu high critical threshold in percentage" + default = 90 +} + +variable "swap_silenced" { + description = "Groups to mute for Elasticache Redis swap monitor" + type = "map" + default = {} +} + +variable "swap_message" { + description = "Custom message for Elasticache Redis swap monitor" + type = "string" + default = "" +} + +variable "swap_aggregator" { + description = "Monitor aggregator for Elasticache Redis swap [available values: min, max, sum or avg]" + type = "string" + default = "min" +} + +variable "swap_timeframe" { + description = "Monitor timeframe for Elasticache Redis swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_15m" } diff --git a/cloud/aws/elasticache/redis/monitors-redis.tf b/cloud/aws/elasticache/redis/monitors-redis.tf index 3dbacc0..4b8100d 100644 --- a/cloud/aws/elasticache/redis/monitors-redis.tf +++ 
b/cloud/aws/elasticache/redis/monitors-redis.tf @@ -6,30 +6,32 @@ data "template_file" "filter" { } } -module "datadog-monitors-aws-elasticcache" { +module "datadog-monitors-aws-elasticcache-common" { source = "../." message = "${var.message}" environment = "${var.environment}" + filter_tags = "${data.template_file.filter.rendered}" + resource = "redis" } -resource "datadog_monitor" "redis_cache_miss" { - name = "[${var.environment}] Elasticache Redis cache miss {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" - message = "${coalesce(var.cache_miss_message, var.message)}" +resource "datadog_monitor" "redis_cache_hit" { + name = "[${var.environment}] Elasticache redis cache hit {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.cache_hit_message, var.message)}" type = "metric alert" query = < ${var.cache_miss_threshold_critical} + ${var.cache_hit_aggregator}(${var.cache_hit_timeframe}): ( + ${var.cache_hit_aggregator}:aws.elasticache.cache_hits{${data.template_file.filter.rendered}} by {region,cluster} / + (${var.cache_hit_aggregator}:aws.elasticache.cache_hits{${data.template_file.filter.rendered}} by {region,cluster} + + ${var.cache_hit_aggregator}:aws.elasticache.cache_misses{${data.template_file.filter.rendered}} by {region,cluster}) + ) * 100 < ${var.cache_hit_threshold_critical} EOF thresholds { - warning = "${var.cache_miss_threshold_warning}" - critical = "${var.cache_miss_threshold_critical}" + warning = "${var.cache_hit_threshold_warning}" + critical = "${var.cache_hit_threshold_critical}" } notify_no_data = true @@ -42,7 +44,66 @@ resource "datadog_monitor" "redis_cache_miss" { require_full_window = false new_host_delay = "${var.delay}" - silenced = "${var.cache_miss_silenced}" + silenced = "${var.cache_hit_silenced}" + + tags = 
["env:${var.environment}", "resource:redis", "team:aws", "provider:aws"] +} + +resource "datadog_monitor" "redis_cpu_high" { + name = "[${var.environment}] Elasticache redis CPU {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.cpu_high_message, var.message)}" + + type = "metric alert" + + query = < ( ${var.cpu_high_threshold_critical} / ${var.core[var.redis_size]} ) + EOF + + thresholds { + warning = "${var.cpu_high_threshold_warning}" + critical = "${var.cpu_high_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + new_host_delay = "${var.delay}" + + silenced = "${var.cpu_high_silenced}" + + tags = ["env:${var.environment}", "resource:redis", "team:aws", "provider:aws"] +} + +resource "datadog_monitor" "redis_swap" { + name = "[${var.environment}] Elasticache redis is starting to swap ({{value}}MB)" + message = "${coalesce(var.swap_message, var.message)}" + + type = "metric alert" + + query = < 0 + EOF + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + new_host_delay = "${var.delay}" + + silenced = "${var.swap_silenced}" tags = ["env:${var.environment}", "resource:redis", "team:aws", "provider:aws"] }