diff --git a/cloud/aws/elasticache/common/README.md b/cloud/aws/elasticache/common/README.md index 56c77d3..802c0c1 100644 --- a/cloud/aws/elasticache/common/README.md +++ b/cloud/aws/elasticache/common/README.md @@ -16,13 +16,15 @@ module "datadog-monitors-aws-elasticache" { ``` -This module is used by default when you define `memcached` or `redis` monitors +This module is loaded by default when you define `memcached` or `redis` monitors Purpose ------- Creates DataDog monitors with the following checks: * Eviction +* Max connections +* No connection Inputs ------ @@ -36,7 +38,15 @@ Inputs | eviction_silenced | Groups to mute for Elasticache eviction monitor | map | `` | no | | eviction_timeframe | Monitor timeframe for Elasticache eviction [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | filter_tags | Tags used for filtering | string | - | yes | +| max_connection_aggregator | Monitor aggregator for Elasticache max connection [available values: min, max, sum or avg] | string | `min` | no | +| max_connection_message | Custom message for Elasticache max connection monitor | string | `` | no | +| max_connection_silenced | Groups to mute for Elasticache max connection monitor | map | `` | no | +| max_connection_timeframe | Monitor timeframe for Elasticache max connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | message | Message sent when an alert is triggered | string | - | yes | +| no_connection_aggregator | Monitor aggregator for Elasticache no connection [available values: min, max, sum or avg] | string | `min` | no | +| no_connection_message | Custom message for Elasticache no connection monitor | string | `` | no | +| no_connection_silenced | Groups to mute for Elasticache no connection monitor | map | `` | no | +| no_connection_timeframe | Monitor timeframe for Elasticache no connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | resource | Type of Elasticache used | string | - | yes | Related documentation diff --git a/cloud/aws/elasticache/common/inputs.tf b/cloud/aws/elasticache/common/inputs.tf index 5106e23..2e4fe15 100644 --- a/cloud/aws/elasticache/common/inputs.tf +++ b/cloud/aws/elasticache/common/inputs.tf @@ -45,3 +45,49 @@ variable "eviction_timeframe" { description = "Monitor timeframe for Elasticache eviction [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" default = "last_5m" } + +variable "max_connection_silenced" { + description = "Groups to mute for Elasticache max connection monitor" + type = "map" + default = {} +} + +variable "max_connection_message" { + description = "Custom message for Elasticache max connection monitor" + type = "string" + default = "" +} + +variable "max_connection_aggregator" { + description = "Monitor aggregator for Elasticache max connection [available values: min, max, sum or avg]" + type = "string" + default = "min" +} + +variable "max_connection_timeframe" { + description = "Monitor timeframe for Elasticache max connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_5m" +} + +variable "no_connection_silenced" { + description = "Groups to mute for Elasticache no connection monitor" + type = "map" + default = {} +} + +variable "no_connection_message" { + description = "Custom message for Elasticache no connection monitor" + type = "string" + default = "" +} + +variable "no_connection_aggregator" { + description = "Monitor aggregator for Elasticache no connection [available values: min, max, sum or avg]" + type = "string" + default = "min" +} + +variable "no_connection_timeframe" { + description = "Monitor timeframe for Elasticache no connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_5m" +} diff --git a/cloud/aws/elasticache/common/monitors-elasticache.tf b/cloud/aws/elasticache/common/monitors-elasticache.tf index da13264..98adc63 100644 --- a/cloud/aws/elasticache/common/monitors-elasticache.tf +++ b/cloud/aws/elasticache/common/monitors-elasticache.tf @@ -24,3 +24,57 @@ resource "datadog_monitor" "elasticache_eviction" { tags = ["env:${var.environment}", "resource:${var.resource}", "team:aws", "provider:aws"] } + +resource "datadog_monitor" "elasticache_max_connection" { + name = "[${var.environment}] Elasticache ${var.resource} connections {{#is_alert}}{{{comparator}}} {{threshold}} {{/is_alert}}" + message = "${coalesce(var.max_connection_message, var.message)}" + + type = "metric alert" + + query = <= 65000 + EOF + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + new_host_delay = "${var.delay}" + + silenced = "${var.max_connection_silenced}" + + tags = ["env:${var.environment}", "resource:${var.resource}", "team:aws", "provider:aws"] +} + +resource "datadog_monitor" "elasticache_no_connection" { + name = "[${var.environment}] Elasticache ${var.resource} connections {{#is_alert}}{{{comparator}}} {{threshold}} {{/is_alert}}" + message = "${coalesce(var.no_connection_message, var.message)}" + + type = "metric alert" + + query = <` | no | +| commands_timeframe | Monitor timeframe for Elasticache redis commands [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | cpu_high_aggregator | Monitor aggregator for Elasticache redis cpu high [available values: min, max, sum or avg] | string | `min` | no | | cpu_high_message | Custom message for Elasticache redis cpu high monitor | string | `` | no | | cpu_high_silenced | Groups to mute for Elasticache redis cpu high monitor | map | `` | no | @@ -46,17 +52,23 @@ Inputs | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | | redis_size | Size of the Elasticache redis instance | string | - | yes | +| replication_lag_aggregator | Monitor aggregator for Elasticache redis replication lag [available values: min, max, sum or avg] | string | `min` | no | +| replication_lag_message | Custom message for Elasticache redis replication lag monitor | string | `` | no | +| replication_lag_silenced | Groups to mute for Elasticache redis replication lag monitor | map | `` | no | +| replication_lag_threshold_critical | Elasticache redis replication lag critical threshold in seconds | string | `1` | no | +| replication_lag_threshold_warning | Elasticache redis replication lag warning threshold in seconds | string | `0` | no | +| replication_lag_timeframe | Monitor timeframe for Elasticache redis replication lag [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | swap_aggregator | Monitor aggregator for Elasticache redis swap [available values: min, max, sum or avg] | string | `min` | no | | swap_message | Custom message for Elasticache redis swap monitor | string | `` | no | | swap_silenced | Groups to mute for Elasticache redis swap monitor | map | `` | no | -| swap_timeframe | Monitor timeframe for Elasticache redis swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| swap_timeframe | Monitor timeframe for Elasticache redis swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | Related documentation --------------------- -DataDog documentation: [https://docs.datadoghq.com/integrations/amazon_elasticache/](https://docs.datadoghq.com/integrations/amazon_elasticache/) -And more here: +DataDog documentation: +* [https://docs.datadoghq.com/integrations/amazon_elasticache/](https://docs.datadoghq.com/integrations/amazon_elasticache/) * [https://www.datadoghq.com/dashboards/elasticache-dashboard-redis/](https://www.datadoghq.com/dashboards/elasticache-dashboard-redis/) * [https://www.datadoghq.com/blog/monitoring-elasticache-performance-metrics-with-redis-or-memcached/](https://www.datadoghq.com/blog/monitoring-elasticache-performance-metrics-with-redis-or-memcached/) diff --git a/cloud/aws/elasticache/redis/inputs.tf b/cloud/aws/elasticache/redis/inputs.tf index 39a03da..0b063e4 100644 --- a/cloud/aws/elasticache/redis/inputs.tf +++ b/cloud/aws/elasticache/redis/inputs.tf @@ -116,5 +116,61 @@ variable "swap_aggregator" { variable "swap_timeframe" { description = "Monitor timeframe for Elasticache redis swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" - default = "last_15m" + default = "last_5m" +} + +variable "replication_lag_silenced" { + description = "Groups to mute for Elasticache redis replication lag monitor" + type = "map" + default = {} +} + +variable "replication_lag_message" { + description = "Custom message for Elasticache redis replication lag monitor" + type = "string" + default = "" +} + +variable "replication_lag_aggregator" { + description = "Monitor aggregator for Elasticache redis replication lag [available values: min, max, sum or avg]" + type = "string" + default = "min" +} + +variable "replication_lag_timeframe" { + description = "Monitor timeframe for Elasticache redis replication lag [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_5m" +} + +variable "replication_lag_threshold_warning" { + description = "Elasticache redis replication lag warning threshold in seconds" + default = 0 +} + +variable "replication_lag_threshold_critical" { + description = "Elasticache redis replication lag critical threshold in seconds" + default = 1 +} + +variable "commands_silenced" { + description = "Groups to mute for Elasticache redis commands monitor" + type = "map" + default = {} +} + +variable "commands_message" { + description = "Custom message for Elasticache redis commands monitor" + type = "string" + default = "" +} + +variable "commands_aggregator" { + description = "Monitor aggregator for Elasticache redis commands [available values: min, max, sum or avg]" + type = "string" + default = "min" +} + +variable "commands_timeframe" { + description = "Monitor timeframe for Elasticache redis commands [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_5m" } diff --git a/cloud/aws/elasticache/redis/monitors-redis.tf b/cloud/aws/elasticache/redis/monitors-redis.tf index 9a7a47f..f951958 100644 --- a/cloud/aws/elasticache/redis/monitors-redis.tf +++ b/cloud/aws/elasticache/redis/monitors-redis.tf @@ -107,3 +107,63 @@ resource "datadog_monitor" "redis_swap" { tags = ["env:${var.environment}", "resource:redis", "team:aws", "provider:aws"] } + +resource "datadog_monitor" "redis_replication_lag" { + name = "[${var.environment}] Elasticache redis replication lag {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}" + message = "${coalesce(var.replication_lag_message, var.message)}" + + type = "metric alert" + + query = < ${var.replication_lag_threshold_critical} + EOF + + thresholds { + warning = "${var.replication_lag_threshold_warning}" + critical = "${var.replication_lag_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + new_host_delay = "${var.delay}" + + silenced = "${var.replication_lag_silenced}" + + tags = ["env:${var.environment}", "resource:redis", "team:aws", "provider:aws"] +} + +resource "datadog_monitor" "redis_commands" { + name = "[${var.environment}] Elasticache redis is receiving no commands" + message = "${coalesce(var.commands_message, var.message)}" + + type = "metric alert" + + query = <