MON-32 - Free Memory monitors updated to evaluate the metric over a period
This commit is contained in:
parent
983bebcd5f
commit
d126e898d3
@ -26,6 +26,7 @@ Creates DataDog monitors with the following checks:
|
||||
* Swap
|
||||
* Max connections
|
||||
* No connection
|
||||
* Free Memory
|
||||
|
||||
Inputs
|
||||
------
|
||||
@ -39,6 +40,12 @@ Inputs
|
||||
| eviction_time_aggregator | Monitor aggregator for Elasticache eviction [available values: min, max or avg] | string | `min` | no |
|
||||
| eviction_timeframe | Monitor timeframe for Elasticache eviction [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||
| filter_tags | Tags used for filtering | string | - | yes |
|
||||
| free_memory_condition_timeframe | Monitor condition timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
||||
| free_memory_message | Custom message for Elasticache free memory monitor | string | `` | no |
|
||||
| free_memory_silenced | Groups to mute for Elasticache free memory monitor | map | `<map>` | no |
|
||||
| free_memory_threshold_critical | Elasticache free memory critical threshold in percentage | string | `-70` | no |
|
||||
| free_memory_threshold_warning | Elasticache free memory warning threshold in percentage | string | `-50` | no |
|
||||
| free_memory_timeframe | Monitor timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
||||
| max_connection_message | Custom message for Elasticache max connection monitor | string | `` | no |
|
||||
| max_connection_silenced | Groups to mute for Elasticache max connection monitor | map | `<map>` | no |
|
||||
| max_connection_time_aggregator | Monitor aggregator for Elasticache max connection [available values: min, max or avg] | string | `max` | no |
|
||||
@ -49,12 +56,12 @@ Inputs
|
||||
| no_connection_time_aggregator | Monitor aggregator for Elasticache no connection [available values: min, max or avg] | string | `min` | no |
|
||||
| no_connection_timeframe | Monitor timeframe for Elasticache no connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||
| resource | Type of Elasticache used | string | - | yes |
|
||||
| swap_message | Custom message for Elasticache memcached swap monitor | string | `` | no |
|
||||
| swap_silenced | Groups to mute for Elasticache memcached swap monitor | map | `<map>` | no |
|
||||
| swap_threshold_critical | Elasticache memcached swap critical threshold in percentage | string | `50` | no |
|
||||
| swap_threshold_warning | Elasticache memcached swap warning threshold in percentage | string | `0` | no |
|
||||
| swap_message | Custom message for Elasticache swap monitor | string | `` | no |
|
||||
| swap_silenced | Groups to mute for Elasticache swap monitor | map | `<map>` | no |
|
||||
| swap_threshold_critical | Elasticache swap critical threshold in percentage | string | `50000000` | no |
|
||||
| swap_threshold_warning | Elasticache swap warning threshold in percentage | string | `0` | no |
|
||||
| swap_time_aggregator | Monitor aggregator for Elasticache memcached swap [available values: min, max or avg] | string | `min` | no |
|
||||
| swap_timeframe | Monitor timeframe for Elasticache memcached swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||
| swap_timeframe | Monitor timeframe for Elasticache swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||
|
||||
Related documentation
|
||||
---------------------
|
||||
|
||||
@ -93,13 +93,13 @@ variable "no_connection_timeframe" {
|
||||
}
|
||||
|
||||
variable "swap_silenced" {
|
||||
description = "Groups to mute for Elasticache memcached swap monitor"
|
||||
description = "Groups to mute for Elasticache swap monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "swap_message" {
|
||||
description = "Custom message for Elasticache memcached swap monitor"
|
||||
description = "Custom message for Elasticache swap monitor"
|
||||
type = "string"
|
||||
default = ""
|
||||
}
|
||||
@ -111,16 +111,48 @@ variable "swap_time_aggregator" {
|
||||
}
|
||||
|
||||
variable "swap_timeframe" {
|
||||
description = "Monitor timeframe for Elasticache memcached swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
description = "Monitor timeframe for Elasticache swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
default = "last_5m"
|
||||
}
|
||||
|
||||
variable "swap_threshold_warning" {
|
||||
description = "Elasticache memcached swap warning threshold in percentage"
|
||||
description = "Elasticache swap warning threshold in percentage"
|
||||
default = 0
|
||||
}
|
||||
|
||||
variable "swap_threshold_critical" {
|
||||
description = "Elasticache memcached swap critical threshold in percentage"
|
||||
default = 50
|
||||
description = "Elasticache swap critical threshold in percentage"
|
||||
default = 50000000
|
||||
}
|
||||
|
||||
# Map of groups to mute for the free memory monitor. It is handed unchanged to
# the monitor's `silenced` argument; the empty default mutes nothing.
variable "free_memory_silenced" {
|
||||
description = "Groups to mute for Elasticache free memory monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
# Optional notification text specific to the free memory monitor. The monitor
# builds its message with coalesce(var.free_memory_message, var.message), so the
# empty default makes it fall back to the module-wide `message` input.
variable "free_memory_message" {
|
||||
description = "Custom message for Elasticache free memory monitor"
|
||||
type = "string"
|
||||
default = ""
|
||||
}
|
||||
|
||||
# Second argument of the pct_change(...) call in the free memory monitor query.
# NOTE(review): in Datadog change-alert syntax this position looks like the time
# shift (how far back the comparison point lies), whose allowed values may differ
# from the list in the description below - confirm against the Datadog docs.
variable "free_memory_condition_timeframe" {
|
||||
description = "Monitor condition timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
default = "last_15m"
|
||||
}
|
||||
|
||||
# Window given to avg(...) inside the pct_change(...) call of the free memory
# monitor query, i.e. the period over which freeable memory is averaged.
variable "free_memory_timeframe" {
|
||||
description = "Monitor timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
default = "last_15m"
|
||||
}
|
||||
|
||||
# Warning level for the free memory monitor; used as the `warning` entry of the
# monitor's thresholds block. The monitor evaluates a pct_change(...) with a `<`
# comparison, so the negative default presumably means "freeable memory dropped
# by 50%" - confirm the sign convention with the Datadog docs.
variable "free_memory_threshold_warning" {
|
||||
description = "Elasticache free memory warning threshold in percentage"
|
||||
default = -50
|
||||
}
|
||||
|
||||
# Critical level for the free memory monitor. It is used twice: as the right-hand
# side of the `<` comparison in the monitor query and as the `critical` entry of
# the thresholds block. The query is a pct_change(...), so the negative default
# presumably means "freeable memory dropped by 70%" - confirm the sign convention.
variable "free_memory_threshold_critical" {
|
||||
description = "Elasticache free memory critical threshold in percentage"
|
||||
default = -70
|
||||
}
|
||||
|
||||
@ -110,3 +110,35 @@ resource "datadog_monitor" "elasticache_swap" {
|
||||
|
||||
tags = ["env:${var.environment}", "engine:${var.resource}", "team:aws", "provider:aws"]
|
||||
}
|
||||
|
||||
# Datadog metric alert on the percentage change of
# aws.elasticache.freeable_memory, evaluated per region / cacheclusterid /
# cachenodeid for the series selected by var.filter_tags.
# NOTE(review): the resource label says "redis", but the monitor name and tags
# use var.resource rather than a fixed engine - confirm the label is intended.
resource "datadog_monitor" "redis_free_memory" {
|
||||
name = "[${var.environment}] Elasticache ${var.resource} free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
# Monitor-specific message when set, otherwise the module-wide message.
message = "${coalesce(var.free_memory_message, var.message)}"
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
# pct_change(avg(<free_memory_timeframe>),<free_memory_condition_timeframe>) of
# freeable memory, compared with `<` against the critical threshold. The value
# in the query is the same variable as thresholds.critical below.
query = <<EOF
|
||||
pct_change(avg(${var.free_memory_timeframe}),${var.free_memory_condition_timeframe}):
|
||||
avg:aws.elasticache.freeable_memory{${var.filter_tags}} by {region,cacheclusterid,cachenodeid}
|
||||
< ${var.free_memory_threshold_critical}
|
||||
EOF
|
||||
|
||||
thresholds {
|
||||
warning = "${var.free_memory_threshold_warning}"
|
||||
critical = "${var.free_memory_threshold_critical}"
|
||||
}
|
||||
|
||||
# This monitor notifies when the metric stops reporting.
notify_no_data = true
|
||||
# var.delay drives both the evaluation delay and the new-host delay.
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
|
||||
silenced = "${var.free_memory_silenced}"
|
||||
|
||||
tags = ["env:${var.environment}", "engine:${var.resource}", "team:aws", "provider:aws"]
|
||||
}
|
||||
|
||||
@ -21,7 +21,6 @@ Creates DataDog monitors with the following checks :
|
||||
|
||||
* Get Hit
|
||||
* CPU High
|
||||
* Free memory
|
||||
|
||||
Inputs
|
||||
------
|
||||
@ -39,12 +38,6 @@ Inputs
|
||||
| environment | Infrastructure Environment | string | - | yes |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| free_memory_message | Custom message for Elasticache memcached free memory monitor | string | `` | no |
|
||||
| free_memory_silenced | Groups to mute for Elasticache memcached free memory monitor | map | `<map>` | no |
|
||||
| free_memory_threshold_critical | Elasticache memcached free memory critical threshold in percentage | string | `5` | no |
|
||||
| free_memory_threshold_warning | Elasticache memcached free memory warning threshold in percentage | string | `10` | no |
|
||||
| free_memory_time_aggregator | Monitor aggregator for Elasticache memcached free memory [available values: min, max or avg] | string | `min` | no |
|
||||
| free_memory_timeframe | Monitor timeframe for Elasticache memcached free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
||||
| get_hits_message | Custom message for Elasticache memcached get hits monitor | string | `` | no |
|
||||
| get_hits_silenced | Groups to mute for Elasticache memcached get hits monitor | map | `<map>` | no |
|
||||
| get_hits_threshold_critical | Elasticache memcached get hits critical threshold in percentage | string | `10` | no |
|
||||
|
||||
@ -90,36 +90,3 @@ variable "cpu_high_threshold_critical" {
|
||||
description = "Elasticache memcached cpu high critical threshold in percentage"
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "free_memory_silenced" {
|
||||
description = "Groups to mute for Elasticache memcached free memory monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "free_memory_message" {
|
||||
description = "Custom message for Elasticache memcached free memory monitor"
|
||||
type = "string"
|
||||
default = ""
|
||||
}
|
||||
|
||||
variable "free_memory_time_aggregator" {
|
||||
description = "Monitor aggregator for Elasticache memcached free memory [available values: min, max or avg]"
|
||||
type = "string"
|
||||
default = "min"
|
||||
}
|
||||
|
||||
variable "free_memory_timeframe" {
|
||||
description = "Monitor timeframe for Elasticache memcached free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
default = "last_15m"
|
||||
}
|
||||
|
||||
variable "free_memory_threshold_warning" {
|
||||
description = "Elasticache memcached free memory warning threshold in percentage"
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "free_memory_threshold_critical" {
|
||||
description = "Elasticache memcached free memory critical threshold in percentage"
|
||||
default = 5
|
||||
}
|
||||
|
||||
@ -1,27 +0,0 @@
|
||||
locals {
|
||||
memory = {
|
||||
cache.t2.micro = 595926712
|
||||
cache.t2.small = 1664299827
|
||||
cache.t2.medium = 3457448673
|
||||
cache.m3.medium = 2985002270
|
||||
cache.m3.large = 6496138035
|
||||
cache.m3.xlarge = 14280766259
|
||||
cache.m3.2xlarge = 29957396889
|
||||
cache.m4.large = 6893422510
|
||||
cache.m4.xlarge = 15333033246
|
||||
cache.m4.2xlarge = 31890132172
|
||||
cache.m4.4xlarge = 65262028062
|
||||
cache.m4.10xlarge = 166043435663
|
||||
cache.r3.large = 14495514624
|
||||
cache.r3.xlarge = 30494267801
|
||||
cache.r3.2xlarge = 62491774156
|
||||
cache.r3.4xlarge = 126701535232
|
||||
cache.r3.8xlarge = 254476812288
|
||||
cache.r4.large = 13207024435
|
||||
cache.r4.xlarge = 26897232691
|
||||
cache.r4.2xlarge = 54191749857
|
||||
cache.r4.4xlarge = 108855946117
|
||||
cache.r4.8xlarge = 218248763146
|
||||
cache.r4.16xlarge = 437012922368
|
||||
}
|
||||
}
|
||||
@ -80,38 +80,3 @@ resource "datadog_monitor" "memcached_cpu_high" {
|
||||
|
||||
tags = ["env:${var.environment}", "engine:memcached", "team:aws", "provider:aws"]
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "memcached_free_memory" {
|
||||
name = "[${var.environment}] Elasticache memcached free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.free_memory_message, var.message)}"
|
||||
|
||||
count = "${length(keys(local.memory))}"
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
query = <<EOF
|
||||
${var.free_memory_time_aggregator}(${var.free_memory_timeframe}): (
|
||||
( avg:aws.elasticache.freeable_memory{dd_monitoring:enabled,dd_aws_elasticache_memcached:enabled,env:${var.environment},cache_node_type:${element(keys(local.memory), count.index)}} by {region,cacheclusterid,cachenodeid} * 100 ) /
|
||||
${element(values(local.memory), count.index)}
|
||||
) < ${var.free_memory_threshold_critical}
|
||||
EOF
|
||||
|
||||
thresholds {
|
||||
warning = "${var.free_memory_threshold_warning}"
|
||||
critical = "${var.free_memory_threshold_critical}"
|
||||
}
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
|
||||
silenced = "${var.free_memory_silenced}"
|
||||
|
||||
tags = ["env:${var.environment}", "engine:memcached", "team:aws", "provider:aws"]
|
||||
}
|
||||
|
||||
@ -33,7 +33,6 @@ Creates DataDog monitors with the following checks:
|
||||
* CPU high
|
||||
* Commands received
|
||||
* Replication lag
|
||||
* Free memory
|
||||
|
||||
Inputs
|
||||
------
|
||||
@ -59,12 +58,6 @@ Inputs
|
||||
| environment | Infrastructure Environment | string | - | yes |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| free_memory_message | Custom message for Elasticache redis free memory monitor | string | `` | no |
|
||||
| free_memory_silenced | Groups to mute for Elasticache redis free memory monitor | map | `<map>` | no |
|
||||
| free_memory_threshold_critical | Elasticache redis free memory critical threshold in percentage | string | `5` | no |
|
||||
| free_memory_threshold_warning | Elasticache redis free memory warning threshold in percentage | string | `10` | no |
|
||||
| free_memory_time_aggregator | Monitor aggregator for Elasticache redis free memory [available values: min, max or avg] | string | `min` | no |
|
||||
| free_memory_timeframe | Monitor timeframe for Elasticache redis free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
| nodes | Number of Elasticache nodes | string | - | yes |
|
||||
| replication_lag_message | Custom message for Elasticache redis replication lag monitor | string | `` | no |
|
||||
|
||||
@ -145,36 +145,3 @@ variable "commands_timeframe" {
|
||||
description = "Monitor timeframe for Elasticache redis commands [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
default = "last_5m"
|
||||
}
|
||||
|
||||
variable "free_memory_silenced" {
|
||||
description = "Groups to mute for Elasticache redis free memory monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "free_memory_message" {
|
||||
description = "Custom message for Elasticache redis free memory monitor"
|
||||
type = "string"
|
||||
default = ""
|
||||
}
|
||||
|
||||
variable "free_memory_time_aggregator" {
|
||||
description = "Monitor aggregator for Elasticache redis free memory [available values: min, max or avg]"
|
||||
type = "string"
|
||||
default = "min"
|
||||
}
|
||||
|
||||
variable "free_memory_timeframe" {
|
||||
description = "Monitor timeframe for Elasticache redis free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
default = "last_15m"
|
||||
}
|
||||
|
||||
variable "free_memory_threshold_warning" {
|
||||
description = "Elasticache redis free memory warning threshold in percentage"
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "free_memory_threshold_critical" {
|
||||
description = "Elasticache redis free memory critical threshold in percentage"
|
||||
default = 5
|
||||
}
|
||||
|
||||
@ -24,30 +24,4 @@ locals {
|
||||
cache.r4.8xlarge = 32
|
||||
cache.r4.16xlarge = 64
|
||||
}
|
||||
|
||||
memory = {
|
||||
cache.t2.micro = 595926712
|
||||
cache.t2.small = 1664299827
|
||||
cache.t2.medium = 3457448673
|
||||
cache.m3.medium = 2985002270
|
||||
cache.m3.large = 6496138035
|
||||
cache.m3.xlarge = 14280766259
|
||||
cache.m3.2xlarge = 29957396889
|
||||
cache.m4.large = 6893422510
|
||||
cache.m4.xlarge = 15333033246
|
||||
cache.m4.2xlarge = 31890132172
|
||||
cache.m4.4xlarge = 65262028062
|
||||
cache.m4.10xlarge = 166043435663
|
||||
cache.r3.large = 14495514624
|
||||
cache.r3.xlarge = 30494267801
|
||||
cache.r3.2xlarge = 62491774156
|
||||
cache.r3.4xlarge = 126701535232
|
||||
cache.r3.8xlarge = 254476812288
|
||||
cache.r4.large = 13207024435
|
||||
cache.r4.xlarge = 26897232691
|
||||
cache.r4.2xlarge = 54191749857
|
||||
cache.r4.4xlarge = 108855946117
|
||||
cache.r4.8xlarge = 218248763146
|
||||
cache.r4.16xlarge = 437012922368
|
||||
}
|
||||
}
|
||||
|
||||
@ -137,38 +137,3 @@ resource "datadog_monitor" "redis_commands" {
|
||||
|
||||
tags = ["env:${var.environment}", "engine:redis", "team:aws", "provider:aws"]
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "redis_free_memory" {
|
||||
name = "[${var.environment}] Elasticache redis free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.free_memory_message, var.message)}"
|
||||
|
||||
count = "${length(keys(local.memory))}"
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
query = <<EOF
|
||||
${var.free_memory_time_aggregator}(${var.free_memory_timeframe}): (
|
||||
( avg:aws.elasticache.freeable_memory{dd_monitoring:enabled,dd_aws_elasticache_redis:enabled,env:${var.environment},cache_node_type:${element(keys(local.memory), count.index)}} by {region,cacheclusterid,cachenodeid} * 100 ) /
|
||||
( ${element(values(local.memory), count.index)} / ${var.nodes} )
|
||||
) < ${var.free_memory_threshold_critical}
|
||||
EOF
|
||||
|
||||
thresholds {
|
||||
warning = "${var.free_memory_threshold_warning}"
|
||||
critical = "${var.free_memory_threshold_critical}"
|
||||
}
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
|
||||
silenced = "${var.free_memory_silenced}"
|
||||
|
||||
tags = ["env:${var.environment}", "engine:redis", "team:aws", "provider:aws"]
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user