MON-32 - Free Memory monitors updated to evaluate the metric over a period

This commit is contained in:
Alexandre Gaillet 2018-06-07 11:57:36 +02:00 committed by Quentin Manfroi
parent 983bebcd5f
commit d126e898d3
11 changed files with 82 additions and 214 deletions

View File

@ -26,6 +26,7 @@ Creates DataDog monitors with the following checks:
* Swap
* Max connections
* No connection
* Free Memory
Inputs
------
@ -39,6 +40,12 @@ Inputs
| eviction_time_aggregator | Monitor aggregator for Elasticache eviction [available values: min, max or avg] | string | `min` | no |
| eviction_timeframe | Monitor timeframe for Elasticache eviction [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| filter_tags | Tags used for filtering | string | - | yes |
| free_memory_condition_timeframe | Monitor condition timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| free_memory_message | Custom message for Elasticache free memory monitor | string | `` | no |
| free_memory_silenced | Groups to mute for Elasticache free memory monitor | map | `<map>` | no |
| free_memory_threshold_critical | Elasticache free memory critical threshold in percentage | string | `-70` | no |
| free_memory_threshold_warning | Elasticache free memory warning threshold in percentage | string | `-50` | no |
| free_memory_timeframe | Monitor timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| max_connection_message | Custom message for Elasticache max connection monitor | string | `` | no |
| max_connection_silenced | Groups to mute for Elasticache max connection monitor | map | `<map>` | no |
| max_connection_time_aggregator | Monitor aggregator for Elasticache max connection [available values: min, max or avg] | string | `max` | no |
@ -49,12 +56,12 @@ Inputs
| no_connection_time_aggregator | Monitor aggregator for Elasticache no connection [available values: min, max or avg] | string | `min` | no |
| no_connection_timeframe | Monitor timeframe for Elasticache no connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| resource | Type of Elasticache used | string | - | yes |
| swap_message | Custom message for Elasticache memcached swap monitor | string | `` | no |
| swap_silenced | Groups to mute for Elasticache memcached swap monitor | map | `<map>` | no |
| swap_threshold_critical | Elasticache memcached swap critical threshold in percentage | string | `50` | no |
| swap_threshold_warning | Elasticache memcached swap warning threshold in percentage | string | `0` | no |
| swap_message | Custom message for Elasticache swap monitor | string | `` | no |
| swap_silenced | Groups to mute for Elasticache swap monitor | map | `<map>` | no |
| swap_threshold_critical | Elasticache swap critical threshold in percentage | string | `50000000` | no |
| swap_threshold_warning | Elasticache swap warning threshold in percentage | string | `0` | no |
| swap_time_aggregator | Monitor aggregator for Elasticache memcached swap [available values: min, max or avg] | string | `min` | no |
| swap_timeframe | Monitor timeframe for Elasticache memcached swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| swap_timeframe | Monitor timeframe for Elasticache swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
Related documentation
---------------------

View File

@ -93,13 +93,13 @@ variable "no_connection_timeframe" {
}
# Input: map of groups to mute on the Elasticache swap monitor; defaults to an empty map.
# NOTE(review): two description lines are shown, presumably the old and new wording of this diff hunk.
variable "swap_silenced" {
description = "Groups to mute for Elasticache memcached swap monitor"
description = "Groups to mute for Elasticache swap monitor"
type = "map"
default = {}
}
# Input: custom notification message for the Elasticache swap monitor; defaults to an empty string.
# NOTE(review): two description lines are shown, presumably the old and new wording of this diff hunk.
variable "swap_message" {
description = "Custom message for Elasticache memcached swap monitor"
description = "Custom message for Elasticache swap monitor"
type = "string"
default = ""
}
@ -111,16 +111,48 @@ variable "swap_time_aggregator" {
}
# Input: evaluation timeframe for the Elasticache swap monitor; defaults to "last_5m".
# NOTE(review): two description lines are shown, presumably the old and new wording of this diff hunk.
variable "swap_timeframe" {
description = "Monitor timeframe for Elasticache memcached swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
description = "Monitor timeframe for Elasticache swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
default = "last_5m"
}
# Input: warning threshold for the Elasticache swap monitor; defaults to 0.
# NOTE(review): two description lines are shown, presumably the old and new wording of this diff hunk.
variable "swap_threshold_warning" {
description = "Elasticache memcached swap warning threshold in percentage"
description = "Elasticache swap warning threshold in percentage"
default = 0
}
# Input: critical threshold for the Elasticache swap monitor.
# NOTE(review): both the old (50) and new (50000000) defaults are shown, presumably as diff context.
# NOTE(review): the description still says "in percentage" while the new default is 50000000;
# presumably the threshold is now an absolute value — confirm against the swap monitor query and fix the wording.
variable "swap_threshold_critical" {
description = "Elasticache memcached swap critical threshold in percentage"
default = 50
description = "Elasticache swap critical threshold in percentage"
default = 50000000
}
# Input: map of groups to mute on the Elasticache free memory monitor; defaults to an empty map.
variable "free_memory_silenced" {
description = "Groups to mute for Elasticache free memory monitor"
type = "map"
default = {}
}
# Input: custom notification message for the Elasticache free memory monitor.
# Defaults to an empty string.
variable "free_memory_message" {
description = "Custom message for Elasticache free memory monitor"
type = "string"
default = ""
}
# Input: condition timeframe for the Elasticache free memory monitor; defaults to "last_15m".
# No explicit type is declared for this variable.
variable "free_memory_condition_timeframe" {
description = "Monitor condition timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
default = "last_15m"
}
# Input: evaluation timeframe for the Elasticache free memory monitor; defaults to "last_15m".
# No explicit type is declared for this variable.
variable "free_memory_timeframe" {
description = "Monitor timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
default = "last_15m"
}
# Input: warning threshold for the Elasticache free memory monitor; defaults to -50.
# NOTE(review): the negative default presumably expresses a percentage drop — confirm against the monitor query.
variable "free_memory_threshold_warning" {
description = "Elasticache free memory warning threshold in percentage"
default = -50
}
# Input: critical threshold for the Elasticache free memory monitor; defaults to -70.
# NOTE(review): the negative default presumably expresses a percentage drop — confirm against the monitor query.
variable "free_memory_threshold_critical" {
description = "Elasticache free memory critical threshold in percentage"
default = -70
}

View File

@ -110,3 +110,35 @@ resource "datadog_monitor" "elasticache_swap" {
tags = ["env:${var.environment}", "engine:${var.resource}", "team:aws", "provider:aws"]
}
# Datadog metric alert on the percentage change of Elasticache freeable memory.
# The query applies pct_change to avg aws.elasticache.freeable_memory, grouped by
# region, cacheclusterid and cachenodeid, and alerts when the result is below the critical threshold.
# NOTE(review): the resource label says "redis" although the monitor name uses var.resource and the
# neighbouring resource is labelled "elasticache_swap" — consider an engine-neutral label.
resource "datadog_monitor" "redis_free_memory" {
name = "[${var.environment}] Elasticache ${var.resource} free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
# Uses the monitor-specific message when set, otherwise the module-wide message.
message = "${coalesce(var.free_memory_message, var.message)}"
type = "metric alert"
query = <<EOF
pct_change(avg(${var.free_memory_timeframe}),${var.free_memory_condition_timeframe}):
avg:aws.elasticache.freeable_memory{${var.filter_tags}} by {region,cacheclusterid,cachenodeid}
< ${var.free_memory_threshold_critical}
EOF
# The critical value here is the same variable that is interpolated into the query above.
thresholds {
warning = "${var.free_memory_threshold_warning}"
critical = "${var.free_memory_threshold_critical}"
}
# This monitor also notifies when the metric reports no data.
notify_no_data = true
evaluation_delay = "${var.delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = false
new_host_delay = "${var.delay}"
silenced = "${var.free_memory_silenced}"
tags = ["env:${var.environment}", "engine:${var.resource}", "team:aws", "provider:aws"]
}

View File

@ -21,7 +21,6 @@ Creates DataDog monitors with the following checks :
* Get Hit
* CPU High
* Free memory
Inputs
------
@ -39,12 +38,6 @@ Inputs
| environment | Infrastructure Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| free_memory_message | Custom message for Elasticache memcached free memory monitor | string | `` | no |
| free_memory_silenced | Groups to mute for Elasticache memcached free memory monitor | map | `<map>` | no |
| free_memory_threshold_critical | Elasticache memcached free memory critical threshold in percentage | string | `5` | no |
| free_memory_threshold_warning | Elasticache memcached free memory warning threshold in percentage | string | `10` | no |
| free_memory_time_aggregator | Monitor aggregator for Elasticache memcached free memory [available values: min, max or avg] | string | `min` | no |
| free_memory_timeframe | Monitor timeframe for Elasticache memcached free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| get_hits_message | Custom message for Elasticache memcached get hits monitor | string | `` | no |
| get_hits_silenced | Groups to mute for Elasticache memcached get hits monitor | map | `<map>` | no |
| get_hits_threshold_critical | Elasticache memcached get hits critical threshold in percentage | string | `10` | no |

View File

@ -90,36 +90,3 @@ variable "cpu_high_threshold_critical" {
description = "Elasticache memcached cpu high critical threshold in percentage"
default = 90
}
# Input: map of groups to mute on the memcached free memory monitor; defaults to an empty map.
variable "free_memory_silenced" {
description = "Groups to mute for Elasticache memcached free memory monitor"
type = "map"
default = {}
}
# Input: custom notification message for the memcached free memory monitor; defaults to an empty string.
variable "free_memory_message" {
description = "Custom message for Elasticache memcached free memory monitor"
type = "string"
default = ""
}
# Input: time aggregator for the memcached free memory monitor; defaults to "min".
variable "free_memory_time_aggregator" {
description = "Monitor aggregator for Elasticache memcached free memory [available values: min, max or avg]"
type = "string"
default = "min"
}
# Input: evaluation timeframe for the memcached free memory monitor; defaults to "last_15m".
variable "free_memory_timeframe" {
description = "Monitor timeframe for Elasticache memcached free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
default = "last_15m"
}
# Input: warning threshold (percentage) for the memcached free memory monitor; defaults to 10.
variable "free_memory_threshold_warning" {
description = "Elasticache memcached free memory warning threshold in percentage"
default = 10
}
# Input: critical threshold (percentage) for the memcached free memory monitor; defaults to 5.
variable "free_memory_threshold_critical" {
description = "Elasticache memcached free memory critical threshold in percentage"
default = 5
}

View File

@ -1,27 +0,0 @@
# Lookup table mapping each Elasticache node type to a memory figure.
# The memcached free memory monitor creates one monitor per key and divides by the matching value.
# NOTE(review): values are presumably total node memory in bytes — confirm against AWS node specifications.
locals {
memory = {
cache.t2.micro = 595926712
cache.t2.small = 1664299827
cache.t2.medium = 3457448673
cache.m3.medium = 2985002270
cache.m3.large = 6496138035
cache.m3.xlarge = 14280766259
cache.m3.2xlarge = 29957396889
cache.m4.large = 6893422510
cache.m4.xlarge = 15333033246
cache.m4.2xlarge = 31890132172
cache.m4.4xlarge = 65262028062
cache.m4.10xlarge = 166043435663
cache.r3.large = 14495514624
cache.r3.xlarge = 30494267801
cache.r3.2xlarge = 62491774156
cache.r3.4xlarge = 126701535232
cache.r3.8xlarge = 254476812288
cache.r4.large = 13207024435
cache.r4.xlarge = 26897232691
cache.r4.2xlarge = 54191749857
cache.r4.4xlarge = 108855946117
cache.r4.8xlarge = 218248763146
cache.r4.16xlarge = 437012922368
}
}

View File

@ -80,38 +80,3 @@ resource "datadog_monitor" "memcached_cpu_high" {
tags = ["env:${var.environment}", "engine:memcached", "team:aws", "provider:aws"]
}
# Datadog metric alert on memcached free memory, expressed as a percentage of the node type's memory.
# One monitor is created per entry of local.memory (count); each filters on its cache_node_type,
# multiplies avg freeable_memory by 100, divides by that node type's memory value, and alerts
# when the result is below the critical threshold.
resource "datadog_monitor" "memcached_free_memory" {
name = "[${var.environment}] Elasticache memcached free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
# Uses the monitor-specific message when set, otherwise the module-wide message.
message = "${coalesce(var.free_memory_message, var.message)}"
# One monitor instance per node type key in local.memory.
count = "${length(keys(local.memory))}"
type = "metric alert"
query = <<EOF
${var.free_memory_time_aggregator}(${var.free_memory_timeframe}): (
( avg:aws.elasticache.freeable_memory{dd_monitoring:enabled,dd_aws_elasticache_memcached:enabled,env:${var.environment},cache_node_type:${element(keys(local.memory), count.index)}} by {region,cacheclusterid,cachenodeid} * 100 ) /
${element(values(local.memory), count.index)}
) < ${var.free_memory_threshold_critical}
EOF
thresholds {
warning = "${var.free_memory_threshold_warning}"
critical = "${var.free_memory_threshold_critical}"
}
notify_no_data = false
evaluation_delay = "${var.delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = false
new_host_delay = "${var.delay}"
silenced = "${var.free_memory_silenced}"
tags = ["env:${var.environment}", "engine:memcached", "team:aws", "provider:aws"]
}

View File

@ -33,7 +33,6 @@ Creates DataDog monitors with the following checks:
* CPU high
* Commands received
* Replication lag
* Free memory
Inputs
------
@ -59,12 +58,6 @@ Inputs
| environment | Infrastructure Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| free_memory_message | Custom message for Elasticache redis free memory monitor | string | `` | no |
| free_memory_silenced | Groups to mute for Elasticache redis free memory monitor | map | `<map>` | no |
| free_memory_threshold_critical | Elasticache redis free memory critical threshold in percentage | string | `5` | no |
| free_memory_threshold_warning | Elasticache redis free memory warning threshold in percentage | string | `10` | no |
| free_memory_time_aggregator | Monitor aggregator for Elasticache redis free memory [available values: min, max or avg] | string | `min` | no |
| free_memory_timeframe | Monitor timeframe for Elasticache redis free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| nodes | Number of Elasticache nodes | string | - | yes |
| replication_lag_message | Custom message for Elasticache redis replication lag monitor | string | `` | no |

View File

@ -145,36 +145,3 @@ variable "commands_timeframe" {
description = "Monitor timeframe for Elasticache redis commands [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
default = "last_5m"
}
# Input: map of groups to mute on the redis free memory monitor; defaults to an empty map.
variable "free_memory_silenced" {
description = "Groups to mute for Elasticache redis free memory monitor"
type = "map"
default = {}
}
# Input: custom notification message for the redis free memory monitor; defaults to an empty string.
variable "free_memory_message" {
description = "Custom message for Elasticache redis free memory monitor"
type = "string"
default = ""
}
# Input: time aggregator for the redis free memory monitor; defaults to "min".
variable "free_memory_time_aggregator" {
description = "Monitor aggregator for Elasticache redis free memory [available values: min, max or avg]"
type = "string"
default = "min"
}
# Input: evaluation timeframe for the redis free memory monitor; defaults to "last_15m".
variable "free_memory_timeframe" {
description = "Monitor timeframe for Elasticache redis free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
default = "last_15m"
}
# Input: warning threshold (percentage) for the redis free memory monitor; defaults to 10.
variable "free_memory_threshold_warning" {
description = "Elasticache redis free memory warning threshold in percentage"
default = 10
}
# Input: critical threshold (percentage) for the redis free memory monitor; defaults to 5.
variable "free_memory_threshold_critical" {
description = "Elasticache redis free memory critical threshold in percentage"
default = 5
}

View File

@ -24,30 +24,4 @@ locals {
cache.r4.8xlarge = 32
cache.r4.16xlarge = 64
}
# Lookup table mapping each Elasticache node type to a memory figure.
# The redis free memory monitor divides by the matching value (itself divided by var.nodes).
# NOTE(review): values are presumably total node memory in bytes — confirm against AWS node specifications.
memory = {
cache.t2.micro = 595926712
cache.t2.small = 1664299827
cache.t2.medium = 3457448673
cache.m3.medium = 2985002270
cache.m3.large = 6496138035
cache.m3.xlarge = 14280766259
cache.m3.2xlarge = 29957396889
cache.m4.large = 6893422510
cache.m4.xlarge = 15333033246
cache.m4.2xlarge = 31890132172
cache.m4.4xlarge = 65262028062
cache.m4.10xlarge = 166043435663
cache.r3.large = 14495514624
cache.r3.xlarge = 30494267801
cache.r3.2xlarge = 62491774156
cache.r3.4xlarge = 126701535232
cache.r3.8xlarge = 254476812288
cache.r4.large = 13207024435
cache.r4.xlarge = 26897232691
cache.r4.2xlarge = 54191749857
cache.r4.4xlarge = 108855946117
cache.r4.8xlarge = 218248763146
cache.r4.16xlarge = 437012922368
}
}

View File

@ -137,38 +137,3 @@ resource "datadog_monitor" "redis_commands" {
tags = ["env:${var.environment}", "engine:redis", "team:aws", "provider:aws"]
}
# Datadog metric alert on redis free memory, expressed as a percentage of the node type's memory.
# One monitor is created per entry of local.memory (count); each filters on its cache_node_type,
# multiplies avg freeable_memory by 100, divides by (node type memory / var.nodes), and alerts
# when the result is below the critical threshold.
resource "datadog_monitor" "redis_free_memory" {
name = "[${var.environment}] Elasticache redis free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
# Uses the monitor-specific message when set, otherwise the module-wide message.
message = "${coalesce(var.free_memory_message, var.message)}"
# One monitor instance per node type key in local.memory.
count = "${length(keys(local.memory))}"
type = "metric alert"
query = <<EOF
${var.free_memory_time_aggregator}(${var.free_memory_timeframe}): (
( avg:aws.elasticache.freeable_memory{dd_monitoring:enabled,dd_aws_elasticache_redis:enabled,env:${var.environment},cache_node_type:${element(keys(local.memory), count.index)}} by {region,cacheclusterid,cachenodeid} * 100 ) /
( ${element(values(local.memory), count.index)} / ${var.nodes} )
) < ${var.free_memory_threshold_critical}
EOF
thresholds {
warning = "${var.free_memory_threshold_warning}"
critical = "${var.free_memory_threshold_critical}"
}
notify_no_data = false
evaluation_delay = "${var.delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = false
new_host_delay = "${var.delay}"
silenced = "${var.free_memory_silenced}"
tags = ["env:${var.environment}", "engine:redis", "team:aws", "provider:aws"]
}