MON-32 - Memory usage monitor updated

This commit is contained in:
Alexandre Gaillet 2018-05-26 02:41:37 +02:00 committed by Quentin Manfroi
parent 299beab1a4
commit 9c4f5b4cfd
5 changed files with 39 additions and 35 deletions

View File

@ -61,7 +61,7 @@ variable "max_connection_message" {
# Time aggregator setting for the Elasticache max connection monitor
# (the description lists min, max or avg as the accepted values).
# NOTE(review): this is a rendered diff hunk without +/- markers. Per the
# hunk header (-61,7 +61,7) exactly one line changed: the first `default`
# below is the removed value and the second is the added one.
variable "max_connection_time_aggregator" {
description = "Monitor aggregator for Elasticache max connection [available values: min, max or avg]"
type = "string"
default = "min"
default = "max"
}
variable "max_connection_timeframe" {

View File

@ -1,27 +1,27 @@
# Lookup table keyed by Elasticache node type (cache.<family>.<size>).
# The free-memory monitors iterate over this map with count/element(): each
# key is used as a `cache_node_type:` filter and each value is the divisor of
# aws.elasticache.freeable_memory before the result is multiplied by 100.
# NOTE(review): values are presumably the usable memory of each node type in
# bytes -- confirm against the AWS ElastiCache node-type documentation.
# NOTE(review): this is a rendered diff hunk without +/- markers. Per the
# hunk header (-1,27 +1,27) the 23 quoted-string entries are the removed side
# and the 23 bare-number entries that follow are the added side; the numeric
# values themselves are unchanged between the two sides.
locals {
memory = {
# Removed side of the diff: values written as quoted strings.
cache.t2.micro = "595926712"
cache.t2.small = "1664299827"
cache.t2.medium = "3457448673"
cache.m3.medium = "2985002270"
cache.m3.large = "6496138035"
cache.m3.xlarge = "14280766259"
cache.m3.2xlarge = "29957396889"
cache.m4.large = "6893422510"
cache.m4.xlarge = "15333033246"
cache.m4.2xlarge = "31890132172"
cache.m4.4xlarge = "65262028062"
cache.m4.10xlarge = "166043435663"
cache.r3.large = "14495514624"
cache.r3.xlarge = "30494267801"
cache.r3.2xlarge = "62491774156"
cache.r3.4xlarge = "126701535232"
cache.r3.8xlarge = "254476812288"
cache.r4.large = "13207024435"
cache.r4.xlarge = "26897232691"
cache.r4.2xlarge = "54191749857"
cache.r4.4xlarge = "108855946117"
cache.r4.8xlarge = "218248763146"
cache.r4.16xlarge = "437012922368"
# Added side of the diff: same keys and values written as bare numbers.
cache.t2.micro = 595926712
cache.t2.small = 1664299827
cache.t2.medium = 3457448673
cache.m3.medium = 2985002270
cache.m3.large = 6496138035
cache.m3.xlarge = 14280766259
cache.m3.2xlarge = 29957396889
cache.m4.large = 6893422510
cache.m4.xlarge = 15333033246
cache.m4.2xlarge = 31890132172
cache.m4.4xlarge = 65262028062
cache.m4.10xlarge = 166043435663
cache.r3.large = 14495514624
cache.r3.xlarge = 30494267801
cache.r3.2xlarge = 62491774156
cache.r3.4xlarge = 126701535232
cache.r3.8xlarge = 254476812288
cache.r4.large = 13207024435
cache.r4.xlarge = 26897232691
cache.r4.2xlarge = 54191749857
cache.r4.4xlarge = 108855946117
cache.r4.8xlarge = 218248763146
cache.r4.16xlarge = 437012922368
}
}

View File

@ -117,13 +117,15 @@ resource "datadog_monitor" "memcached_free_memory" {
name = "[${var.environment}] Elasticache memcached free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.free_memory_message, var.message)}"
count = "${length(keys(local.memory))}"
type = "metric alert"
query = <<EOF
${var.free_memory_time_aggregator}(${var.free_memory_timeframe}): (
avg:aws.elasticache.freeable_memory{${data.template_file.filter.rendered}} by {region,cacheclusterid,cachenodeid} /
${local.memory[var.elasticache_size]} * 100
) < ${var.free_memory_threshold_critical}
avg:aws.elasticache.freeable_memory{dd_monitoring:enabled,dd_aws_red:enabled,env:${var.environment},cache_node_type:${element(keys(local.memory), count.index)}} by {region,cacheclusterid,cachenodeid} /
${element(values(local.memory), count.index)}
) * 100 < ${var.free_memory_threshold_critical}
EOF
thresholds {
@ -131,7 +133,7 @@ resource "datadog_monitor" "memcached_free_memory" {
critical = "${var.free_memory_threshold_critical}"
}
notify_no_data = true
notify_no_data = false
evaluation_delay = "${var.delay}"
renotify_interval = 0
notify_audit = false

View File

@ -78,7 +78,7 @@ variable "cpu_high_message" {
# Time aggregator setting for the Elasticache redis "cpu high" monitor
# (the description lists min, max or avg as the accepted values).
# NOTE(review): this is a rendered diff hunk without +/- markers. Per the
# hunk header (-78,7 +78,7) exactly one line changed: the first `default`
# below is the removed value and the second is the added one.
variable "cpu_high_time_aggregator" {
description = "Monitor aggregator for Elasticache redis cpu high [available values: min, max or avg]"
type = "string"
default = "min"
default = "avg"
}
variable "cpu_high_timeframe" {

View File

@ -26,7 +26,7 @@ resource "datadog_monitor" "redis_cache_hits" {
avg:aws.elasticache.cache_hits{${data.template_file.filter.rendered}} by {region,cacheclusterid}.as_count() /
(avg:aws.elasticache.cache_hits{${data.template_file.filter.rendered}} by {region,cacheclusterid}.as_count() +
avg:aws.elasticache.cache_misses{${data.template_file.filter.rendered}} by {region,cacheclusterid}.as_count())
) < ${var.cache_hits_threshold_critical}
) * 100 < ${var.cache_hits_threshold_critical}
EOF
thresholds {
@ -145,8 +145,8 @@ resource "datadog_monitor" "redis_commands" {
query = <<EOF
sum(${var.commands_timeframe}): (
avg:aws.elasticache.get_type_cmds{${data.template_file.filter.rendered}} by {region,cacheclusterid,cachenodeid}.as_count() +
avg:aws.elasticache.set_type_cmds{${data.template_file.filter.rendered}} by {region,cacheclusterid,cachenodeid}.as_count()
avg:aws.elasticache.get_type_cmds{${data.template_file.filter.rendered}} by {region,cacheclusterid}.as_count() +
avg:aws.elasticache.set_type_cmds{${data.template_file.filter.rendered}} by {region,cacheclusterid}.as_count()
) <= 0
EOF
@ -169,12 +169,14 @@ resource "datadog_monitor" "redis_free_memory" {
name = "[${var.environment}] Elasticache redis free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.free_memory_message, var.message)}"
count = "${length(keys(local.memory))}"
type = "metric alert"
query = <<EOF
${var.free_memory_time_aggregator}(${var.free_memory_timeframe}): (
avg:aws.elasticache.freeable_memory{${data.template_file.filter.rendered}} by {region,cacheclusterid,cachenodeid} /
( ${local.memory[var.elasticache_size]} / ${var.nodes} )
avg:aws.elasticache.freeable_memory{dd_monitoring:enabled,dd_aws_red:enabled,env:${var.environment},cache_node_type:${element(keys(local.memory), count.index)}} by {region,cacheclusterid,cachenodeid} /
( ${element(values(local.memory), count.index)} / ${var.nodes} )
) * 100 < ${var.free_memory_threshold_critical}
EOF
@ -183,7 +185,7 @@ resource "datadog_monitor" "redis_free_memory" {
critical = "${var.free_memory_threshold_critical}"
}
notify_no_data = true
notify_no_data = false
evaluation_delay = "${var.delay}"
renotify_interval = 0
notify_audit = false