MON-32 - Free memory monitors added

This commit is contained in:
Alexandre Gaillet 2018-05-17 18:05:03 +02:00 committed by Quentin Manfroi
parent 7f60d8844d
commit f118f36161
9 changed files with 244 additions and 11 deletions

View File

@ -8,8 +8,9 @@ How to use this module
module "datadog-monitors-aws-elasticache-memcached" {
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/aws/elasticache/memcached?ref={revision}"
message = "${module.datadog-message-alerting.alerting-message}"
environment = "${var.environment}"
message = "${module.datadog-message-alerting.alerting-message}"
environment = "${var.environment}"
elasticache_size = "${var.size_of_elasticache}"
}
```
@ -21,6 +22,7 @@ Creates DataDog monitors with the following checks :
* Get Hit
* CPU High
* Swap
* Free memory
Inputs
------
@ -34,9 +36,16 @@ Inputs
| cpu_high_threshold_warning | Elasticache memcached cpu high warning threshold in percentage | string | `75` | no |
| cpu_high_timeframe | Monitor timeframe for Elasticache memcached cpu high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
| elasticache_size | Size of the Elasticache instance | string | - | yes |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| free_memory_aggregator | Monitor aggregator for Elasticache memcached free memory [available values: min, max, sum or avg] | string | `min` | no |
| free_memory_message | Custom message for Elasticache memcached free memory monitor | string | `` | no |
| free_memory_silenced | Groups to mute for Elasticache memcached free memory monitor | map | `<map>` | no |
| free_memory_threshold_critical | Elasticache memcached free memory critical threshold in percentage | string | `5` | no |
| free_memory_threshold_warning | Elasticache memcached free memory warning threshold in percentage | string | `10` | no |
| free_memory_timeframe | Monitor timeframe for Elasticache memcached free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| get_hits_aggregator | Monitor aggregator for Elasticache memcached get hits [available values: min, max, sum or avg] | string | `min` | no |
| get_hits_message | Custom message for Elasticache memcached get hits monitor | string | `` | no |
| get_hits_silenced | Groups to mute for Elasticache memcached get hits monitor | map | `<map>` | no |

View File

@ -24,6 +24,12 @@ variable "filter_tags_custom" {
default = "*"
}
# elasticache variable
# Elasticache node type of the monitored cluster (required, no default).
# The free memory monitor uses it as the key into var.memory.
variable "elasticache_size" {
  type        = "string"
  description = "Size of the Elasticache instance"
}
# Memcached specific
variable "get_hits_silenced" {
description = "Groups to mute for Elasticache memcached get hits monitor"
@ -123,3 +129,36 @@ variable "swap_threshold_critical" {
description = "Elasticache memcached swap critical threshold in percentage"
default = 50
}
# Mute map passed straight to the `silenced` argument of the memcached
# free memory monitor. Empty by default, i.e. nothing is muted.
variable "free_memory_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Elasticache memcached free memory monitor"
}
# Optional monitor-specific notification text. The monitor resolves its message
# with coalesce(var.free_memory_message, var.message), so the empty default
# falls back to the module-wide message.
variable "free_memory_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Elasticache memcached free memory monitor"
}
# Aggregation function name interpolated twice into the monitor query:
# once in front of the timeframe and once in front of the metric name.
variable "free_memory_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Elasticache memcached free memory [available values: min, max, sum or avg]"
}
# Evaluation window interpolated into the monitor query, as in
# "<aggregator>(<timeframe>): ...".
variable "free_memory_timeframe" {
  default     = "last_15m"
  description = "Monitor timeframe for Elasticache memcached free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Warning level (percent of free memory) written to the monitor's
# thresholds.warning.
variable "free_memory_threshold_warning" {
  default     = 10
  description = "Elasticache memcached free memory warning threshold in percentage"
}
# Critical level (percent of free memory). It is used both as the right-hand
# side of the "<" comparison in the monitor query and as thresholds.critical.
variable "free_memory_threshold_critical" {
  default     = 5
  description = "Elasticache memcached free memory critical threshold in percentage"
}

View File

@ -0,0 +1,31 @@
# Lookup table from Elasticache node type to the memory figure used as the
# denominator of the free memory monitor query (var.memory[var.elasticache_size]).
# Values are presumably bytes, to match aws.elasticache.freeable_memory — TODO confirm.
#
# Keys are quoted on purpose: HCL1 reads a quoted key as the same string as the
# bare dotted form, while HCL2 (Terraform >= 0.12) does not accept bare dotted
# keys in a map literal.
variable "memory" {
  type        = "map"
  description = "Mapping between Elasticache size and Memory."

  default = {
    "cache.t2.micro"    = "595926712"
    "cache.t2.small"    = "1664299827"
    "cache.t2.medium"   = "3457448673"
    "cache.m3.medium"   = "2985002270"
    "cache.m3.large"    = "6496138035"
    "cache.m3.xlarge"   = "14280766259"
    "cache.m3.2xlarge"  = "29957396889"
    "cache.m4.large"    = "6893422510"
    "cache.m4.xlarge"   = "15333033246"
    "cache.m4.2xlarge"  = "31890132172"
    "cache.m4.4xlarge"  = "65262028062"
    "cache.m4.10xlarge" = "166043435663"
    "cache.r3.large"    = "14495514624"
    "cache.r3.xlarge"   = "30494267801"
    "cache.r3.2xlarge"  = "62491774156"
    "cache.r3.4xlarge"  = "126701535232"
    "cache.r3.8xlarge"  = "254476812288"
    "cache.r4.large"    = "13207024435"
    "cache.r4.xlarge"   = "26897232691"
    "cache.r4.2xlarge"  = "54191749857"
    "cache.r4.4xlarge"  = "108855946117"
    "cache.r4.8xlarge"  = "218248763146"
    "cache.r4.16xlarge" = "437012922368"
  }
}

View File

@ -112,3 +112,36 @@ resource "datadog_monitor" "memcached_swap" {
tags = ["env:${var.environment}", "resource:memcached", "team:aws", "provider:aws"]
}
# Free memory monitor for Elasticache memcached.
#
# The query divides aws.elasticache.freeable_memory (grouped by region, cluster
# and node) by the var.memory entry selected through var.elasticache_size,
# multiplies by 100 and compares the result with the critical threshold.
resource "datadog_monitor" "memcached_free_memory" {
  type    = "metric alert"
  name    = "[${var.environment}] Elasticache memcached free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  message = "${coalesce(var.free_memory_message, var.message)}"

  # Heredoc body is left unindented so the rendered query text is unchanged.
  query = <<EOF
${var.free_memory_aggregator}(${var.free_memory_timeframe}): (
${var.free_memory_aggregator}:aws.elasticache.freeable_memory{${data.template_file.filter.rendered}} by {region,cluster,node} /
${var.memory[var.elasticache_size]} * 100
) < ${var.free_memory_threshold_critical}
EOF

  # The critical value is the same variable used in the query comparison above.
  thresholds {
    critical = "${var.free_memory_threshold_critical}"
    warning  = "${var.free_memory_threshold_warning}"
  }

  # Evaluation timing: both delays come from the shared var.delay.
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = false

  # Notification behaviour.
  notify_no_data    = true
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false

  silenced = "${var.free_memory_silenced}"
  tags     = ["env:${var.environment}", "resource:memcached", "team:aws", "provider:aws"]
}

View File

@ -8,9 +8,19 @@ How to use this module
module "datadog-monitors-aws-elasticache-redis" {
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/aws/elasticache/redis?ref={revision}"
message = "${module.datadog-message-alerting.alerting-message}"
environment = "${var.environment}"
redis_size = "${var.size_of_redis}"
message = "${module.datadog-message-alerting.alerting-message}"
environment = "${var.environment}"
elasticache_size = "${var.size_of_elasticache}"
nodes = "${data.aws_elasticache_cluster.my_cluster.num_cache_nodes}"
}
```
You can retrieve the number of nodes using the `aws_elasticache_cluster` data source:
```
data "aws_elasticache_cluster" "my_cluster" {
cluster_id = "my-cluster-id"
}
```
@ -24,6 +34,7 @@ Creates DataDog monitors with the following checks:
* Commands received
* Replication lag
* Swap
* Free memory
Inputs
------
@ -47,11 +58,18 @@ Inputs
| cpu_high_threshold_warning | Elasticache redis cpu high warning threshold in percentage | string | `75` | no |
| cpu_high_timeframe | Monitor timeframe for Elasticache redis cpu high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
| elasticache_size | Size of the Elasticache instance | string | - | yes |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| free_memory_aggregator | Monitor aggregator for Elasticache redis free memory [available values: min, max, sum or avg] | string | `min` | no |
| free_memory_message | Custom message for Elasticache redis free memory monitor | string | `` | no |
| free_memory_silenced | Groups to mute for Elasticache redis free memory monitor | map | `<map>` | no |
| free_memory_threshold_critical | Elasticache redis free memory critical threshold in percentage | string | `5` | no |
| free_memory_threshold_warning | Elasticache redis free memory warning threshold in percentage | string | `10` | no |
| free_memory_timeframe | Monitor timeframe for Elasticache redis free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| redis_size | Size of the Elasticache redis instance | string | - | yes |
| nodes | Number of Elasticache nodes | string | - | yes |
| replication_lag_aggregator | Monitor aggregator for Elasticache redis replication lag [available values: min, max, sum or avg] | string | `min` | no |
| replication_lag_message | Custom message for Elasticache redis replication lag monitor | string | `` | no |
| replication_lag_silenced | Groups to mute for Elasticache redis replication lag monitor | map | `<map>` | no |

View File

@ -1,7 +1,7 @@
variable "core" {
type = "map"
description = "Mapping between Redis size and vCPU."
description = "Mapping between Elasticache size and vCPU."
default = {
cache.t2.micro = "1"

View File

@ -24,12 +24,18 @@ variable "filter_tags_custom" {
default = "*"
}
# redis specific
variable "redis_size" {
description = "Size of the Elasticache redis instance"
# elasticache variable
# Elasticache node type of the monitored cluster (required, no default).
# The monitors use it as the key into the var.core and var.memory lookup maps.
variable "elasticache_size" {
  type        = "string"
  description = "Size of the Elasticache instance"
}
# Node count of the monitored cluster (required, no default).
# The redis free memory query divides the var.memory entry by this value.
variable "nodes" {
  type        = "string"
  description = "Number of Elasticache nodes"
}
# redis specific
variable "cache_hits_silenced" {
description = "Groups to mute for Elasticache redis cache hits monitor"
type = "map"
@ -174,3 +180,36 @@ variable "commands_timeframe" {
description = "Monitor timeframe for Elasticache redis commands [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
default = "last_5m"
}
# Mute map passed straight to the `silenced` argument of the redis
# free memory monitor. Empty by default, i.e. nothing is muted.
variable "free_memory_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Elasticache redis free memory monitor"
}
# Optional monitor-specific notification text. The monitor resolves its message
# with coalesce(var.free_memory_message, var.message), so the empty default
# falls back to the module-wide message.
variable "free_memory_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Elasticache redis free memory monitor"
}
# Aggregation function name interpolated twice into the monitor query:
# once in front of the timeframe and once in front of the metric name.
variable "free_memory_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Elasticache redis free memory [available values: min, max, sum or avg]"
}
# Evaluation window interpolated into the monitor query, as in
# "<aggregator>(<timeframe>): ...".
variable "free_memory_timeframe" {
  default     = "last_15m"
  description = "Monitor timeframe for Elasticache redis free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Warning level (percent of free memory) written to the monitor's
# thresholds.warning.
variable "free_memory_threshold_warning" {
  default     = 10
  description = "Elasticache redis free memory warning threshold in percentage"
}
# Critical level (percent of free memory). It is used both as the right-hand
# side of the "<" comparison in the monitor query and as thresholds.critical.
variable "free_memory_threshold_critical" {
  default     = 5
  description = "Elasticache redis free memory critical threshold in percentage"
}

View File

@ -0,0 +1,31 @@
# Lookup table from Elasticache node type to the memory figure used in the
# denominator of the free memory monitor query (var.memory[var.elasticache_size]).
# Values are presumably bytes, to match aws.elasticache.freeable_memory — TODO confirm.
#
# Keys are quoted on purpose: HCL1 reads a quoted key as the same string as the
# bare dotted form, while HCL2 (Terraform >= 0.12) does not accept bare dotted
# keys in a map literal.
variable "memory" {
  type        = "map"
  description = "Mapping between Elasticache size and Memory."

  default = {
    "cache.t2.micro"    = "595926712"
    "cache.t2.small"    = "1664299827"
    "cache.t2.medium"   = "3457448673"
    "cache.m3.medium"   = "2985002270"
    "cache.m3.large"    = "6496138035"
    "cache.m3.xlarge"   = "14280766259"
    "cache.m3.2xlarge"  = "29957396889"
    "cache.m4.large"    = "6893422510"
    "cache.m4.xlarge"   = "15333033246"
    "cache.m4.2xlarge"  = "31890132172"
    "cache.m4.4xlarge"  = "65262028062"
    "cache.m4.10xlarge" = "166043435663"
    "cache.r3.large"    = "14495514624"
    "cache.r3.xlarge"   = "30494267801"
    "cache.r3.2xlarge"  = "62491774156"
    "cache.r3.4xlarge"  = "126701535232"
    "cache.r3.8xlarge"  = "254476812288"
    "cache.r4.large"    = "13207024435"
    "cache.r4.xlarge"   = "26897232691"
    "cache.r4.2xlarge"  = "54191749857"
    "cache.r4.4xlarge"  = "108855946117"
    "cache.r4.8xlarge"  = "218248763146"
    "cache.r4.16xlarge" = "437012922368"
  }
}

View File

@ -58,7 +58,7 @@ resource "datadog_monitor" "redis_cpu_high" {
query = <<EOF
${var.cpu_high_aggregator}(${var.cpu_high_timeframe}): (
${var.cpu_high_aggregator}:aws.elasticache.cpuutilization{${data.template_file.filter.rendered}} by {region,cluster,node}
) > ( ${var.cpu_high_threshold_critical} / ${var.core[var.redis_size]} )
) > ( ${var.cpu_high_threshold_critical} / ${var.core[var.elasticache_size]} )
EOF
thresholds {
@ -167,3 +167,36 @@ resource "datadog_monitor" "redis_commands" {
tags = ["env:${var.environment}", "resource:redis", "team:aws", "provider:aws"]
}
# Free memory monitor for Elasticache redis.
#
# The query divides aws.elasticache.freeable_memory (grouped by region, cluster
# and node) by (var.memory[var.elasticache_size] / var.nodes), multiplies by 100
# and compares the result with the critical threshold.
#
# NOTE(review): the metric is already grouped by node, yet the memory figure is
# additionally divided by var.nodes — confirm this denominator is intended.
resource "datadog_monitor" "redis_free_memory" {
  type    = "metric alert"
  name    = "[${var.environment}] Elasticache redis free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  message = "${coalesce(var.free_memory_message, var.message)}"

  # Heredoc body is left unindented so the rendered query text is unchanged.
  query = <<EOF
${var.free_memory_aggregator}(${var.free_memory_timeframe}): (
${var.free_memory_aggregator}:aws.elasticache.freeable_memory{${data.template_file.filter.rendered}} by {region,cluster,node} /
( ${var.memory[var.elasticache_size]} / ${var.nodes} )
) * 100 < ${var.free_memory_threshold_critical}
EOF

  # The critical value is the same variable used in the query comparison above.
  thresholds {
    critical = "${var.free_memory_threshold_critical}"
    warning  = "${var.free_memory_threshold_warning}"
  }

  # Evaluation timing: both delays come from the shared var.delay.
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = false

  # Notification behaviour.
  notify_no_data    = true
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false

  silenced = "${var.free_memory_silenced}"
  tags     = ["env:${var.environment}", "resource:redis", "team:aws", "provider:aws"]
}