From cf66c5c7f39adcaa2a7c6f8fb85f5a4984e17c95 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 15 May 2018 18:33:56 +0200 Subject: [PATCH] MON-32 - Memcached monitors added --- cloud/aws/elasticache/common/README.md | 4 +- cloud/aws/elasticache/memcached/README.md | 25 +++++--- cloud/aws/elasticache/memcached/inputs.tf | 61 ++++++++++++++----- .../memcached/monitors-memcached.tf | 58 ++++++++++++++---- 4 files changed, 113 insertions(+), 35 deletions(-) diff --git a/cloud/aws/elasticache/common/README.md b/cloud/aws/elasticache/common/README.md index ec333e4..56c77d3 100644 --- a/cloud/aws/elasticache/common/README.md +++ b/cloud/aws/elasticache/common/README.md @@ -16,9 +16,11 @@ module "datadog-monitors-aws-elasticache" { ``` +This module is used by default when you define `memcached` or `redis` monitors + Purpose ------- -Creates DataDog monitors with the following checks : +Creates DataDog monitors with the following checks: * Eviction diff --git a/cloud/aws/elasticache/memcached/README.md b/cloud/aws/elasticache/memcached/README.md index c73e482..3e280d3 100644 --- a/cloud/aws/elasticache/memcached/README.md +++ b/cloud/aws/elasticache/memcached/README.md @@ -18,28 +18,39 @@ Purpose ------- Creates DataDog monitors with the following checks : -* Get requests missed +* CPU High +* Swap Inputs ------ | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| cpu_high_aggregator | Monitor aggregator for Elasticache memcached cpu high [available values: min, max, sum or avg] | string | `min` | no | +| cpu_high_message | Custom message for Elasticache memcached cpu high monitor | string | `` | no | +| cpu_high_silenced | Groups to mute for Elasticache memcached cpu high monitor | map | `` | no | +| cpu_high_threshold_critical | Elasticache memcached cpu high critical threshold in percentage | string | `90` | no | +| cpu_high_threshold_warning | Elasticache memcached cpu high warning threshold in percentage | 
string | `75` | no | +| cpu_high_timeframe | Monitor timeframe for Elasticache memcached cpu high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | | delay | Delay in seconds for the metric evaluation | string | `900` | no | | environment | Architecture Environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| get_requests_miss_aggregator | Monitor aggregator for Elasticache Memcached get requests missed [available values: min, max, sum or avg] | string | `min` | no | -| get_requests_miss_message | Custom message for Elasticache Memcached get requests missed monitor | string | `` | no | -| get_requests_miss_silenced | Groups to mute for Elasticache Memcached get requests missed monitor | map | `` | no | -| get_requests_miss_threshold_critical | Elasticache Memcached get requests missed critical threshold in percentage | string | `95` | no | -| get_requests_miss_threshold_warning | Elasticache Memcached get requests missed warning threshold in percentage | string | `80` | no | -| get_requests_miss_timeframe | Monitor timeframe for Elasticache Memcached get requests missed [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | | message | Message sent when an alert is triggered | string | - | yes | +| swap_aggregator | Monitor aggregator for Elasticache memcached swap [available values: min, max, sum or avg] | string | `min` | no | +| swap_message | Custom message for Elasticache memcached swap monitor | string | `` | no | +| swap_silenced | Groups to mute for Elasticache memcached swap monitor | map | `` | no | +| swap_threshold_critical | Elasticache memcached swap critical threshold in MB | string | `50` | no | +| swap_threshold_warning | 
Elasticache memcached swap warning threshold in MB | string | `0` | no | +| swap_timeframe | Monitor timeframe for Elasticache memcached swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | Related documentation --------------------- DataDog documentation: [https://docs.datadoghq.com/integrations/amazon_elasticache/](https://docs.datadoghq.com/integrations/amazon_elasticache/) +And more here: + +* [https://www.datadoghq.com/blog/monitoring-elasticache-performance-metrics-with-redis-or-memcached/](https://www.datadoghq.com/blog/monitoring-elasticache-performance-metrics-with-redis-or-memcached/) + AWS ElasticSearch Service Instance metrics documentation: [https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/elasticache-metricscollected.html](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/elasticache-metricscollected.html) diff --git a/cloud/aws/elasticache/memcached/inputs.tf b/cloud/aws/elasticache/memcached/inputs.tf index 987e401..8510bd1 100644 --- a/cloud/aws/elasticache/memcached/inputs.tf +++ b/cloud/aws/elasticache/memcached/inputs.tf @@ -25,35 +25,68 @@ variable "filter_tags_custom" { } # Memcached specific -variable "get_requests_miss_silenced" { - description = "Groups to mute for Elasticache Memcached get requests missed monitor" +variable "cpu_high_silenced" { + description = "Groups to mute for Elasticache memcached cpu high monitor" type = "map" default = {} } -variable "get_requests_miss_message" { - description = "Custom message for Elasticache Memcached get requests missed monitor" +variable "cpu_high_message" { + description = "Custom message for Elasticache memcached cpu high monitor" type = "string" default = "" } -variable "get_requests_miss_aggregator" { - description = "Monitor aggregator for Elasticache Memcached get requests missed [available values: min, max, sum or avg]" +variable "cpu_high_aggregator" { + description = "Monitor 
aggregator for Elasticache memcached cpu high [available values: min, max, sum or avg]" type = "string" default = "min" } -variable "get_requests_miss_timeframe" { - description = "Monitor timeframe for Elasticache Memcached get requests missed [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" +variable "cpu_high_timeframe" { + description = "Monitor timeframe for Elasticache memcached cpu high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" default = "last_15m" } -variable "get_requests_miss_threshold_warning" { - description = "Elasticache Memcached get requests missed warning threshold in percentage" - default = 80 +variable "cpu_high_threshold_warning" { + description = "Elasticache memcached cpu high warning threshold in percentage" + default = 75 } -variable "get_requests_miss_threshold_critical" { - description = "Elasticache Memcached get requests missed critical threshold in percentage" - default = 95 +variable "cpu_high_threshold_critical" { + description = "Elasticache memcached cpu high critical threshold in percentage" + default = 90 +} + +variable "swap_silenced" { + description = "Groups to mute for Elasticache memcached swap monitor" + type = "map" + default = {} +} + +variable "swap_message" { + description = "Custom message for Elasticache memcached swap monitor" + type = "string" + default = "" +} + +variable "swap_aggregator" { + description = "Monitor aggregator for Elasticache memcached swap [available values: min, max, sum or avg]" + type = "string" + default = "min" +} + +variable "swap_timeframe" { + description = "Monitor timeframe for Elasticache memcached swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_5m" +} + +variable "swap_threshold_warning" { + description = "Elasticache memcached swap warning threshold in MB" + default = 0 +} + +variable "swap_threshold_critical" { + 
description = "Elasticache memcached swap critical threshold in MB" + default = 50 } diff --git a/cloud/aws/elasticache/memcached/monitors-memcached.tf b/cloud/aws/elasticache/memcached/monitors-memcached.tf index 7cbe296..62264ad 100644 --- a/cloud/aws/elasticache/memcached/monitors-memcached.tf +++ b/cloud/aws/elasticache/memcached/monitors-memcached.tf @@ -6,30 +6,30 @@ data "template_file" "filter" { } } -module "datadog-monitors-aws-elasticcache" { - source = "../." +module "datadog-monitors-aws-elasticcache-common" { + source = "../common" message = "${var.message}" environment = "${var.environment}" + filter_tags = "${data.template_file.filter.rendered}" + resource = "memcached" } -resource "datadog_monitor" "memcached_get_miss" { - name = "[${var.environment}] Elasticache Memcached get requests missed {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" - message = "${coalesce(var.get_requests_miss_message, var.message)}" +resource "datadog_monitor" "memcached_cpu_high" { + name = "[${var.environment}] Elasticache memcached CPU {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.cpu_high_message, var.message)}" type = "metric alert" query = < ${var.get_requests_miss_threshold_critical} + ${var.cpu_high_aggregator}(${var.cpu_high_timeframe}): ( + ${var.cpu_high_aggregator}:aws.elasticache.cpuutilization{${data.template_file.filter.rendered}} by {region,cluster,node} + ) > ${var.cpu_high_threshold_critical} EOF thresholds { - warning = "${var.get_requests_miss_threshold_warning}" - critical = "${var.get_requests_miss_threshold_critical}" + warning = "${var.cpu_high_threshold_warning}" + critical = "${var.cpu_high_threshold_critical}" } notify_no_data = true @@ -42,7 +42,39 @@ resource "datadog_monitor" "memcached_get_miss" { 
require_full_window = false new_host_delay = "${var.delay}" - silenced = "${var.get_requests_miss_silenced}" + silenced = "${var.cpu_high_silenced}" + + tags = ["env:${var.environment}", "resource:memcached", "team:aws", "provider:aws"] +} + +resource "datadog_monitor" "memcached_swap" { + name = "[${var.environment}] Elasticache memcached swap {{#is_alert}}{{{comparator}}} {{threshold}}MB ({{value}}MB){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}MB ({{value}}MB){{/is_warning}}" + message = "${coalesce(var.swap_message, var.message)}" + + type = "metric alert" + + query = <<EOF + ${var.swap_aggregator}(${var.swap_timeframe}): ( + ${var.swap_aggregator}:aws.elasticache.swap_usage{${data.template_file.filter.rendered}} by {region,cluster,node} + ) > ${var.swap_threshold_critical} + EOF + + thresholds { + warning = "${var.swap_threshold_warning}" + critical = "${var.swap_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + new_host_delay = "${var.delay}" + + silenced = "${var.swap_silenced}" tags = ["env:${var.environment}", "resource:memcached", "team:aws", "provider:aws"] }