MON-271: More Redis monitors

This commit is contained in:
Jérôme Respaut 2018-07-20 18:07:56 +02:00 committed by Quentin Manfroi
parent ac4ec07788
commit 7224c97ac5
4 changed files with 397 additions and 0 deletions

View File

@ -20,6 +20,11 @@ Creates DataDog monitors with the following checks:
- Redis too many expired keys - Redis too many expired keys
- Redis too many blocked clients - Redis too many blocked clients
- Redis keyspace seems full - Redis keyspace seems full
- Redis too many ram memory used
- Redis memory ram fragmented
- Redis too many rejected connections
- Redis latency is too high
- Redis hitrate is too low
## Inputs ## Inputs
@ -47,14 +52,44 @@ Creates DataDog monitors with the following checks:
| expirations_silenced | Groups to mute for Redis keys expirations monitor | map | `<map>` | no | | expirations_silenced | Groups to mute for Redis keys expirations monitor | map | `<map>` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| hitrate_message | Custom message for Redis hitrate monitor | string | `` | no |
| hitrate_silenced | Groups to mute for Redis hitrate monitor | map | `<map>` | no |
| hitrate_threshold_critical | hitrate limit (critical threshold) | string | `90` | no |
| hitrate_threshold_warning | hitrate limit (warning threshold) | string | `70` | no |
| hitrate_time_aggregator | Monitor aggregator for Redis hitrate [available values: min, max or avg] | string | `min` | no |
| hitrate_timeframe | Monitor timeframe for Redis hitrate [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| keyspace_message | Custom message for Redis keyspace monitor | string | `` | no | | keyspace_message | Custom message for Redis keyspace monitor | string | `` | no |
| keyspace_silenced | Groups to mute for Redis keyspace monitor | map | `<map>` | no | | keyspace_silenced | Groups to mute for Redis keyspace monitor | map | `<map>` | no |
| keyspace_threshold_critical | Keyspace changement (critical threshold) | string | `90` | no | | keyspace_threshold_critical | Keyspace changement (critical threshold) | string | `90` | no |
| keyspace_threshold_warning | Keyspace changement (warning threshold) | string | `70` | no | | keyspace_threshold_warning | Keyspace changement (warning threshold) | string | `70` | no |
| keyspace_time_aggregator | Monitor aggregator for Redis keyspace [available values: min, max or avg] | string | `min` | no | | keyspace_time_aggregator | Monitor aggregator for Redis keyspace [available values: min, max or avg] | string | `min` | no |
| keyspace_timeframe | Monitor timeframe for Redis keyspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | keyspace_timeframe | Monitor timeframe for Redis keyspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| latency_message | Custom message for Redis latency monitor | string | `` | no |
| latency_silenced | Groups to mute for Redis latency monitor | map | `<map>` | no |
| latency_threshold_critical | latency limit (critical threshold) | string | `90` | no |
| latency_threshold_warning | latency limit (warning threshold) | string | `70` | no |
| latency_time_aggregator | Monitor aggregator for Redis latency [available values: min, max or avg] | string | `min` | no |
| latency_timeframe | Monitor timeframe for Redis latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| mem_frag_message | Custom message for Redis memory RAM fragmentation monitor | string | `` | no |
| mem_frag_silenced | Groups to mute for Redis memory RAM fragmentation monitor | map | `<map>` | no |
| mem_frag_threshold_critical | memory RAM fragmentation limit (critical threshold) | string | `90` | no |
| mem_frag_threshold_warning | memory RAM fragmentation limit (warning threshold) | string | `70` | no |
| mem_frag_time_aggregator | Monitor aggregator for Redis memory RAM fragmentation [available values: min, max or avg] | string | `min` | no |
| mem_frag_timeframe | Monitor timeframe for Redis memory RAM fragmentation [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| mem_used_message | Custom message for Redis RAM memory used monitor | string | `` | no |
| mem_used_silenced | Groups to mute for Redis RAM memory used monitor | map | `<map>` | no |
| mem_used_threshold_critical | RAM memory used limit (critical threshold) | string | `90` | no |
| mem_used_threshold_warning | RAM memory used limit (warning threshold) | string | `70` | no |
| mem_used_time_aggregator | Monitor aggregator for Redis RAM memory used [available values: min, max or avg] | string | `min` | no |
| mem_used_timeframe | Monitor timeframe for Redis RAM memory used [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| message | Message sent when a Redis monitor is triggered | string | - | yes | | message | Message sent when a Redis monitor is triggered | string | - | yes |
| redis_silenced | Groups to mute for Redis monitors | map | `<map>` | no | | redis_silenced | Groups to mute for Redis monitors | map | `<map>` | no |
| rejected_con_message | Custom message for Redis rejected connections errors monitor | string | `` | no |
| rejected_con_silenced | Groups to mute for Redis rejected connections errors monitor | map | `<map>` | no |
| rejected_con_threshold_critical | rejected connections errors limit (critical threshold) | string | `90` | no |
| rejected_con_threshold_warning | rejected connections errors limit (warning threshold) | string | `70` | no |
| rejected_con_time_aggregator | Monitor aggregator for Redis rejected connections errors [available values: min, max or avg] | string | `min` | no |
| rejected_con_timeframe | Monitor timeframe for Redis rejected connections errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
## Outputs ## Outputs
@ -63,7 +98,12 @@ Creates DataDog monitors with the following checks:
| redis_blocked_clients_id | id for monitor redis_blocked_clients | | redis_blocked_clients_id | id for monitor redis_blocked_clients |
| redis_evictedkeys_id | id for monitor redis_evictedkeys | | redis_evictedkeys_id | id for monitor redis_evictedkeys |
| redis_expirations_id | id for monitor redis_expirations | | redis_expirations_id | id for monitor redis_expirations |
| redis_hitrate_id | id for monitor redis_hitrate |
| redis_keyspace_id | id for monitor redis_keyspace | | redis_keyspace_id | id for monitor redis_keyspace |
| redis_latency_id | id for monitor redis_latency |
| redis_mem_frag_id | id for monitor redis_mem_frag |
| redis_mem_used_id | id for monitor redis_mem_used |
| redis_rejected_con_id | id for monitor redis_rejected_con |
## Related documentation ## Related documentation

View File

@ -166,3 +166,173 @@ variable "keyspace_threshold_warning" {
description = "Keyspace changement (warning threshold)" description = "Keyspace changement (warning threshold)"
default = 70 default = 70
} }
# Per-group mute map handed to the `silenced` argument of the redis_mem_used
# monitor. Defaults to an empty map.
variable "mem_used_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Redis RAM memory used monitor"
}
# Optional notification text for the redis_mem_used monitor. The monitor passes
# it to coalesce() ahead of the module-wide `message` variable.
variable "mem_used_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Redis RAM memory used monitor"
}
# Aggregation function interpolated at the head of the redis_mem_used query.
variable "mem_used_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Redis RAM memory used [available values: min, max or avg]"
}
# Evaluation window interpolated into the redis_mem_used query.
variable "mem_used_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Redis RAM memory used [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Critical level for the redis_mem_used monitor: used both in the query
# comparison and as the monitor's `critical` threshold.
variable "mem_used_threshold_critical" {
  default     = 90
  description = "RAM memory used limit (critical threshold)"
}
# Warning level for the redis_mem_used monitor (the `warning` threshold).
variable "mem_used_threshold_warning" {
  default     = 70
  description = "RAM memory used limit (warning threshold)"
}
# Per-group mute map handed to the `silenced` argument of the redis_mem_frag
# monitor. Defaults to an empty map.
variable "mem_frag_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Redis memory RAM fragmentation monitor"
}
# Optional notification text for the redis_mem_frag monitor. The monitor passes
# it to coalesce() ahead of the module-wide `message` variable.
variable "mem_frag_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Redis memory RAM fragmentation monitor"
}
# Aggregation function interpolated at the head of the redis_mem_frag query.
variable "mem_frag_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Redis memory RAM fragmentation [available values: min, max or avg]"
}
# Evaluation window interpolated into the redis_mem_frag query.
variable "mem_frag_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Redis memory RAM fragmentation [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Thresholds for the redis_mem_frag monitor.
#
# The monitor compares redis.mem.fragmentation_ratio directly with these values.
# That metric is a ratio that sits near 1 on a healthy instance (see the Redis
# INFO documentation for mem_fragmentation_ratio), so the defaults are expressed
# on the same scale. The former 90 / 70 defaults could never be reached, which
# left the monitor permanently silent.
variable "mem_frag_threshold_critical" {
  description = "memory RAM fragmentation limit (critical threshold)"
  default     = 1.5
}

variable "mem_frag_threshold_warning" {
  description = "memory RAM fragmentation limit (warning threshold)"
  default     = 1.3
}
# Per-group mute map handed to the `silenced` argument of the
# redis_rejected_con monitor. Defaults to an empty map.
variable "rejected_con_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Redis rejected connections errors monitor"
}
# Optional notification text for the redis_rejected_con monitor. The monitor
# passes it to coalesce() ahead of the module-wide `message` variable.
variable "rejected_con_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Redis rejected connections errors monitor"
}
# Aggregation function interpolated inside pct_change(...) in the
# redis_rejected_con query.
variable "rejected_con_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Redis rejected connections errors [available values: min, max or avg]"
}
# Window interpolated twice into the redis_rejected_con query: once as the
# aggregation window and once as the second argument of pct_change(...).
variable "rejected_con_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Redis rejected connections errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Critical level for the redis_rejected_con monitor: used both in the query
# comparison and as the monitor's `critical` threshold.
variable "rejected_con_threshold_critical" {
  default     = 90
  description = "rejected connections errors limit (critical threshold)"
}
# Warning level for the redis_rejected_con monitor (the `warning` threshold).
variable "rejected_con_threshold_warning" {
  default     = 70
  description = "rejected connections errors limit (warning threshold)"
}
# Per-group mute map handed to the `silenced` argument of the redis_latency
# monitor. Defaults to an empty map.
variable "latency_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Redis latency monitor"
}
# Optional notification text for the redis_latency monitor. The monitor passes
# it to coalesce() ahead of the module-wide `message` variable.
variable "latency_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Redis latency monitor"
}
# Aggregation function interpolated at the head of the redis_latency query.
variable "latency_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Redis latency [available values: min, max or avg]"
}
# Evaluation window interpolated into the redis_latency query.
variable "latency_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Redis latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Critical level for the redis_latency monitor: used both in the query
# comparison and as the monitor's `critical` threshold.
variable "latency_threshold_critical" {
  default     = 90
  description = "latency limit (critical threshold)"
}
# Warning level for the redis_latency monitor (the `warning` threshold).
variable "latency_threshold_warning" {
  default     = 70
  description = "latency limit (warning threshold)"
}
# Per-group mute map handed to the `silenced` argument of the redis_hitrate
# monitor. Defaults to an empty map.
variable "hitrate_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Redis hitrate monitor"
}
# Optional notification text for the redis_hitrate monitor. The monitor passes
# it to coalesce() ahead of the module-wide `message` variable.
variable "hitrate_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Redis hitrate monitor"
}
# Aggregation function interpolated at the head of the redis_hitrate query.
variable "hitrate_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Redis hitrate [available values: min, max or avg]"
}
# Evaluation window interpolated into the redis_hitrate query.
variable "hitrate_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Redis hitrate [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Thresholds for the redis_hitrate monitor.
#
# That monitor fires when the hit rate drops BELOW the threshold (`<` in its
# query), so the warning level has to be the higher of the two values: the
# warning is crossed first as the hit rate degrades, the critical afterwards.
# The former defaults (critical 90, warning 70) were ordered for an "above"
# monitor, which is the wrong way round for this comparison.
variable "hitrate_threshold_critical" {
  description = "hitrate limit (critical threshold)"
  default     = 10
}

variable "hitrate_threshold_warning" {
  description = "hitrate limit (warning threshold)"
  default     = 30
}

View File

@ -133,3 +133,165 @@ EOL
tags = ["env:${var.environment}", "resource:redis"] tags = ["env:${var.environment}", "resource:redis"]
} }
# Alerts when a Redis instance's memory consumption approaches its configured
# limit, grouped by {name,host}.
#
# redis.mem.used is reported in bytes, so comparing it directly with the 0-100
# thresholds would trip as soon as a few dozen bytes were allocated. The query
# therefore divides it by redis.mem.maxmemory and scales the result to a
# percentage before the comparison.
# NOTE(review): instances without a maxmemory limit presumably report 0 for
# redis.mem.maxmemory and would yield no data here — confirm this is acceptable.
resource "datadog_monitor" "redis_mem_used" {
  name    = "[${var.environment}] Redis too many ram memory used {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  message = "${coalesce(var.mem_used_message, var.message)}"
  type    = "metric alert"

  # Heredoc body and terminator stay at column 0: `<<EOL` does not strip indentation.
  query = <<EOL
${var.mem_used_time_aggregator}(${var.mem_used_timeframe}): (
avg:redis.mem.used{${data.template_file.filter.rendered}} by {name,host}
/ max:redis.mem.maxmemory{${data.template_file.filter.rendered}} by {name,host}
) * 100 > ${var.mem_used_threshold_critical}
EOL

  thresholds {
    warning  = "${var.mem_used_threshold_warning}"
    critical = "${var.mem_used_threshold_critical}"
  }

  silenced = "${var.mem_used_silenced}"

  notify_no_data      = false
  evaluation_delay    = "${var.delay}"
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = false
  new_host_delay      = "${var.delay}"

  tags = ["env:${var.environment}", "resource:redis"]
}
# Watches redis.mem.fragmentation_ratio per {name,host} and alerts when the
# aggregated value rises above the configured thresholds.
# NOTE(review): the metric looks like a plain ratio rather than a percentage —
# confirm the mem_frag_threshold_* values are expressed on the same scale.
resource "datadog_monitor" "redis_mem_frag" {
  type    = "metric alert"
  name    = "[${var.environment}] Redis memory ram fragmented {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  message = "${coalesce(var.mem_frag_message, var.message)}"

  # Heredoc body and terminator stay at column 0: `<<EOL` does not strip indentation.
  query = <<EOL
${var.mem_frag_time_aggregator}(${var.mem_frag_timeframe}): (
avg:redis.mem.fragmentation_ratio{${data.template_file.filter.rendered}} by {name,host}
) > ${var.mem_frag_threshold_critical}
EOL

  thresholds {
    critical = "${var.mem_frag_threshold_critical}"
    warning  = "${var.mem_frag_threshold_warning}"
  }

  # Muting and tagging.
  silenced     = "${var.mem_frag_silenced}"
  tags         = ["env:${var.environment}", "resource:redis"]
  include_tags = true
  locked       = false

  # Evaluation timing.
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = false
  timeout_h           = 0

  # Notification behaviour.
  notify_no_data    = false
  notify_audit      = false
  renotify_interval = 0
}
# Alerts on the percentage change of redis.net.rejected per {name,host} over the
# configured timeframe.
# NOTE(review): a percentage change is taken against the earlier value; if the
# metric normally sits at 0 the result may be undefined and the monitor may not
# fire on the first rejections — confirm `pct_change` (rather than `change`) is
# what is wanted here.
resource "datadog_monitor" "redis_rejected_con" {
  type    = "metric alert"
  name    = "[${var.environment}] Redis too many rejected connections {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  message = "${coalesce(var.rejected_con_message, var.message)}"

  # Heredoc body and terminator stay at column 0: `<<EOL` does not strip indentation.
  query = <<EOL
pct_change(${var.rejected_con_time_aggregator}(${var.rejected_con_timeframe}),${var.rejected_con_timeframe}): (
avg:redis.net.rejected{${data.template_file.filter.rendered}} by {name,host}
) > ${var.rejected_con_threshold_critical}
EOL

  thresholds {
    critical = "${var.rejected_con_threshold_critical}"
    warning  = "${var.rejected_con_threshold_warning}"
  }

  # Muting and tagging.
  silenced     = "${var.rejected_con_silenced}"
  tags         = ["env:${var.environment}", "resource:redis"]
  include_tags = true
  locked       = false

  # Evaluation timing.
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = false
  timeout_h           = 0

  # Notification behaviour.
  notify_no_data    = false
  notify_audit      = false
  renotify_interval = 0
}
# Alerts when redis.info.latency_ms, aggregated per {name,host}, rises above the
# configured latency thresholds.
resource "datadog_monitor" "redis_latency" {
  type    = "metric alert"
  name    = "[${var.environment}] Redis latency is too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  message = "${coalesce(var.latency_message, var.message)}"

  # Heredoc body and terminator stay at column 0: `<<EOL` does not strip indentation.
  query = <<EOL
${var.latency_time_aggregator}(${var.latency_timeframe}): (
avg:redis.info.latency_ms{${data.template_file.filter.rendered}} by {name,host}
) > ${var.latency_threshold_critical}
EOL

  thresholds {
    critical = "${var.latency_threshold_critical}"
    warning  = "${var.latency_threshold_warning}"
  }

  # Muting and tagging.
  silenced     = "${var.latency_silenced}"
  tags         = ["env:${var.environment}", "resource:redis"]
  include_tags = true
  locked       = false

  # Evaluation timing.
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = false
  timeout_h           = 0

  # Notification behaviour.
  notify_no_data    = false
  notify_audit      = false
  renotify_interval = 0
}
# Alerts when the Redis cache hit rate, per {name,host}, falls below the
# configured thresholds.
#
# Hit rate is computed as hits / (hits + misses). That expression yields a value
# between 0 and 1, so it is multiplied by 100 to be comparable with the
# percentage-style thresholds; without the scaling the `<` comparison against a
# threshold such as 90 is always true and the monitor alerts permanently.
#
# Because the comparison is `<`, hitrate_threshold_warning must be set higher
# than hitrate_threshold_critical.
resource "datadog_monitor" "redis_hitrate" {
  name    = "[${var.environment}] Redis hitrate is too low {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  message = "${coalesce(var.hitrate_message, var.message)}"
  type    = "metric alert"

  # Heredoc body and terminator stay at column 0: `<<EOL` does not strip indentation.
  query = <<EOL
${var.hitrate_time_aggregator}(${var.hitrate_timeframe}): (
avg:redis.stats.keyspace_hits{${data.template_file.filter.rendered}} by {name,host}
/ (avg:redis.stats.keyspace_hits{${data.template_file.filter.rendered}} by {name,host}
+ avg:redis.stats.keyspace_misses{${data.template_file.filter.rendered}} by {name,host})
) * 100 < ${var.hitrate_threshold_critical}
EOL

  thresholds {
    warning  = "${var.hitrate_threshold_warning}"
    critical = "${var.hitrate_threshold_critical}"
  }

  silenced = "${var.hitrate_silenced}"

  notify_no_data      = false
  evaluation_delay    = "${var.delay}"
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = false
  new_host_delay      = "${var.delay}"

  tags = ["env:${var.environment}", "resource:redis"]
}

View File

@ -17,3 +17,28 @@ output "redis_keyspace_id" {
description = "id for monitor redis_keyspace" description = "id for monitor redis_keyspace"
value = "${datadog_monitor.redis_keyspace.id}" value = "${datadog_monitor.redis_keyspace.id}"
} }
# Exposes the id attribute of the redis_mem_used monitor resource.
output "redis_mem_used_id" {
  value       = "${datadog_monitor.redis_mem_used.id}"
  description = "id for monitor redis_mem_used"
}
# Exposes the id attribute of the redis_mem_frag monitor resource.
output "redis_mem_frag_id" {
  value       = "${datadog_monitor.redis_mem_frag.id}"
  description = "id for monitor redis_mem_frag"
}
# Exposes the id attribute of the redis_rejected_con monitor resource.
output "redis_rejected_con_id" {
  value       = "${datadog_monitor.redis_rejected_con.id}"
  description = "id for monitor redis_rejected_con"
}
# Exposes the id attribute of the redis_latency monitor resource.
output "redis_latency_id" {
  value       = "${datadog_monitor.redis_latency.id}"
  description = "id for monitor redis_latency"
}
# Exposes the id attribute of the redis_hitrate monitor resource.
output "redis_hitrate_id" {
  value       = "${datadog_monitor.redis_hitrate.id}"
  description = "id for monitor redis_hitrate"
}