MON-199 Change MongoDB secondary monitoring

This commit is contained in:
Laurent Piroelle 2018-06-26 15:21:33 +02:00
parent 38177e3670
commit ac8585e441
2 changed files with 68 additions and 6 deletions

View File

@ -24,6 +24,11 @@ variable "filter_tags_custom" {
default = "*"
}
# Expected number of MongoDB servers in the cluster. Referenced by the
# secondary monitor query and by the server-count monitor warning threshold.
variable "mongodb_desired_servers_count" {
  description = "Number of servers that should be instantiated for this cluster"
  default     = 3
}
variable "mongodb_primary_timeframe" {
description = "Monitor timeframe for MongoDB wrong state for primary node [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -33,7 +38,13 @@ variable "mongodb_primary_timeframe" {
# Evaluation window used as the timeframe of the MongoDB secondary monitor query.
# NOTE(review): this diff view shows both the removed default ("last_1m") and the
# added one ("last_5m"); presumably only "last_5m" remains in the file — confirm.
variable "mongodb_secondary_timeframe" {
description = "Monitor timeframe for MongoDB wrong state for secondaries nodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_1m"
default = "last_5m"
}
# Evaluation window used as the timeframe of the MongoDB server-count monitor query.
variable "mongodb_server_count_timeframe" {
  description = "Monitor timeframe for MongoDB server count [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = "string"
  default     = "last_15m"
}
variable "mongodb_replication_timeframe" {
@ -64,6 +75,12 @@ variable "mongodb_secondary_silenced" {
default = {}
}
# Silencing map passed to the `silenced` argument of the server-count monitor.
variable "mongodb_server_count_silenced" {
  description = "Groups to mute for Mongodb server count monitor"
  type        = "map"
  default     = {}
}
variable "mongodb_replication_silenced" {
description = "Groups to mute for Mongodb replication lag monitor"
type = "map"
@ -82,6 +99,12 @@ variable "mongodb_secondary_message" {
default = ""
}
# Optional message override for the server-count monitor; when left empty the
# monitor falls back to the shared `var.message` (via coalesce).
variable "mongodb_server_count_message" {
  description = "Custom message for MongoDB server count monitor"
  type        = "string"
  default     = ""
}
variable "mongodb_replication_message" {
description = "Custom message for MongoDB replication monitor"
type = "string"
@ -100,6 +123,12 @@ variable "mongodb_secondary_aggregator" {
default = "max"
}
# Time aggregator placed in front of the server-count monitor query.
variable "mongodb_server_count_aggregator" {
  description = "Monitor aggregator for Mongodb server count [available values: min, max]"
  type        = "string"
  default     = "max"
}
variable "mongodb_replication_aggregator" {
description = "Monitor aggregator for Mongodb replication lag [available values: min, max, sum or avg]"
type = "string"

View File

@ -32,22 +32,55 @@ resource "datadog_monitor" "mongodb_primary" {
}
# Datadog metric-alert monitor for MongoDB secondaries, tagged with the
# environment and `resource:mongodb`.
# The message uses var.mongodb_secondary_message and falls back to var.message
# through coalesce() when the custom message is empty.
# NOTE(review): this diff view interleaves removed and added lines (two `name`
# attributes, two `notify_no_data` values, two pairs of thresholds, and the old
# `replset.state` query line next to the new one). Presumably the current query
# is `desired_servers_count - sum:mongodb.replset.health by {replset_name} > 1`
# with critical = 1 / warning = 0 — confirm against the actual file.
resource "datadog_monitor" "mongodb_secondary" {
name = "[${var.environment}] MongoDB secondary state"
name = "[${var.environment}] MongoDB secondary missing"
message = "${coalesce(var.mongodb_secondary_message, var.message)}"
query = <<EOF
${var.mongodb_secondary_aggregator}(${var.mongodb_secondary_timeframe}):
max:mongodb.replset.state{${data.template_file.filter.rendered},replset_state:secondary} by {server} >= 6
${var.mongodb_desired_servers_count} -
sum:mongodb.replset.health{${data.template_file.filter.rendered}} by {replset_name}
> 1
EOF
thresholds {
critical = 6
warning = 3
critical = 1
warning = 0
}
type = "metric alert"
notify_no_data = true
notify_no_data = false
renotify_interval = 0
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
notify_audit = false
timeout_h = 0
include_tags = true
require_full_window = true
silenced = "${var.mongodb_secondary_silenced}"
tags = ["env:${var.environment}", "resource:mongodb"]
}
resource "datadog_monitor" "mongodb_server_count" {
name = "[${var.environment}] MongoDB too much servers or wrong monitoring config"
message = "${coalesce(var.mongodb_server_count_message, var.message)}"
query = <<EOF
${var.mongodb_server_count_aggregator}(${var.mongodb_server_count_timeframe}):
sum:mongodb.replset.health{${data.template_file.filter.rendered}} by {replset_name}
> 99
EOF
thresholds {
critical = 99
warning = "${var.mongodb_desired_servers_count}"
}
type = "metric alert"
notify_no_data = false
renotify_interval = 0
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"