MON-199 Advanced monitors for Mongo

This commit is contained in:
Christophe GENINET 2018-05-16 14:57:25 +02:00
parent fee7deb0d3
commit 38177e3670
3 changed files with 217 additions and 67 deletions

View File

@ -1,58 +1,21 @@
AWS MongoDB Service DataDog monitors
==========================================
Link to integration documentation :
[https://docs.datadoghq.com/integrations/mongo/](https://docs.datadoghq.com/integrations/mongo/)
**Prepare your ReplicaSet** :
Add a user to your ReplicaSet (on the primary instance)
```
use admin
db.auth("admin", "admin-password") ## This is optional if you don't have any admin password
db.createUser({"user":"datadog", "pwd": "{{PASSWORD}}", "roles" : [ {role: 'read', db: 'admin' }, {role: 'clusterMonitor', db: 'admin'}, {role: 'read', db: 'local' }]})
```
**Configure your Datadog agent**
Add this file conf.d/mongo.yaml
```
init_config:
instances:
- server: mongodb://datadog:password@[MONGO_URI]
tags:
- mytag1
- mytag2
- server: mongodb://datadog:password@[MONGO_URI]
tags:
- mytag1
- mytag2
```
**Monitor ReplicaSet Health**
Name: [environment] Replica Set health for {{ replset_name }}
This monitor will check the health of your ReplicaSet
Metrics are :
1: The replicaSet is OK
0: The replicaSet is KO
This monitor will trigger an alert for each ReplicaSet.
How to use this module
----------------------
Add a user to MongoDB (on the primary instance) :
```
use admin
db.auth("admin", "admin-password") ## This is optional if you don't have any admin password
db.createUser({"user":"datadog", "pwd": "{{PASSWORD}}", "roles" : [ {role: 'read', db: 'admin' }, {role: 'clusterMonitor', db: 'admin'}, {role: 'read', db: 'local' }]})
```
Add a module in your Terraform project :
```
module "datadog-monitors-aws-mongodb" {
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//databases/mongodb?ref={revision}"
@ -63,22 +26,86 @@ module "datadog-monitors-aws-mongodb" {
}
```
Configure your Datadog agent for kubernetes with this config :
```
datadog:
confd:
mongo.yaml: |-
ad_identifiers:
- mongodb
init_config:
instances:
- server: mongodb://datadog:password@%%host%%/admin
tags:
- dd_monitoring:enabled
- dd_mongodb:enabled
- env:prod
```
Purpose
-------
Creates DataDog monitors with the following checks:
* Mongodb ReplicaSet status
* MongoDB Primary status
* MongoDB Secondaries status
* MongoDB replication lag
**Monitor MongoDB Primary**
Name: [environment] MongoDB Primary
This monitor will check the health of the Primary node
This monitor will trigger an alert if there's no primary or if the primary state is wrong.
**Monitor MongoDB Secondary**
Name: [environment] MongoDB Secondary
This monitor will check the health of the secondary nodes
This monitor will trigger an alert if a secondary is missing or if there's a wrong state
**Monitor MongoDB Replication lag**
Name: [environment] MongoDB Replication lag
This monitor will check the replication lag
This monitor will trigger an alert if the replication lag is too high
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| mongodb_replicaset_message | Custom message for Mongodb replicaset monitor | string | `` | no |
| mongodb_replicaset_silenced | Groups to mute for Mongodb replicaset monitor | map | `<map>` | no |
| mongodb_replicaset_timeframe | Monitor timeframe for Mongodb replicaset [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| mongodb_primary_message | Custom message sent when an alert is triggered on primary state (falls back to `message` when empty) | string | `""` | no |
| mongodb_secondary_message | Custom message sent when an alert is triggered on secondary state (falls back to `message` when empty) | string | `""` | no |
| mongodb_replication_message | Custom message sent when an alert is triggered on replication lag (falls back to `message` when empty) | string | `""` | no |
| mongodb_lag_warning | Replication lag in seconds to trigger a warn alert | string | 2 | no |
| mongodb_lag_critical | Replication lag in seconds to trigger a critical alert | string | 5 | no |
| mongodb_primary_silenced | Groups to mute for Mongodb primary state monitor | map | `<map>` | no |
| mongodb_secondary_silenced | Groups to mute for Mongodb secondary state monitor | map | `<map>` | no |
| mongodb_replication_silenced | Groups to mute for Mongodb replication lag monitor | map | `<map>` | no |
| mongodb_primary_aggregator | Monitor aggregator for Mongodb primary state [available values: min, max] | string | `max` | no |
| mongodb_secondary_aggregator | Monitor aggregator for Mongodb secondary state [available values: min, max] | string | `max` | no |
| mongodb_replication_aggregator | Monitor aggregator for Mongodb replication lag [available values: min, max, sum or avg] | string | `avg` | no |
| mongodb_primary_timeframe | Monitor timeframe for MongoDB primary state [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_1m` | no |
| mongodb_secondary_timeframe | Monitor timeframe for MongoDB secondary state [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_1m` | no |
| mongodb_replication_timeframe | Monitor timeframe for MongoDB replication lag [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_1m` | no |
Related documentation
---------------------
[https://docs.datadoghq.com/integrations/mongo/](https://docs.datadoghq.com/integrations/mongo/)

View File

@ -24,20 +24,84 @@ variable "filter_tags_custom" {
default = "*"
}
variable "mongodb_replicaset_silenced" {
description = "Groups to mute for Mongodb replicaset monitor"
# Evaluation window interpolated into the primary-state monitor query.
variable "mongodb_primary_timeframe" {
  type        = "string"
  default     = "last_1m"
  description = "Monitor timeframe for MongoDB wrong state for primary node [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Evaluation window interpolated into the secondary-state monitor query.
variable "mongodb_secondary_timeframe" {
  type        = "string"
  default     = "last_1m"
  description = "Monitor timeframe for MongoDB wrong state for secondaries nodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Evaluation window interpolated into the replication-lag monitor query.
variable "mongodb_replication_timeframe" {
  type        = "string"
  default     = "last_1m"
  description = "Monitor timeframe for MongoDB replication lag [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Warning threshold of the replication-lag monitor (used as thresholds.warning).
# The type is declared explicitly so this variable matches every other
# variable in the file.
variable "mongodb_lag_warning" {
  description = "Warn replication lag in s"
  type        = "string"
  default     = 2
}
# Critical threshold of the replication-lag monitor. It is interpolated both
# into the query comparison and into thresholds.critical.
# The type is declared explicitly so this variable matches every other
# variable in the file.
variable "mongodb_lag_critical" {
  description = "Critical replication lag in s"
  type        = "string"
  default     = 5
}
# Map handed to the `silenced` argument of the primary-state monitor.
variable "mongodb_primary_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Mongodb primary state monitor"
}
variable "mongodb_replicaset_message" {
description = "Custom message for Mongodb replicaset monitor"
# Map handed to the `silenced` argument of the secondary-state monitor.
variable "mongodb_secondary_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Mongodb secondary state monitor"
}
# Map handed to the `silenced` argument of the replication-lag monitor.
variable "mongodb_replication_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Mongodb replication lag monitor"
}
# Optional message override for the primary-state monitor. The monitor passes
# it to coalesce() ahead of var.message.
variable "mongodb_primary_message" {
  type        = "string"
  default     = ""
  description = "Custom message for MongoDB primary monitor"
}
variable "mongodb_replicaset_timeframe" {
description = "Monitor timeframe for Mongodb replicaset [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
# Optional message override for the secondary-state monitor. The monitor
# passes it to coalesce() ahead of var.message.
# NOTE(review): two `default` assignments are visible in this view; confirm
# which one the file actually keeps.
variable "mongodb_secondary_message" {
description = "Custom message for MongoDB secondary monitor"
type = "string"
default = "last_5m"
default = ""
}
# Optional message override for the replication-lag monitor. The monitor
# passes it to coalesce() ahead of var.message.
variable "mongodb_replication_message" {
  type        = "string"
  default     = ""
  description = "Custom message for MongoDB replication monitor"
}
# Time aggregator placed at the head of the primary-state monitor query.
variable "mongodb_primary_aggregator" {
  type        = "string"
  default     = "max"
  description = "Monitor aggregator for Mongodb primary state [available values: min, max]"
}
# Time aggregator placed at the head of the secondary-state monitor query.
variable "mongodb_secondary_aggregator" {
  type        = "string"
  default     = "max"
  description = "Monitor aggregator for Mongodb secondary state [available values: min, max]"
}
# Time aggregator placed at the head of the replication-lag monitor query.
variable "mongodb_replication_aggregator" {
  type        = "string"
  default     = "avg"
  description = "Monitor aggregator for Mongodb replication lag [available values: min, max, sum or avg]"
}

View File

@ -2,18 +2,17 @@ data "template_file" "filter" {
template = "$${filter}"
vars {
filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_monitoring_mongodb:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}"
filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_mongodb:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}"
}
}
resource "datadog_monitor" "mongodb_replicaset_state" {
name = "[${var.environment}] Member down in the replica set"
message = "${coalesce(var.mongodb_replicaset_message, var.message)}"
resource "datadog_monitor" "mongodb_primary" {
name = "[${var.environment}] MongoDB primary state"
message = "${coalesce(var.mongodb_primary_message, var.message)}"
query = <<EOF
avg(${var.mongodb_replicaset_timeframe}): (
avg:mongodb.replset.health{${data.template_file.filter.rendered}} by {region,replset_name}
) < 1
${var.mongodb_primary_aggregator}(${var.mongodb_primary_timeframe}):
min:mongodb.replset.state{${data.template_file.filter.rendered}} by {replset_name} >= 2
EOF
type = "metric alert"
@ -27,7 +26,67 @@ resource "datadog_monitor" "mongodb_replicaset_state" {
include_tags = true
require_full_window = true
silenced = "${var.mongodb_replicaset_silenced}"
silenced = "${var.mongodb_primary_silenced}"
tags = ["env:${var.environment}", "resource:mongodb"]
}
# Metric alert on mongodb.replset.state for series tagged replset_state:secondary,
# grouped by server. Warns at 3, goes critical at 6, and also notifies when
# the metric stops reporting.
# NOTE(review): the filter keeps only series tagged replset_state:secondary while
# the alert condition is state >= 6. Confirm the tag stays attached once a member
# leaves the SECONDARY state, otherwise only the no-data path can fire.
resource "datadog_monitor" "mongodb_secondary" {
  type    = "metric alert"
  name    = "[${var.environment}] MongoDB secondary state"
  message = "${coalesce(var.mongodb_secondary_message, var.message)}"
  tags    = ["env:${var.environment}", "resource:mongodb"]

  # Heredoc body is kept flush-left so the rendered query text is unchanged.
  query = <<EOF
${var.mongodb_secondary_aggregator}(${var.mongodb_secondary_timeframe}):
max:mongodb.replset.state{${data.template_file.filter.rendered},replset_state:secondary} by {server} >= 6
EOF

  thresholds {
    warning  = 3
    critical = 6
  }

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour
  notify_no_data    = true
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true

  silenced = "${var.mongodb_secondary_silenced}"
}
# Metric alert on mongodb.replset.replicationlag for series tagged
# replset_state:secondary, grouped by server. The comparison value in the query
# and thresholds.critical both come from var.mongodb_lag_critical.
resource "datadog_monitor" "mongodb_replication" {
  type    = "metric alert"
  name    = "[${var.environment}] MongoDB replication lag"
  message = "${coalesce(var.mongodb_replication_message, var.message)}"
  tags    = ["env:${var.environment}", "resource:mongodb"]

  # Heredoc body is kept flush-left so the rendered query text is unchanged.
  query = <<EOF
${var.mongodb_replication_aggregator}(${var.mongodb_replication_timeframe}):
avg:mongodb.replset.replicationlag{${data.template_file.filter.rendered},replset_state:secondary} by {server} > ${var.mongodb_lag_critical}
EOF

  thresholds {
    warning  = "${var.mongodb_lag_warning}"
    critical = "${var.mongodb_lag_critical}"
  }

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour: missing data does not notify for this monitor.
  notify_no_data    = false
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true

  silenced = "${var.mongodb_replication_silenced}"
}