MON-199 Advanced monitors for Mongo
This commit is contained in:
parent
453c31fc55
commit
70cd12309b
@ -1,24 +1,86 @@
|
||||
# DATABASES MONGODB DataDog monitors
|
||||
AWS MongoDB Service DataDog monitors
|
||||
==========================================
|
||||
|
||||
## How to use this module
|
||||
|
||||
|
||||
How to use this module
|
||||
----------------------
|
||||
|
||||
Add a user to MongoDB (on the primary instance) :
|
||||
|
||||
```
|
||||
module "datadog-monitors-databases-mongodb" {
|
||||
use admin
|
||||
db.auth("admin", "admin-password") ## This is optional if you don't have any admin password
|
||||
db.createUser({"user":"datadog", "pwd": "{{PASSWORD}}", "roles" : [ {role: 'read', db: 'admin' }, {role: 'clusterMonitor', db: 'admin'}, {role: 'read', db: 'local' }]})
|
||||
```
|
||||
|
||||
Add a module in your Terraform project :
|
||||
|
||||
```
|
||||
module "datadog-monitors-aws-mongodb" {
|
||||
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//databases/mongodb?ref={revision}"
|
||||
|
||||
environment = "${var.environment}"
|
||||
message = "${module.datadog-message-alerting.alerting-message}"
|
||||
}
|
||||
environment = "${var.environment}"
|
||||
|
||||
}
|
||||
```
|
||||
|
||||
## Purpose
|
||||
Configure your Datadog agent for Kubernetes with this config:
|
||||
|
||||
Creates DataDog monitors with the following checks:
|
||||
```
|
||||
datadog:
|
||||
confd:
|
||||
mongo.yaml: |-
|
||||
ad_identifiers:
|
||||
- mongodb
|
||||
init_config:
|
||||
instances:
|
||||
- server: mongodb://datadog:password@%%host%%/admin
|
||||
tags:
|
||||
- dd_monitoring:enabled
|
||||
- dd_mongodb:enabled
|
||||
- env:prod
|
||||
```
|
||||
|
||||
- Member down in the replica set
|
||||
|
||||
## Inputs
|
||||
Purpose
|
||||
-------
|
||||
|
||||
Creates DataDog monitors with the following checks:
|
||||
* MongoDB Primary status
|
||||
* MongoDB Secondaries status
|
||||
* MongoDB replication lag
|
||||
|
||||
**Monitor MongoDB Primary**
|
||||
|
||||
Name: [environment] MongoDB Primary
|
||||
|
||||
This monitor will check the health of the Primary node
|
||||
|
||||
This monitor will trigger an alert if there's no primary or if the primary state is wrong.
|
||||
|
||||
|
||||
**Monitor MongoDB Secondary**
|
||||
|
||||
Name: [environment] MongoDB Secondary
|
||||
|
||||
This monitor will check the health of the secondary nodes
|
||||
|
||||
This monitor will trigger an alert if a secondary is missing or if there's a wrong state
|
||||
|
||||
|
||||
**Monitor MongoDB Replication lag**
|
||||
|
||||
Name: [environment] MongoDB Replication lag
|
||||
|
||||
This monitor will check the replication lag
|
||||
|
||||
This monitor will trigger an alert if the replication lag is too high
|
||||
|
||||
|
||||
Inputs
|
||||
------
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
@ -27,63 +89,30 @@ Creates DataDog monitors with the following checks:
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
| mongodb_replicaset_message | Custom message for Mongodb replicaset monitor | string | `` | no |
|
||||
| mongodb_replicaset_silenced | Groups to mute for Mongodb replicaset monitor | map | `<map>` | no |
|
||||
| mongodb_replicaset_time_aggregator | Monitor aggregator for Mongodb replicaset [available values: min, max or avg] | string | `max` | no |
|
||||
| mongodb_replicaset_timeframe | Monitor timeframe for Mongodb replicaset [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
|------|-------------|
|
||||
| mongodb_replicaset_state_id | id for monitor mongodb_replicaset_state |
|
||||
|
||||
## Related documentation
|
||||
|
||||
DataDog documentation: [https://docs.datadoghq.com/integrations/mongo/](https://docs.datadoghq.com/integrations/mongo/)
|
||||
|
||||
## Custom settings
|
||||
|
||||
### Prepare your ReplicaSet
|
||||
|
||||
Add a user to your ReplicaSet (on the primary instance)
|
||||
| mongodb_desired_servers_count | Number of servers that should be instantiated for this cluster | string | `3` | no |
|
||||
| mongodb_lag_critical | Critical replication lag in s | string | `5` | no |
|
||||
| mongodb_lag_warning | Warn replication lag in s | string | `2` | no |
|
||||
| mongodb_primary_aggregator | Monitor aggregator for MongoDB primary state [available values: min, max] | string | `max` | no |
|
||||
| mongodb_primary_message | Custom message for MongoDB primary monitor | string | `` | no |
|
||||
| mongodb_primary_silenced | Groups to mute for MongoDB primary state monitor | map | `<map>` | no |
|
||||
| mongodb_primary_timeframe | Monitor timeframe for MongoDB wrong state for primary node [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_1m` | no |
|
||||
| mongodb_replication_aggregator | Monitor aggregator for MongoDB replication lag [available values: min, max, sum or avg] | string | `avg` | no |
|
||||
| mongodb_replication_message | Custom message for MongoDB replication monitor | string | `` | no |
|
||||
| mongodb_replication_silenced | Groups to mute for MongoDB replication lag monitor | map | `<map>` | no |
|
||||
| mongodb_replication_timeframe | Monitor timeframe for MongoDB replication lag [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_1m` | no |
|
||||
| mongodb_secondary_aggregator | Monitor aggregator for MongoDB secondary state [available values: min, max] | string | `max` | no |
|
||||
| mongodb_secondary_message | Custom message for MongoDB secondary monitor | string | `` | no |
|
||||
| mongodb_secondary_silenced | Groups to mute for MongoDB secondary state monitor | map | `<map>` | no |
|
||||
| mongodb_secondary_timeframe | Monitor timeframe for MongoDB wrong state for secondaries nodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||
| mongodb_server_count_aggregator | Monitor aggregator for MongoDB server count [available values: min, max] | string | `min` | no |
|
||||
| mongodb_server_count_message | Custom message for MongoDB server count | string | `` | no |
|
||||
| mongodb_server_count_silenced | Groups to mute for MongoDB server count monitor | map | `<map>` | no |
|
||||
| mongodb_server_count_timeframe | Monitor timeframe for MongoDB wrong server count [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
||||
|
||||
|
||||
```
|
||||
use admin
|
||||
db.auth("admin", "admin-password") ## This is optional is you don't have any admin password
|
||||
db.createUser({"user":"datadog", "pwd": "{{PASSWORD}}", "roles" : [ {role: 'read', db: 'admin' }, {role: 'clusterMonitor', db: 'admin'}, {role: 'read', db: 'local' }]})
|
||||
```
|
||||
Related documentation
|
||||
---------------------
|
||||
|
||||
### Configure your Datadog agent
|
||||
[https://docs.datadoghq.com/integrations/mongo/](https://docs.datadoghq.com/integrations/mongo/)
|
||||
|
||||
Add this file conf.d/mongo.yaml
|
||||
|
||||
```
|
||||
|
||||
init_config:
|
||||
|
||||
instances:
|
||||
- server: mongodb://datadog:password@[MONGO_URI]
|
||||
tags:
|
||||
- mytag1
|
||||
- mytag2
|
||||
- server: mongodb://datadog:password@[MONGO_URI]
|
||||
tags:
|
||||
- mytag1
|
||||
- mytag2
|
||||
```
|
||||
|
||||
### Monitor ReplicaSet Health
|
||||
|
||||
Name: [environment] Replica Set heath for {{ replset_name }}
|
||||
|
||||
This monitor will check the health of your ReplicaSet
|
||||
|
||||
Metrics are :
|
||||
|
||||
1: The replicaSet is OK
|
||||
0: The replicaSet is KO
|
||||
|
||||
This monitor will trigger an alert for each ReplicaSet.
|
||||
|
||||
|
||||
@ -24,26 +24,84 @@ variable "filter_tags_custom" {
|
||||
default = "*"
|
||||
}
|
||||
|
||||
variable "mongodb_replicaset_silenced" {
|
||||
description = "Groups to mute for Mongodb replicaset monitor"
|
||||
variable "mongodb_primary_timeframe" {
|
||||
description = "Monitor timeframe for MongoDB wrong state for primary node [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
type = "string"
|
||||
default = "last_1m"
|
||||
}
|
||||
|
||||
variable "mongodb_secondary_timeframe" {
|
||||
description = "Monitor timeframe for MongoDB wrong state for secondaries nodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
type = "string"
|
||||
default = "last_1m"
|
||||
}
|
||||
|
||||
variable "mongodb_replication_timeframe" {
|
||||
description = "Monitor timeframe for MongoDB replication lag [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
type = "string"
|
||||
default = "last_1m"
|
||||
}
|
||||
|
||||
variable "mongodb_lag_warning" {
|
||||
description = "Warn replication lag in s"
|
||||
default = 2
|
||||
}
|
||||
|
||||
variable "mongodb_lag_critical" {
|
||||
description = "Critical replication lag in s"
|
||||
default = 5
|
||||
}
|
||||
|
||||
variable "mongodb_primary_silenced" {
|
||||
description = "Groups to mute for Mongodb primary state monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "mongodb_replicaset_message" {
|
||||
description = "Custom message for Mongodb replicaset monitor"
|
||||
variable "mongodb_secondary_silenced" {
|
||||
description = "Groups to mute for Mongodb secondary state monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "mongodb_replication_silenced" {
|
||||
description = "Groups to mute for Mongodb replication lag monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "mongodb_primary_message" {
|
||||
description = "Custom message for MongoDB primary monitor"
|
||||
type = "string"
|
||||
default = ""
|
||||
}
|
||||
|
||||
variable "mongodb_replicaset_time_aggregator" {
|
||||
description = "Monitor aggregator for Mongodb replicaset [available values: min, max or avg]"
|
||||
variable "mongodb_secondary_message" {
|
||||
description = "Custom message for MongoDB secondary monitor"
|
||||
type = "string"
|
||||
default = ""
|
||||
}
|
||||
|
||||
variable "mongodb_replication_message" {
|
||||
description = "Custom message for MongoDB replication monitor"
|
||||
type = "string"
|
||||
default = ""
|
||||
}
|
||||
|
||||
variable "mongodb_primary_aggregator" {
|
||||
description = "Monitor aggregator for Mongodb primary state [available values: min, max]"
|
||||
type = "string"
|
||||
default = "max"
|
||||
}
|
||||
|
||||
variable "mongodb_replicaset_timeframe" {
|
||||
description = "Monitor timeframe for Mongodb replicaset [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||
variable "mongodb_secondary_aggregator" {
|
||||
description = "Monitor aggregator for Mongodb secondary state [available values: min, max]"
|
||||
type = "string"
|
||||
default = "last_5m"
|
||||
default = "max"
|
||||
}
|
||||
|
||||
variable "mongodb_replication_aggregator" {
|
||||
description = "Monitor aggregator for Mongodb replication lag [available values: min, max, sum or avg]"
|
||||
type = "string"
|
||||
default = "avg"
|
||||
}
|
||||
|
||||
@ -2,18 +2,17 @@ data "template_file" "filter" {
|
||||
template = "$${filter}"
|
||||
|
||||
vars {
|
||||
filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_monitoring_mongodb:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}"
|
||||
filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_mongodb:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}"
|
||||
}
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "mongodb_replicaset_state" {
|
||||
name = "[${var.environment}] Member down in the replica set"
|
||||
message = "${coalesce(var.mongodb_replicaset_message, var.message)}"
|
||||
resource "datadog_monitor" "mongodb_primary" {
|
||||
name = "[${var.environment}] MongoDB primary state"
|
||||
message = "${coalesce(var.mongodb_primary_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
${var.mongodb_replicaset_time_aggregator}(${var.mongodb_replicaset_timeframe}): (
|
||||
avg:mongodb.replset.health{${data.template_file.filter.rendered}} by {region,replset_name}
|
||||
) < 1
|
||||
${var.mongodb_primary_aggregator}(${var.mongodb_primary_timeframe}):
|
||||
min:mongodb.replset.state{${data.template_file.filter.rendered}} by {replset_name} >= 2
|
||||
EOF
|
||||
|
||||
type = "metric alert"
|
||||
@ -27,7 +26,101 @@ resource "datadog_monitor" "mongodb_replicaset_state" {
|
||||
include_tags = true
|
||||
require_full_window = true
|
||||
|
||||
silenced = "${var.mongodb_replicaset_silenced}"
|
||||
silenced = "${var.mongodb_primary_silenced}"
|
||||
|
||||
tags = ["env:${var.environment}", "resource:mongodb"]
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "mongodb_secondary" {
|
||||
name = "[${var.environment}] MongoDB secondary missing"
|
||||
message = "${coalesce(var.mongodb_secondary_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
${var.mongodb_secondary_aggregator}(${var.mongodb_secondary_timeframe}):
|
||||
${var.mongodb_desired_servers_count} -
|
||||
sum:mongodb.replset.health{${data.template_file.filter.rendered}} by {replset_name}
|
||||
> 1
|
||||
EOF
|
||||
|
||||
thresholds {
|
||||
critical = 1
|
||||
warning = 0
|
||||
}
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
notify_no_data = false
|
||||
renotify_interval = 0
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
require_full_window = true
|
||||
|
||||
silenced = "${var.mongodb_secondary_silenced}"
|
||||
|
||||
tags = ["env:${var.environment}", "resource:mongodb"]
|
||||
}
|
||||
|
||||
# Monitor: unexpected number of MongoDB servers in a replica set.
#
# Sums `mongodb.replset.health` per `replset_name` over the configured
# timeframe/aggregator and compares it with the expected member count:
#   - warning  : sum is above var.mongodb_desired_servers_count
#   - critical : sum is above 99
# NOTE(review): the 99 critical threshold looks like a deliberate
# "never reached" value so that only the warning fires - confirm.
resource "datadog_monitor" "mongodb_server_count" {
  name    = "[${var.environment}] MongoDB too much servers or wrong monitoring config"
  message = "${coalesce(var.mongodb_server_count_message, var.message)}"

  # The literal in the query must stay equal to thresholds.critical below.
  query = <<EOF
${var.mongodb_server_count_aggregator}(${var.mongodb_server_count_timeframe}):
sum:mongodb.replset.health{${data.template_file.filter.rendered}} by {replset_name}
> 99
EOF

  thresholds {
    critical = 99
    warning  = "${var.mongodb_desired_servers_count}"
  }

  type = "metric alert"

  notify_no_data      = false
  renotify_interval   = 0
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = true

  # Use this monitor's own mute map. The block previously read
  # var.mongodb_secondary_silenced, so muting the secondary monitor also muted
  # this one and the documented mongodb_server_count_silenced input had no effect.
  silenced = "${var.mongodb_server_count_silenced}"

  tags = ["env:${var.environment}", "resource:mongodb"]
}
|
||||
|
||||
resource "datadog_monitor" "mongodb_replication" {
|
||||
name = "[${var.environment}] MongoDB replication lag"
|
||||
message = "${coalesce(var.mongodb_replication_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
${var.mongodb_replication_aggregator}(${var.mongodb_replication_timeframe}):
|
||||
avg:mongodb.replset.replicationlag{${data.template_file.filter.rendered},replset_state:secondary} by {server} > ${var.mongodb_lag_critical}
|
||||
EOF
|
||||
|
||||
thresholds {
|
||||
critical = "${var.mongodb_lag_critical}"
|
||||
warning = "${var.mongodb_lag_warning}"
|
||||
}
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
notify_no_data = false
|
||||
renotify_interval = 0
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
require_full_window = true
|
||||
|
||||
silenced = "${var.mongodb_replication_silenced}"
|
||||
|
||||
tags = ["env:${var.environment}", "resource:mongodb"]
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user