MON-547 add MySQL replication monitors

This commit is contained in:
Matthieu Bourgain 2019-12-24 14:00:04 +01:00 committed by Quentin Manfroi
parent dda61c2353
commit 0e71ff2506
4 changed files with 161 additions and 0 deletions

View File

@ -25,6 +25,8 @@ Creates DataDog monitors with the following checks:
- Mysql server does not respond - Mysql server does not respond
- Mysql Slow queries - Mysql Slow queries
- Mysql threads changed abnormally - Mysql threads changed abnormally
- Mysql replication lag
- Mysql replication status
## Inputs ## Inputs
@ -102,6 +104,18 @@ Creates DataDog monitors with the following checks:
| mysql\_threads\_threshold\_critical | Maximum critical acceptable number of threads | string | `"1"` | no | | mysql\_threads\_threshold\_critical | Maximum critical acceptable number of threads | string | `"1"` | no |
| mysql\_threads\_time\_aggregator | Monitor time aggregator for MySQL threads monitor [available values: min, max or avg] | string | `"avg"` | no | | mysql\_threads\_time\_aggregator | Monitor time aggregator for MySQL threads monitor [available values: min, max or avg] | string | `"avg"` | no |
| mysql\_threads\_timeframe | Monitor timeframe for MySQL threads monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_4h"` | no | | mysql\_threads\_timeframe | Monitor timeframe for MySQL threads monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_4h"` | no |
| mysql\_replication\_lag\_enabled | Flag to enable mysql replication lag monitor | string | `"false"` | no |
| mysql\_replication\_lag\_extra\_tags | Extra tags for MySQL replication lag monitor | list(string) | `[]` | no |
| mysql\_replication\_lag\_message | Custom message for MySQL replication lag monitor | string | `""` | no |
| mysql\_replication\_lag\_threshold\_warning | Maximum warning acceptable seconds of replication lag | string | `"100"` | no |
| mysql\_replication\_lag\_threshold\_critical | Maximum critical acceptable seconds of replication lag | string | `"200"` | no |
| mysql\_replication\_lag\_time\_aggregator | Monitor time aggregator for MySQL replication lag monitor [available values: min, max or avg] | string | `"min"` | no |
| mysql\_replication\_lag\_timeframe | Monitor timeframe for MySQL replication lag monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| mysql\_replication\_status\_enabled | Flag to enable mysql replication status monitor | string | `"false"` | no |
| mysql\_replication\_status\_extra\_tags | Extra tags for MySQL replication status monitor | list(string) | `[]` | no |
| mysql\_replication\_status\_message | Custom message for MySQL replication status monitor | string | `""` | no |
| mysql\_replication\_status\_time\_aggregator | Monitor time aggregator for MySQL replication status monitor [available values: min, max or avg] | string | `"min"` | no |
| mysql\_replication\_status\_timeframe | Monitor timeframe for MySQL replication status monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| new\_host\_delay | Delay in seconds for the metric evaluation | string | `"300"` | no | | new\_host\_delay | Delay in seconds for the metric evaluation | string | `"300"` | no |
| notify\_no\_data | Will raise no data alert if set to true | string | `"true"` | no | | notify\_no\_data | Will raise no data alert if set to true | string | `"true"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
@ -118,6 +132,8 @@ Creates DataDog monitors with the following checks:
| mysql\_questions\_anomaly\_id | id for monitor mysql_questions_anomaly | | mysql\_questions\_anomaly\_id | id for monitor mysql_questions_anomaly |
| mysql\_slow\_id | id for monitor mysql_slow | | mysql\_slow\_id | id for monitor mysql_slow |
| mysql\_threads\_anomaly\_id | id for monitor mysql_threads_anomaly | | mysql\_threads\_anomaly\_id | id for monitor mysql_threads_anomaly |
| mysql\_replication\_lag\_id | id for monitor mysql_replication_lag |
| mysql\_replication\_status\_id | id for monitor mysql_replication_status |
## Related documentation ## Related documentation

View File

@ -460,3 +460,80 @@ variable "mysql_questions_timeframe" {
default = "last_4h" default = "last_4h"
} }
#################################
### MySQL replication lag ###
#################################
# Opt-in switch for the replication lag monitor. The monitor resource is only
# created when this value is exactly the string "true".
variable "mysql_replication_lag_enabled" {
  type        = string
  default     = "false"
  description = "Flag to enable mysql replication lag monitor"
}
# Additional tags appended to the fixed tag list of the replication lag monitor.
variable "mysql_replication_lag_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for MySQL replication lag monitor"
}
# Per-monitor notification message. The monitor passes this through coalesce()
# ahead of var.message, so an empty string falls back to the module-wide message.
variable "mysql_replication_lag_message" {
  type        = string
  default     = ""
  description = "Custom message for MySQL replication lag monitor"
}
# Warning threshold for the replication lag monitor, in seconds.
# Deliberately left without a type constraint, as in the original declaration.
variable "mysql_replication_lag_threshold_warning" {
  description = "Maximum warning acceptable seconds of replication lag"
  default     = 100
}
# Critical threshold for the replication lag monitor, in seconds. This value is
# also the comparison operand interpolated into the monitor query.
# Deliberately left without a type constraint, as in the original declaration.
variable "mysql_replication_lag_threshold_critical" {
  description = "Maximum critical acceptable seconds of replication lag"
  default     = 200
}
# Time aggregation function placed at the start of the replication lag query.
variable "mysql_replication_lag_time_aggregator" {
  type        = string
  default     = "min"
  description = "Monitor time aggregator for MySQL replication lag monitor [available values: min, max or avg]"
}
# Evaluation window interpolated into the replication lag query.
variable "mysql_replication_lag_timeframe" {
  type        = string
  default     = "last_15m"
  description = "Monitor timeframe for MySQL replication lag monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
###################################
### MySQL replication status ###
###################################
# Opt-in switch for the replication status monitor. The monitor resource is only
# created when this value is exactly the string "true".
variable "mysql_replication_status_enabled" {
  type        = string
  default     = "false"
  description = "Flag to enable mysql replication status monitor"
}
# Additional tags appended to the fixed tag list of the replication status monitor.
variable "mysql_replication_status_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for MySQL replication status monitor"
}
# Per-monitor notification message. The monitor passes this through coalesce()
# ahead of var.message, so an empty string falls back to the module-wide message.
variable "mysql_replication_status_message" {
  type        = string
  default     = ""
  description = "Custom message for MySQL replication status monitor"
}
# Time aggregation function placed at the start of the replication status query.
variable "mysql_replication_status_time_aggregator" {
  type        = string
  default     = "min"
  description = "Monitor time aggregator for MySQL replication status monitor [available values: min, max or avg]"
}
# Evaluation window interpolated into the replication status query.
variable "mysql_replication_status_timeframe" {
  type        = string
  default     = "last_5m"
  description = "Monitor timeframe for MySQL replication status monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

View File

@ -288,3 +288,62 @@ EOQ
} }
} }
# Replication lag monitor.
# Created only when var.mysql_replication_lag_enabled is the string "true".
# Evaluates mysql.replication.seconds_behind_master, grouped by server, against
# the configured warning and critical thresholds.
resource "datadog_monitor" "mysql_replication_lag" {
  count = var.mysql_replication_lag_enabled == "true" ? 1 : 0

  # Identity and notification text. The name embeds Datadog template variables
  # so the triggered threshold and value show up in the alert title.
  type    = "query alert"
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Mysql replication lag {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
  message = coalesce(var.mysql_replication_lag_message, var.message)

  # The comparison operand in the query is the critical threshold.
  query = <<EOQ
${var.mysql_replication_lag_time_aggregator}(${var.mysql_replication_lag_timeframe}):avg:mysql.replication.seconds_behind_master${module.filter-tags.query_alert} by {server} > ${var.mysql_replication_lag_threshold_critical}
EOQ

  thresholds = {
    critical = var.mysql_replication_lag_threshold_critical
    warning  = var.mysql_replication_lag_threshold_warning
  }

  # Evaluation behaviour.
  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  require_full_window = false

  # Notification behaviour: no-data alerts, renotification and auto-resolve
  # timeout are all switched off for this monitor.
  notify_no_data    = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true

  tags = concat(["env:${var.environment}", "type:database", "provider:mysql", "resource:mysql", "team:claranet", "created-by:terraform"], var.mysql_replication_lag_extra_tags)

  # Changes to the "silenced" attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [silenced]
  }
}
# Replication status monitor.
# Created only when var.mysql_replication_status_enabled is the string "true".
# Triggers when the aggregated mysql.replication.slave_running value, grouped
# by server, falls below 1.
resource "datadog_monitor" "mysql_replication_status" {
  count   = var.mysql_replication_status_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Mysql replication status changed abnormally"
  message = coalesce(var.mysql_replication_status_message, var.message)
  type    = "metric alert"

  query = <<EOQ
${var.mysql_replication_status_time_aggregator}(${var.mysql_replication_status_timeframe}):avg:mysql.replication.slave_running${module.filter-tags.query_alert} by {server} < 1
EOQ

  # Must match the literal comparison operand used in the query above.
  thresholds = {
    critical = 1
  }

  # Honour the module-wide evaluation delay, as the replication lag monitor
  # does; previously this monitor ignored var.evaluation_delay.
  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  require_full_window = true

  # Notification behaviour: no-data alerts, renotification, audit
  # notifications and auto-resolve timeout are all switched off.
  notify_no_data    = false
  renotify_interval = 0
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false

  tags = concat(["env:${var.environment}", "type:database", "provider:mysql", "resource:mysql", "team:claranet", "created-by:terraform"], var.mysql_replication_status_extra_tags)

  # Changes to the "silenced" attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [silenced]
  }
}

View File

@ -38,3 +38,12 @@ output "mysql_threads_anomaly_id" {
value = datadog_monitor.mysql_threads_anomaly.*.id value = datadog_monitor.mysql_threads_anomaly.*.id
} }
# IDs of the replication lag monitor. The resource uses count, so this is a
# list that is empty when the monitor is disabled.
output "mysql_replication_lag_id" {
  value       = datadog_monitor.mysql_replication_lag.*.id
  description = "id for monitor mysql_replication_lag"
}
# IDs of the replication status monitor. The resource uses count, so this is a
# list that is empty when the monitor is disabled.
output "mysql_replication_status_id" {
  value       = datadog_monitor.mysql_replication_status.*.id
  description = "id for monitor mysql_replication_status"
}