MON-224 CloudSQL MySQL Replication Lag monitor

This commit is contained in:
Rafael Romero Carmona 2018-06-15 11:25:10 +02:00 committed by Quentin Manfroi
parent 79f8a5d486
commit 1673d8bbce
3 changed files with 81 additions and 0 deletions

View File

@ -29,6 +29,7 @@ Useful links
* [GCP Metrics for CloudSQL](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-cloudsql) * [GCP Metrics for CloudSQL](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-cloudsql)
* [Datadog Useful monitors for GCP CloudSQL](https://www.datadoghq.com/blog/monitor-google-cloud-sql/) * [Datadog Useful monitors for GCP CloudSQL](https://www.datadoghq.com/blog/monitor-google-cloud-sql/)
* [Max connections depends on the type of the instance](https://cloud.google.com/sql/docs/quotas#fixed-limits) * [Max connections depends on the type of the instance](https://cloud.google.com/sql/docs/quotas#fixed-limits)
* [Monitoring Replication Lag](https://cloud.google.com/sql/docs/mysql/high-availability#replication-lag-monitor)
Inputs Inputs
------ ------
@ -47,3 +48,8 @@ Inputs
| network_connections_threshold_warning | Number of network connections (warning threshold) | string | `3200` | no | | network_connections_threshold_warning | Number of network connections (warning threshold) | string | `3200` | no |
| network_connections_timeframe | Timeframe for the Network Connections monitor | string | `last_5m` | no | | network_connections_timeframe | Timeframe for the Network Connections monitor | string | `last_5m` | no |
| project_id | ID of the GCP Project | string | - | yes | | project_id | ID of the GCP Project | string | - | yes |
| replication_lag_message | Custom message for the Replication Lag monitor | string | `` | no |
| replication_lag_silenced | Groups to mute for GCP Cloud SQL Replication Lag monitor | map | `<map>` | no |
| replication_lag_threshold_critical | Seconds behind the master (critical threshold) | string | `2700` | no |
| replication_lag_threshold_warning | Seconds behind the master (warning threshold) | string | `2000` | no |
| replication_lag_timeframe | Timeframe for the Replication Lag monitor | string | `last_10m` | no |

View File

@ -71,3 +71,36 @@ variable "network_connections_silenced" {
type = "map" type = "map"
default = {} default = {}
} }
#
# Replication Lag
#
# Notification text specific to the Replication Lag monitor.
# Defaults to the empty string.
variable "replication_lag_message" {
  type        = "string"
  default     = ""
  description = "Custom message for the Replication Lag monitor"
}
# Evaluation window for the Replication Lag monitor, written in Datadog's
# "last_<N>m" notation.
variable "replication_lag_timeframe" {
  type        = "string"
  default     = "last_10m"
  description = "Timeframe for the Replication Lag monitor"
}
# Warning threshold for the Replication Lag monitor, in seconds behind the master.
# NOTE(review): declared as a string while the default is written as a bare
# number; the literal is kept exactly as it was.
variable "replication_lag_threshold_warning" {
  type        = "string"
  default     = 2000
  description = "Seconds behind the master (warning threshold)"
}
# Critical threshold for the Replication Lag monitor, in seconds behind the master.
# NOTE(review): declared as a string while the default is written as a bare
# number; the literal is kept exactly as it was.
variable "replication_lag_threshold_critical" {
  type        = "string"
  default     = 2700
  description = "Seconds behind the master (critical threshold)"
}
# Map of groups to mute for the Replication Lag monitor.
# Empty by default, i.e. nothing is muted.
variable "replication_lag_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for GCP Cloud SQL Replication Lag monitor"
}

View File

@ -52,3 +52,45 @@ EOF
"engine:mysql", "engine:mysql",
] ]
} }
#
# Replication Lag
#
# Datadog metric alert on Cloud SQL MySQL replication lag.
#
# Evaluates, per database_id, the minimum over the configured timeframe of the
# average `seconds_behind_master` metric, and compares it against the critical
# threshold. The warning level comes from the `thresholds` block.
resource "datadog_monitor" "datadog_monitor_cloud_sql_mysql_replication_lag" {
  name = "[${var.environment}] Cloud SQL MySQL Replication Lag too high"

  # Monitor-specific message when set, otherwise the module-wide message.
  message = "${coalesce(var.replication_lag_message, var.message)}"

  type = "metric alert"

  # The evaluation window comes from var.replication_lag_timeframe (default
  # "last_10m") instead of being hard-coded, so the documented input is honoured.
  # The value compared in the query is the same variable as thresholds.critical,
  # so the two cannot drift apart.
  query = <<EOF
min(${var.replication_lag_timeframe}):
avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{${data.template_file.filter.rendered}}
by {database_id}
> ${var.replication_lag_threshold_critical}
EOF

  thresholds {
    critical = "${var.replication_lag_threshold_critical}"
    warning  = "${var.replication_lag_threshold_warning}"
  }

  # include_tags is declared once (it was previously duplicated).
  include_tags        = true
  notify_no_data      = true
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  locked              = false

  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  # Use this monitor's own silencing map rather than another monitor's.
  silenced = "${var.replication_lag_silenced}"

  tags = [
    "team:gcp",
    "provider:gcp",
    "env:${var.environment}",
    "resource:cloud-sql",
    "engine:mysql",
  ]
}