MON-224 CloudSQL MySQL Monitors for Queries and Questions Anomalies

This commit is contained in:
Rafael Romero Carmona 2018-06-15 16:53:23 +02:00 committed by Quentin Manfroi
parent cbc06ae0dc
commit 9233464195
3 changed files with 265 additions and 0 deletions

View File

@ -30,6 +30,7 @@ Useful links
* [Datadog Useful monitors for GCP CloudSQL](https://www.datadoghq.com/blog/monitor-google-cloud-sql/)
* [Max connections depends on the type of the instance](https://cloud.google.com/sql/docs/quotas#fixed-limits)
* [Monitoring Replication Lag](https://cloud.google.com/sql/docs/mysql/high-availability#replication-lag-monitor)
* [Monitoring MySQL Performance Metrics](https://www.datadoghq.com/blog/monitoring-mysql-performance-metrics)
Inputs
------
@ -48,6 +49,28 @@ Inputs
| network_connections_threshold_warning | Number of network connections (warning threshold) | string | `3200` | no |
| network_connections_timeframe | Timeframe for the Network Connections monitor | string | `last_5m` | no |
| project_id | ID of the GCP Project | string | - | yes |
| queries_changing_anomaly_detection_algorithm | Anomaly Detection Algorithm used | string | `robust` | no |
| queries_changing_database_ids | List of database IDs to monitor for abnormal changes in queries count (one monitor is created per ID) | list | `<list>` | no |
| queries_changing_deviations | Deviations to detect the anomaly | string | `4` | no |
| queries_changing_direction | Direction of the anomaly. It can be both, below or above. | string | `both` | no |
| queries_changing_message | Custom message for the Queries Changing monitor | string | `` | no |
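| queries_changing_region | Region segment used to build the monitored `database_id` (`project_id:region:database_id`) for the Queries Changing monitor | string | `` | no |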
| queries_changing_seasonality | Seasonality of the algorithm | string | `weekly` | no |
| queries_changing_silenced | Groups to mute for GCP Cloud SQL Queries Changing monitor | map | `<map>` | no |
| queries_changing_threshold_critical | Queries Changing critical threshold | string | `1` | no |
| queries_changing_threshold_warning | Queries Changing warning threshold | string | `0.5` | no |
| queries_changing_timeframe | Timeframe for the Queries Changing monitor | string | `last_10m` | no |
| questions_changing_anomaly_detection_algorithm | Anomaly Detection Algorithm used | string | `robust` | no |
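| questions_changing_database_ids | List of database IDs to monitor for abnormal changes in questions count (one monitor is created per ID) | list | `<list>` | no |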
| questions_changing_deviations | Deviations to detect the anomaly | string | `4` | no |
| questions_changing_direction | Direction of the anomaly. It can be both, below or above. | string | `both` | no |
| questions_changing_message | Custom message for the Questions Changing monitor | string | `` | no |
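| questions_changing_region | Region segment used to build the monitored `database_id` (`project_id:region:database_id`) for the Questions Changing monitor | string | `` | no |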
| questions_changing_seasonality | Seasonality of the algorithm | string | `weekly` | no |
| questions_changing_silenced | Groups to mute for GCP Cloud SQL Questions Changing monitor | map | `<map>` | no |
| questions_changing_threshold_critical | Questions Changing critical threshold | string | `1` | no |
| questions_changing_threshold_warning | Questions Changing warning threshold | string | `0.5` | no |
| questions_changing_timeframe | Timeframe for the Questions Changing monitor | string | `last_10m` | no |
| replication_lag_message | Custom message for the Replication Lag monitor | string | `` | no |
| replication_lag_silenced | Groups to mute for GCP Cloud SQL Replication Lag monitor | map | `<map>` | no |
| replication_lag_threshold_critical | Seconds behind the master (critical threshold) | string | `2700` | no |

View File

@ -104,3 +104,141 @@ variable "replication_lag_silenced" {
type = "map"
default = {}
}
#
# Queries Changing Abnormally
#
# Database IDs covered by the Queries Changing monitor. The monitor resource
# uses the length of this list as its `count`, so an empty list (the default)
# creates no monitor at all.
variable "queries_changing_database_ids" {
  description = "List of database IDs to monitor for abnormal changes in queries count (one monitor is created per ID)"
  type        = "list"
  default     = []
}
# Middle segment of the `project_id:region:database_id` string that the
# Queries Changing monitor uses in its name, metric filter and tag.
variable "queries_changing_region" {
  description = "Region segment used to build the monitored database_id (project_id:region:database_id) for the Queries Changing monitor"
  type        = "string"
  default     = ""
}
# Notification text for the Queries Changing monitor. The monitor passes it
# through coalesce() with var.message, so the empty default falls back to the
# module-wide message.
variable "queries_changing_message" {
  type        = "string"
  default     = ""
  description = "Custom message for the Queries Changing monitor"
}
# Evaluation window interpolated into the `avg(<timeframe>):` prefix of the
# Queries Changing monitor query.
variable "queries_changing_timeframe" {
  description = "Timeframe for the Queries Changing monitor"
  type        = "string"
  default     = "last_10m"
}
# Algorithm name interpolated (quoted) as the second argument of the
# anomalies() call in the Queries Changing monitor query.
variable "queries_changing_anomaly_detection_algorithm" {
  type        = "string"
  default     = "robust"
  description = "Anomaly Detection Algorithm used"
}
# Deviations value interpolated as the third argument of the anomalies() call
# in the Queries Changing monitor query.
variable "queries_changing_deviations" {
  type        = "string"
  default     = 4
  description = "Deviations to detect the anomaly"
}
# Value of the `direction=` argument of the anomalies() call in the Queries
# Changing monitor query.
variable "queries_changing_direction" {
  type        = "string"
  default     = "both"
  description = "Direction of the anomaly. It can be both, below or above."
}
# Value of the `seasonality=` argument of the anomalies() call in the Queries
# Changing monitor query.
variable "queries_changing_seasonality" {
  type        = "string"
  default     = "weekly"
  description = "Seasonality of the algorithm"
}
# Fed into `thresholds.warning` of the Queries Changing monitor.
variable "queries_changing_threshold_warning" {
  type        = "string"
  default     = 0.5
  description = "Queries Changing warning threshold"
}
# Fed into `thresholds.critical` of the Queries Changing monitor and also used
# as the right-hand side of the comparison that ends its query.
variable "queries_changing_threshold_critical" {
  type        = "string"
  default     = 1
  description = "Queries Changing critical threshold"
}
# Map handed unchanged to the `silenced` attribute of the Queries Changing
# monitor; the empty default mutes nothing.
variable "queries_changing_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for GCP Cloud SQL Queries Changing monitor"
}
#
# Questions Changing
#
# Notification text for the Questions Changing monitor. The monitor passes it
# through coalesce() with var.message, so the empty default falls back to the
# module-wide message.
variable "questions_changing_message" {
  type        = "string"
  default     = ""
  description = "Custom message for the Questions Changing monitor"
}
# Evaluation window interpolated into the `avg(<timeframe>):` prefix of the
# Questions Changing monitor query.
variable "questions_changing_timeframe" {
  description = "Timeframe for the Questions Changing monitor"
  type        = "string"
  default     = "last_10m"
}
# Database IDs covered by the Questions Changing monitor. The monitor resource
# uses the length of this list as its `count`, so an empty list (the default)
# creates no monitor at all.
variable "questions_changing_database_ids" {
  description = "List of database IDs to monitor for abnormal changes in questions count (one monitor is created per ID)"
  type        = "list"
  default     = []
}
# Middle segment of the `project_id:region:database_id` string that the
# Questions Changing monitor uses in its name, metric filter and tag.
variable "questions_changing_region" {
  description = "Region segment used to build the monitored database_id (project_id:region:database_id) for the Questions Changing monitor"
  type        = "string"
  default     = ""
}
# Algorithm name interpolated (quoted) as the second argument of the
# anomalies() call in the Questions Changing monitor query.
variable "questions_changing_anomaly_detection_algorithm" {
  type        = "string"
  default     = "robust"
  description = "Anomaly Detection Algorithm used"
}
# Deviations value interpolated as the third argument of the anomalies() call
# in the Questions Changing monitor query.
variable "questions_changing_deviations" {
  type        = "string"
  default     = 4
  description = "Deviations to detect the anomaly"
}
# Value of the `direction=` argument of the anomalies() call in the Questions
# Changing monitor query.
variable "questions_changing_direction" {
  type        = "string"
  default     = "both"
  description = "Direction of the anomaly. It can be both, below or above."
}
# Value of the `seasonality=` argument of the anomalies() call in the
# Questions Changing monitor query.
variable "questions_changing_seasonality" {
  type        = "string"
  default     = "weekly"
  description = "Seasonality of the algorithm"
}
# Fed into `thresholds.warning` of the Questions Changing monitor.
variable "questions_changing_threshold_warning" {
  type        = "string"
  default     = 0.5
  description = "Questions Changing warning threshold"
}
# Fed into `thresholds.critical` of the Questions Changing monitor and also
# used as the right-hand side of the comparison that ends its query.
variable "questions_changing_threshold_critical" {
  type        = "string"
  default     = 1
  description = "Questions Changing critical threshold"
}
# Map handed unchanged to the `silenced` attribute of the Questions Changing
# monitor; the empty default mutes nothing.
variable "questions_changing_silenced" {
  description = "Groups to mute for GCP Cloud SQL Questions Changing monitor"
  type        = "map"
  default     = {}
}

View File

@ -94,3 +94,107 @@ EOF
"engine:mysql",
]
}
#
# Queries Anomaly
#
# Anomaly monitor on the Cloud SQL MySQL `queries` metric.
#
# One monitor is created per entry of var.queries_changing_database_ids; each
# one filters the metric on `project_id:region:database_id` and tags the
# monitor with that same identifier.
resource "datadog_monitor" "queries_changing_anomaly" {
  count   = "${length(var.queries_changing_database_ids)}"
  name    = "[${var.environment}] Cloud SQL MySQL Queries Count changed abnormally on ${var.project_id}:${var.queries_changing_region}:${var.queries_changing_database_ids[count.index]} {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  # Falls back to the module-wide message when no specific one is provided.
  message = "${coalesce(var.queries_changing_message, var.message)}"
  type    = "metric alert"

  # The heredoc body and its terminator are kept unindented: with `<<EOF` the
  # text is sent to Datadog verbatim.
  # NOTE(review): Datadog's documented anomaly queries compare with ">=";
  # confirm that ">" with the default critical threshold of 1 can trigger.
  query = <<EOF
avg(${var.queries_changing_timeframe}):
anomalies(
default(
avg:gcp.cloudsql.database.mysql.queries{project_id:${var.project_id},database_id:${var.project_id}:${var.queries_changing_region}:${var.queries_changing_database_ids[count.index]}},
0),
'${var.queries_changing_anomaly_detection_algorithm}',
${var.queries_changing_deviations},
direction='${var.queries_changing_direction}',
seasonality='${var.queries_changing_seasonality}'
)
> ${var.queries_changing_threshold_critical}
EOF

  thresholds {
    warning  = "${var.queries_changing_threshold_warning}"
    critical = "${var.queries_changing_threshold_critical}"
  }

  # `include_tags` was previously declared twice in this resource; it is now
  # set once, with the same value.
  include_tags        = true
  notify_no_data      = true
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  locked              = false

  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  silenced = "${var.queries_changing_silenced}"

  tags = [
    "team:gcp",
    "provider:gcp",
    "env:${var.environment}",
    "resource:cloud-sql",
    "engine:mysql",
    # The stray trailing "}" that used to end this tag value has been removed.
    "database_id:${var.project_id}:${var.queries_changing_region}:${var.queries_changing_database_ids[count.index]}",
  ]
}
#
# Questions Anomaly
#
# Anomaly monitor on the Cloud SQL MySQL `questions` metric.
#
# One monitor is created per entry of var.questions_changing_database_ids;
# each one filters the metric on `project_id:region:database_id` and tags the
# monitor with that same identifier.
resource "datadog_monitor" "questions_changing_anomaly" {
  count   = "${length(var.questions_changing_database_ids)}"
  name    = "[${var.environment}] Cloud SQL MySQL Questions Count changed abnormally on ${var.project_id}:${var.questions_changing_region}:${var.questions_changing_database_ids[count.index]} {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  # Falls back to the module-wide message when no specific one is provided.
  message = "${coalesce(var.questions_changing_message, var.message)}"
  type    = "metric alert"

  # The heredoc body and its terminator are kept unindented: with `<<EOF` the
  # text is sent to Datadog verbatim.
  # NOTE(review): Datadog's documented anomaly queries compare with ">=";
  # confirm that ">" with the default critical threshold of 1 can trigger.
  query = <<EOF
avg(${var.questions_changing_timeframe}):
anomalies(
default(
avg:gcp.cloudsql.database.mysql.questions{project_id:${var.project_id},database_id:${var.project_id}:${var.questions_changing_region}:${var.questions_changing_database_ids[count.index]}},
0),
'${var.questions_changing_anomaly_detection_algorithm}',
${var.questions_changing_deviations},
direction='${var.questions_changing_direction}',
seasonality='${var.questions_changing_seasonality}'
)
> ${var.questions_changing_threshold_critical}
EOF

  thresholds {
    warning  = "${var.questions_changing_threshold_warning}"
    critical = "${var.questions_changing_threshold_critical}"
  }

  # `include_tags` was previously declared twice in this resource; it is now
  # set once, with the same value.
  include_tags        = true
  notify_no_data      = true
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  locked              = false

  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  silenced = "${var.questions_changing_silenced}"

  tags = [
    "team:gcp",
    "provider:gcp",
    "env:${var.environment}",
    "resource:cloud-sql",
    "engine:mysql",
    "database_id:${var.project_id}:${var.questions_changing_region}:${var.questions_changing_database_ids[count.index]}",
  ]
}