MON-224 Monitors imported from the Actualys ones using the exported json

This commit is contained in:
Rafael Romero Carmona 2018-07-27 18:15:43 +02:00 committed by Quentin Manfroi
parent 2e5ac912c1
commit 54a996e8b7
4 changed files with 243 additions and 124 deletions

View File

@ -45,19 +45,19 @@ variable "cpu_utilization_message" {
variable "cpu_utilization_timeframe" { variable "cpu_utilization_timeframe" {
description = "Timeframe for the CPU Utilization monitor" description = "Timeframe for the CPU Utilization monitor"
type = "string" type = "string"
default = "last_30m" default = "last_15m"
} }
variable "cpu_utilization_threshold_warning" { variable "cpu_utilization_threshold_warning" {
description = "CPU Utilization in fraction (warning threshold)" description = "CPU Utilization in percentage (warning threshold)"
type = "string" type = "string"
default = 0.8 default = 80
} }
variable "cpu_utilization_threshold_critical" { variable "cpu_utilization_threshold_critical" {
description = "CPU Utilization in fraction (critical threshold)" description = "CPU Utilization in percentage (critical threshold)"
type = "string" type = "string"
default = 0.9 default = 90
} }
variable "cpu_utilization_silenced" { variable "cpu_utilization_silenced" {
@ -73,7 +73,7 @@ variable "cpu_utilization_extra_tags" {
} }
# #
# DISK # DISK Utilization
# #
variable "disk_utilization_message" { variable "disk_utilization_message" {
description = "Custom message for the Disk Utilization monitor" description = "Custom message for the Disk Utilization monitor"
@ -88,15 +88,15 @@ variable "disk_utilization_timeframe" {
} }
variable "disk_utilization_threshold_warning" { variable "disk_utilization_threshold_warning" {
description = "Disk Utilization in fraction (warning threshold)" description = "Disk Utilization in percentage (warning threshold)"
type = "string" type = "string"
default = 0.8 default = 80
} }
variable "disk_utilization_threshold_critical" { variable "disk_utilization_threshold_critical" {
description = "Disk Utilization in fraction (critical threshold)" description = "Disk Utilization in percentage (critical threshold)"
type = "string" type = "string"
default = 0.9 default = 90
} }
variable "disk_utilization_silenced" { variable "disk_utilization_silenced" {
@ -111,6 +111,45 @@ variable "disk_utilization_extra_tags" {
default = [] default = []
} }
#
# DISK Utilization Forecast
#
# Optional override of the notification message for the Disk Utilization
# Forecast monitor. The monitor uses coalesce() with var.message, so leaving
# this empty falls back to the module-wide message.
variable "disk_utilization_forecast_message" {
  description = "Custom message for the Disk Utilization Forecast monitor"
  type        = "string"
  default     = ""
}
# Evaluation window of the Disk Utilization Forecast monitor; it is
# interpolated into the query as `max(<timeframe>):`.
variable "disk_utilization_forecast_timeframe" {
  description = "Timeframe for the Disk Utilization Forecast monitor"
  type        = "string"
  default     = "next_1w"
}
# Critical threshold of the Disk Utilization Forecast monitor. The query
# multiplies the metric by 100 and compares it with `>=` against this value.
variable "disk_utilization_forecast_threshold_critical" {
  description = "Disk Utilization Forecast in percentage (critical threshold)"
  type        = "string"
  default     = 80
}
# Value passed as `critical_recovery` in the thresholds of the Disk
# Utilization Forecast monitor.
variable "disk_utilization_forecast_threshold_critical_recovery" {
  description = "Disk Utilization Forecast in percentage (recovery threshold)"
  type        = "string"
  default     = 72
}
# Map passed to the `silenced` argument of the Disk Utilization Forecast
# monitor; empty by default.
variable "disk_utilization_forecast_silenced" {
  description = "Groups to mute for GCP Cloud SQL Disk Utilization Forecast monitor"
  type        = "map"
  default     = {}
}
# Additional tags appended to the fixed tag list of the Disk Utilization
# Forecast monitor; empty by default.
variable "disk_utilization_forecast_extra_tags" {
  description = "Extra tags for GCP Cloud SQL Disk Utilization Forecast monitor"
  type        = "list"
  default     = []
}
# #
# Memory Utilization # Memory Utilization
# #
@ -125,13 +164,13 @@ variable "memory_utilization_timeframe" {
} }
variable "memory_utilization_threshold_warning" { variable "memory_utilization_threshold_warning" {
description = "Memory Utilization in fraction (warning threshold)" description = "Memory Utilization in percentage (warning threshold)"
default = 0.8 default = 80
} }
variable "memory_utilization_threshold_critical" { variable "memory_utilization_threshold_critical" {
description = "Memory Utilization in fraction (critical threshold)" description = "Memory Utilization in percentage (critical threshold)"
default = 0.9 default = 90
} }
variable "memory_utilization_silenced" { variable "memory_utilization_silenced" {
@ -169,14 +208,14 @@ variable "memory_utilization_forecast_history" {
default = "12h" default = "12h"
} }
variable "memory_utilization_forecast_threshold_warning" { variable "memory_utilization_forecast_threshold_critical" {
description = "Memory Utilization Forecast in fraction (warning threshold)" description = "Memory Utilization Forecast in percentage (critical threshold)"
default = 0.8 default = 90
} }
variable "memory_utilization_forecast_threshold_critical" { variable "memory_utilization_forecast_threshold_critical_recovery" {
description = "Memory Utilization Forecast in fraction (critical threshold)" description = "Memory Utilization Forecast in percentage (recovery threshold)"
default = 0.9 default = 81
} }
variable "memory_utilization_forecast_silenced" { variable "memory_utilization_forecast_silenced" {

View File

@ -15,15 +15,15 @@ data "template_file" "filter" {
# CPU Utilization # CPU Utilization
# #
resource "datadog_monitor" "cpu_utilization" { resource "datadog_monitor" "cpu_utilization" {
name = "[${var.environment}] Cloud SQL CPU Utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] Cloud SQL CPU utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.cpu_utilization_message, var.message)}" message = "${coalesce(var.cpu_utilization_message, var.message)}"
type = "metric alert" type = "metric alert"
query = <<EOF query = <<EOF
avg(${var.cpu_utilization_timeframe}): avg(${var.cpu_utilization_timeframe}):
avg:gcp.cloudsql.database.cpu.utilization{${data.template_file.filter.rendered}} avg:gcp.cloudsql.database.cpu.utilization{${data.template_file.filter.rendered}}
by {database_id} by {database_id} * 100
> ${var.cpu_utilization_threshold_critical} > ${var.cpu_utilization_threshold_critical}
EOF EOF
@ -32,16 +32,19 @@ EOF
critical = "${var.cpu_utilization_threshold_critical}" critical = "${var.cpu_utilization_threshold_critical}"
} }
notify_no_data = true
require_full_window = false
renotify_interval = 0
notify_audit = false notify_audit = false
locked = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false no_data_timeframe = 30
evaluation_delay = "${var.delay}" require_full_window = false
new_host_delay = "${var.delay}" notify_no_data = true
silenced = "${var.cpu_utilization_silenced}" renotify_interval = 0
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
silenced = "${var.cpu_utilization_silenced}"
tags = [ tags = [
"team:gcp", "team:gcp",
@ -56,7 +59,7 @@ EOF
# Disk Utilization # Disk Utilization
# #
resource "datadog_monitor" "disk_utilization" { resource "datadog_monitor" "disk_utilization" {
name = "[${var.environment}] Cloud SQL Disk Utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] Cloud SQL Disk utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.disk_utilization_message, var.message)}" message = "${coalesce(var.disk_utilization_message, var.message)}"
type = "metric alert" type = "metric alert"
@ -64,7 +67,7 @@ resource "datadog_monitor" "disk_utilization" {
query = <<EOF query = <<EOF
avg(${var.disk_utilization_timeframe}): avg(${var.disk_utilization_timeframe}):
avg:gcp.cloudsql.database.disk.utilization{${data.template_file.filter.rendered}} avg:gcp.cloudsql.database.disk.utilization{${data.template_file.filter.rendered}}
by {database_id} by {database_id} *100
> ${var.disk_utilization_threshold_critical} > ${var.disk_utilization_threshold_critical}
EOF EOF
@ -73,16 +76,19 @@ EOF
critical = "${var.disk_utilization_threshold_critical}" critical = "${var.disk_utilization_threshold_critical}"
} }
notify_no_data = true
require_full_window = false
renotify_interval = 0
notify_audit = false notify_audit = false
locked = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false no_data_timeframe = 20
evaluation_delay = "${var.delay}" require_full_window = false
new_host_delay = "${var.delay}" notify_no_data = true
silenced = "${var.disk_utilization_silenced}" renotify_interval = 0
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
silenced = "${var.disk_utilization_silenced}"
tags = [ tags = [
"team:gcp", "team:gcp",
@ -93,6 +99,55 @@ EOF
] ]
} }
#
# Disk Utilization Forecast
#
# Forecast monitor on Cloud SQL disk utilization, grouped by database_id.
# The metric is multiplied by 100 in the query so that it can be compared
# against the percentage-based threshold variables.
resource "datadog_monitor" "disk_utilization_forecast" {
  name    = "[${var.environment}] Cloud SQL Disk utilization could reach {{#is_alert}}{{threshold}}%{{/is_alert}} in a near future"
  message = "${coalesce(var.disk_utilization_forecast_message, var.message)}"

  # A forecast() query is a query alert, not a plain metric alert; this also
  # keeps the resource consistent with memory_utilization_forecast.
  type = "query alert"

  # NOTE(review): interval and history are hard-coded here, whereas the memory
  # forecast monitor takes them from variables; consider parameterizing.
  query = <<EOF
max(${var.disk_utilization_forecast_timeframe}):
forecast(
avg:gcp.cloudsql.database.disk.utilization{${data.template_file.filter.rendered}} by {database_id} * 100,
'linear',
1,
interval='60m',
history='3d',
model='default'
)
>= ${var.disk_utilization_forecast_threshold_critical}
EOF

  thresholds {
    critical          = "${var.disk_utilization_forecast_threshold_critical}"
    critical_recovery = "${var.disk_utilization_forecast_threshold_critical_recovery}"
  }

  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false

  # No-data notifications are disabled for this monitor.
  notify_no_data    = false
  renotify_interval = 0

  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  silenced = "${var.disk_utilization_forecast_silenced}"

  tags = [
    "team:gcp",
    "provider:gcp",
    "env:${var.environment}",
    "resource:cloud-sql",
    "${var.disk_utilization_forecast_extra_tags}",
  ]
}
# #
# Memory Utilization # Memory Utilization
# #
@ -105,8 +160,8 @@ resource "datadog_monitor" "memory_utilization" {
query = <<EOF query = <<EOF
avg(${var.memory_utilization_timeframe}): avg(${var.memory_utilization_timeframe}):
avg:gcp.cloudsql.database.memory.utilization{${data.template_file.filter.rendered}} avg:gcp.cloudsql.database.memory.utilization{${data.template_file.filter.rendered}}
by {database_id} by {database_id} * 100
> ${var.memory_utilization_threshold_critical} > ${var.memory_utilization_threshold_critical}
EOF EOF
thresholds { thresholds {
@ -114,16 +169,19 @@ EOF
critical = "${var.memory_utilization_threshold_critical}" critical = "${var.memory_utilization_threshold_critical}"
} }
notify_no_data = true
require_full_window = false
renotify_interval = 0
notify_audit = false notify_audit = false
locked = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false no_data_timeframe = 20
evaluation_delay = "${var.delay}" require_full_window = false
new_host_delay = "${var.delay}" notify_no_data = true
silenced = "${var.memory_utilization_silenced}" renotify_interval = 0
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
silenced = "${var.memory_utilization_silenced}"
tags = [ tags = [
"team:gcp", "team:gcp",
@ -138,7 +196,7 @@ EOF
# Memory Utilization Forecast # Memory Utilization Forecast
# #
resource "datadog_monitor" "memory_utilization_forecast" { resource "datadog_monitor" "memory_utilization_forecast" {
name = "[${var.environment}] Cloud SQL Memory Utilization Forecast {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] Cloud SQL Memory Utilization could reach {{#is_alert}}{{threshold}}%{{/is_alert}} in a near future"
message = "${coalesce(var.memory_utilization_forecast_message, var.message)}" message = "${coalesce(var.memory_utilization_forecast_message, var.message)}"
type = "query alert" type = "query alert"
@ -146,31 +204,33 @@ resource "datadog_monitor" "memory_utilization_forecast" {
query = <<EOF query = <<EOF
max(${var.memory_utilization_forecast_timeframe}): max(${var.memory_utilization_forecast_timeframe}):
forecast( forecast(
avg:gcp.cloudsql.database.memory.utilization{${data.template_file.filter.rendered}} by {database_id}, avg:gcp.cloudsql.database.memory.utilization{${data.template_file.filter.rendered}} by {database_id} * 100,
'linear', 'linear',
1, 1,
interval='${var.memory_utilization_forecast_interval}', interval='${var.memory_utilization_forecast_interval}',
history='${var.memory_utilization_forecast_history}', history='${var.memory_utilization_forecast_history}',
model='default' model='default'
) )
> ${var.memory_utilization_forecast_threshold_critical} >= ${var.memory_utilization_forecast_threshold_critical}
EOF EOF
thresholds { thresholds {
warning = "${var.memory_utilization_forecast_threshold_warning}" critical = "${var.memory_utilization_forecast_threshold_critical}"
critical = "${var.memory_utilization_forecast_threshold_critical}" critical_recovery = "${var.memory_utilization_forecast_threshold_critical_recovery}"
} }
notify_no_data = true
require_full_window = false
renotify_interval = 0
notify_audit = false notify_audit = false
locked = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false require_full_window = false
evaluation_delay = "${var.delay}" notify_no_data = false
new_host_delay = "${var.delay}" renotify_interval = 0
silenced = "${var.memory_utilization_forecast_silenced}"
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
silenced = "${var.memory_utilization_forecast_silenced}"
tags = [ tags = [
"team:gcp", "team:gcp",
@ -185,32 +245,35 @@ EOF
# Failover Unavailable # Failover Unavailable
# #
resource "datadog_monitor" "failover_unavailable" { resource "datadog_monitor" "failover_unavailable" {
name = "[${var.environment}] Cloud SQL Failover Unavailable {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] Cloud SQL Failover Unavailable"
message = "${coalesce(var.failover_unavailable_message, var.message)}" message = "${coalesce(var.failover_unavailable_message, var.message)}"
type = "metric alert" type = "metric alert"
query = <<EOF query = <<EOF
max(${var.failover_unavailable_timeframe}): max(${var.failover_unavailable_timeframe}):
avg:gcp.cloudsql.database.available_for_failover{${data.template_file.filter.rendered}} avg:gcp.cloudsql.database.available_for_failover{${data.template_file.filter.rendered}}
by {database_id} by {database_id}
<= ${var.failover_unavailable_threshold_critical} <= ${var.failover_unavailable_threshold_critical}
EOF EOF
thresholds { thresholds {
critical = "${var.failover_unavailable_threshold_critical}" critical = "${var.failover_unavailable_threshold_critical}"
} }
notify_no_data = true
require_full_window = false
renotify_interval = 0
notify_audit = false notify_audit = false
locked = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false no_data_timeframe = 20
evaluation_delay = "${var.delay}" require_full_window = false
new_host_delay = "${var.delay}" notify_no_data = true
silenced = "${var.failover_unavailable_silenced}" renotify_interval = 0
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
silenced = "${var.failover_unavailable_silenced}"
tags = [ tags = [
"team:gcp", "team:gcp",

View File

@ -96,13 +96,13 @@ variable "replication_lag_timeframe" {
variable "replication_lag_threshold_warning" { variable "replication_lag_threshold_warning" {
description = "Seconds behind the master (warning threshold)" description = "Seconds behind the master (warning threshold)"
type = "string" type = "string"
default = 300 default = 90
} }
variable "replication_lag_threshold_critical" { variable "replication_lag_threshold_critical" {
description = "Seconds behind the master (critical threshold)" description = "Seconds behind the master (critical threshold)"
type = "string" type = "string"
default = 900 default = 180
} }
variable "replication_lag_silenced" { variable "replication_lag_silenced" {
@ -135,13 +135,13 @@ variable "queries_changing_message" {
variable "queries_changing_timeframe" { variable "queries_changing_timeframe" {
description = "Timeframe for the Queries Changing monitor" description = "Timeframe for the Queries Changing monitor"
type = "string" type = "string"
default = "last_10m" default = "last_1h"
} }
variable "queries_changing_anomaly_detection_algorithm" { variable "queries_changing_anomaly_detection_algorithm" {
description = "Anomaly Detection Algorithm used" description = "Anomaly Detection Algorithm used"
type = "string" type = "string"
default = "robust" default = "agile"
} }
variable "queries_changing_deviations" { variable "queries_changing_deviations" {
@ -174,6 +174,12 @@ variable "queries_changing_threshold_critical" {
default = 1 default = 1
} }
# Value used as `critical_recovery` in the thresholds of the
# queries_changing_anomaly monitor.
variable "queries_changing_threshold_critical_recovery" {
  type        = "string"
  default     = 0.99
  description = "Queries Changing critical recovery threshold"
}
variable "queries_changing_silenced" { variable "queries_changing_silenced" {
description = "Groups to mute for GCP Cloud SQL Queries Changing monitor" description = "Groups to mute for GCP Cloud SQL Queries Changing monitor"
type = "map" type = "map"
@ -198,7 +204,7 @@ variable "questions_changing_message" {
variable "questions_changing_timeframe" { variable "questions_changing_timeframe" {
description = "Timeframe for the Questions Changing monitor" description = "Timeframe for the Questions Changing monitor"
type = "string" type = "string"
default = "last_10m" default = "last_1h"
} }
variable "questions_changing_database_ids" { variable "questions_changing_database_ids" {
@ -210,7 +216,7 @@ variable "questions_changing_database_ids" {
variable "questions_changing_anomaly_detection_algorithm" { variable "questions_changing_anomaly_detection_algorithm" {
description = "Anomaly Detection Algorithm used" description = "Anomaly Detection Algorithm used"
type = "string" type = "string"
default = "robust" default = "agile"
} }
variable "questions_changing_deviations" { variable "questions_changing_deviations" {
@ -243,6 +249,12 @@ variable "questions_changing_threshold_critical" {
default = 1 default = 1
} }
# Value used as `critical_recovery` in the thresholds of the
# questions_changing_anomaly monitor.
variable "questions_changing_threshold_critical_recovery" {
  type        = "string"
  default     = 0.99
  description = "Questions Changing critical recovery threshold"
}
variable "questions_changing_silenced" { variable "questions_changing_silenced" {
description = "Groups to mute for GCP Cloud SQL Questions Changing monitor" description = "Groups to mute for GCP Cloud SQL Questions Changing monitor"
type = "map" type = "map"

View File

@ -57,16 +57,16 @@ EOF
# Replication Lag # Replication Lag
# #
resource "datadog_monitor" "replication_lag" { resource "datadog_monitor" "replication_lag" {
name = "[${var.environment}] Cloud SQL MySQL Replication Lag too high" name = "[${var.environment}] Cloud SQL MySQL Replication Lag {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
message = "${coalesce(var.replication_lag_message, var.message)}" message = "${coalesce(var.replication_lag_message, var.message)}"
type = "metric alert" type = "metric alert"
query = <<EOF query = <<EOF
min(${var.replication_lag_timeframe}): min(${var.replication_lag_timeframe}):
avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{${data.template_file.filter.rendered}} avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{${data.template_file.filter.rendered}}
by {database_id} by {database_id}
> ${var.replication_lag_threshold_critical} > ${var.replication_lag_threshold_critical}
EOF EOF
thresholds { thresholds {
@ -74,16 +74,19 @@ EOF
warning = "${var.replication_lag_threshold_warning}" warning = "${var.replication_lag_threshold_warning}"
} }
notify_no_data = true
require_full_window = false
renotify_interval = 0
notify_audit = false notify_audit = false
locked = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false no_data_timeframe = 25
evaluation_delay = "${var.delay}" require_full_window = false
new_host_delay = "${var.delay}" notify_no_data = true
silenced = "${var.replication_lag_silenced}" renotify_interval = 0
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
silenced = "${var.replication_lag_silenced}"
tags = [ tags = [
"team:gcp", "team:gcp",
@ -99,9 +102,7 @@ EOF
# Queries Anomaly # Queries Anomaly
# #
resource "datadog_monitor" "queries_changing_anomaly" { resource "datadog_monitor" "queries_changing_anomaly" {
count = "${length(var.queries_changing_database_ids)}" name = "[${var.environment}] Cloud SQL MySQL Queries Count changed abnormally"
name = "[${var.environment}] [${var.queries_changing_database_ids[count.index]}] Cloud SQL MySQL Queries Count changed abnormally {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.queries_changing_message, var.message)}" message = "${coalesce(var.queries_changing_message, var.message)}"
type = "query alert" type = "query alert"
@ -109,32 +110,36 @@ resource "datadog_monitor" "queries_changing_anomaly" {
query = <<EOF query = <<EOF
avg(${var.queries_changing_timeframe}): avg(${var.queries_changing_timeframe}):
anomalies( anomalies(
default( avg:gcp.cloudsql.database.mysql.queries{${data.template_file.filter.rendered}} by {database_id}.as_count(),
avg:gcp.cloudsql.database.mysql.queries{project_id:${var.project_id},database_id:${var.project_id}:${var.queries_changing_database_ids[count.index]}},
0),
'${var.queries_changing_anomaly_detection_algorithm}', '${var.queries_changing_anomaly_detection_algorithm}',
${var.queries_changing_deviations}, ${var.queries_changing_deviations},
direction='${var.queries_changing_direction}', direction='${var.queries_changing_direction}',
alert_window='last_30m',
interval=20,
count_default_zero='false',
seasonality='${var.queries_changing_seasonality}' seasonality='${var.queries_changing_seasonality}'
) )
> ${var.queries_changing_threshold_critical} > ${var.queries_changing_threshold_critical}
EOF EOF
thresholds { thresholds {
warning = "${var.queries_changing_threshold_warning}" warning = "${var.queries_changing_threshold_warning}"
critical = "${var.queries_changing_threshold_critical}" critical = "${var.queries_changing_threshold_critical}"
critical_recovery = "${var.queries_changing_threshold_critical_recovery}"
} }
notify_no_data = false
require_full_window = false
renotify_interval = 0
notify_audit = false notify_audit = false
locked = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false require_full_window = false
evaluation_delay = "${var.delay}" notify_no_data = false
new_host_delay = "${var.delay}" renotify_interval = 0
silenced = "${var.queries_changing_silenced}"
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
silenced = "${var.queries_changing_silenced}"
tags = [ tags = [
"team:gcp", "team:gcp",
@ -142,7 +147,6 @@ EOF
"env:${var.environment}", "env:${var.environment}",
"resource:cloud-sql", "resource:cloud-sql",
"engine:mysql", "engine:mysql",
"database_id:${var.project_id}:${var.queries_changing_database_ids[count.index]}}",
"${var.queries_changing_extra_tags}", "${var.queries_changing_extra_tags}",
] ]
} }
@ -151,42 +155,44 @@ EOF
# Questions Anomaly # Questions Anomaly
# #
resource "datadog_monitor" "questions_changing_anomaly" { resource "datadog_monitor" "questions_changing_anomaly" {
count = "${length(var.questions_changing_database_ids)}" name = "[${var.environment}] Cloud SQL MySQL Questions Count changed abnormally"
name = "[${var.environment}] [${var.questions_changing_database_ids[count.index]}] Cloud SQL MySQL Questions Count changed abnormally {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.questions_changing_message, var.message)}" message = "${coalesce(var.questions_changing_message, var.message)}"
type = "query alert" type = "query alert"
query = <<EOF query = <<EOF
avg(${var.questions_changing_timeframe}): avg(${var.questions_changing_timeframe}):
anomalies( anomalies(
default( avg:gcp.cloudsql.database.mysql.questions{${data.template_file.filter.rendered}} by {database_id},
avg:gcp.cloudsql.database.mysql.questions{project_id:${var.project_id},database_id:${var.project_id}:${var.questions_changing_database_ids[count.index]}},
0),
'${var.questions_changing_anomaly_detection_algorithm}', '${var.questions_changing_anomaly_detection_algorithm}',
${var.questions_changing_deviations}, ${var.questions_changing_deviations},
direction='${var.questions_changing_direction}', direction='${var.questions_changing_direction}',
alert_window='last_30m',
interval=20,
count_default_zero='false',
seasonality='${var.questions_changing_seasonality}' seasonality='${var.questions_changing_seasonality}'
) )
> ${var.questions_changing_threshold_critical} > ${var.questions_changing_threshold_critical}
EOF EOF
thresholds { thresholds {
warning = "${var.questions_changing_threshold_warning}" warning = "${var.questions_changing_threshold_warning}"
critical = "${var.questions_changing_threshold_critical}" critical = "${var.questions_changing_threshold_critical}"
critical_recovery = "${var.questions_changing_threshold_critical_recovery}"
} }
notify_no_data = false
require_full_window = false
renotify_interval = 0
notify_audit = false notify_audit = false
locked = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false require_full_window = false
evaluation_delay = "${var.delay}" notify_no_data = false
new_host_delay = "${var.delay}" renotify_interval = 0
silenced = "${var.questions_changing_silenced}"
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
silenced = "${var.questions_changing_silenced}"
tags = [ tags = [
"team:gcp", "team:gcp",
@ -194,7 +200,6 @@ EOF
"env:${var.environment}", "env:${var.environment}",
"resource:cloud-sql", "resource:cloud-sql",
"engine:mysql", "engine:mysql",
"database_id:${var.project_id}:${var.questions_changing_database_ids[count.index]}",
"${var.questions_changing_extra_tags}", "${var.questions_changing_extra_tags}",
] ]
} }