MON-224 time aggregator for all monitors

This commit is contained in:
Rafael Romero Carmona 2018-08-02 11:31:55 +02:00 committed by Quentin Manfroi
parent 82d3ee2f91
commit 4c4c24a34f
6 changed files with 70 additions and 9 deletions

View File

@ -32,6 +32,7 @@ Creates DataDog monitors with the following checks:
| cpu_utilization_silenced | Groups to mute for GCP Cloud SQL CPU Utilization monitor | map | `<map>` | no |
| cpu_utilization_threshold_critical | CPU Utilization in percentage (critical threshold) | string | `90` | no |
| cpu_utilization_threshold_warning | CPU Utilization in percentage (warning threshold) | string | `80` | no |
| cpu_utilization_time_aggregator | Time aggregator for the CPU Utilization monitor | string | `avg` | no |
| cpu_utilization_timeframe | Timeframe for the CPU Utilization monitor | string | `last_15m` | no |
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
| disk_utilization_extra_tags | Extra tags for GCP Cloud SQL Disk Utilization monitor | list | `<list>` | no |
@ -46,17 +47,20 @@ Creates DataDog monitors with the following checks:
| disk_utilization_forecast_silenced | Groups to mute for GCP Cloud SQL Disk Utilization Forecast monitor | map | `<map>` | no |
| disk_utilization_forecast_threshold_critical | Disk Utilization Forecast in percentage (critical threshold) | string | `80` | no |
| disk_utilization_forecast_threshold_critical_recovery | Disk Utilization Forecast in percentage (recovery threshold) | string | `72` | no |
| disk_utilization_forecast_time_aggregator | Time aggregator for the Disk Utilization Forecast monitor | string | `max` | no |
| disk_utilization_forecast_timeframe | Timeframe for the Disk Utilization Forecast monitor | string | `next_1w` | no |
| disk_utilization_message | Custom message for the Disk Utilization monitor | string | `` | no |
| disk_utilization_silenced | Groups to mute for GCP Cloud SQL Disk Utilization monitor | map | `<map>` | no |
| disk_utilization_threshold_critical | Disk Utilization in percentage (critical threshold) | string | `90` | no |
| disk_utilization_threshold_warning | Disk Utilization in percentage (warning threshold) | string | `80` | no |
| disk_utilization_time_aggregator | Time aggregator for the Disk Utilization monitor | string | `avg` | no |
| disk_utilization_timeframe | Timeframe for the Disk Utilization monitor | string | `last_5m` | no |
| environment | Architecture environment | string | - | yes |
| failover_unavailable_extra_tags | Extra tags for GCP Cloud SQL Failover Unavailable monitor | list | `<list>` | no |
| failover_unavailable_message | Custom message for the Failover Unavailable monitor | string | `` | no |
| failover_unavailable_silenced | Groups to mute for GCP Cloud SQL Failover Unavailable monitor | map | `<map>` | no |
| failover_unavailable_threshold_critical | Failover Unavailable critical threshold | string | `0` | no |
| failover_unavailable_time_aggregator | Time aggregator for the Failover Unavailable monitor | string | `max` | no |
| failover_unavailable_timeframe | Timeframe for the Failover Unavailable monitor | string | `last_5m` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
@ -72,11 +76,13 @@ Creates DataDog monitors with the following checks:
| memory_utilization_forecast_silenced | Groups to mute for GCP Cloud SQL Memory Utilization Forecast monitor | map | `<map>` | no |
| memory_utilization_forecast_threshold_critical | Memory Utilization Forecast in percentage (critical threshold) | string | `90` | no |
| memory_utilization_forecast_threshold_critical_recovery | Memory Utilization Forecast in percentage (recovery threshold) | string | `81` | no |
| memory_utilization_forecast_time_aggregator | Time aggregator for the Memory Utilization Forecast monitor | string | `max` | no |
| memory_utilization_forecast_timeframe | Timeframe for the Memory Utilization Forecast monitor | string | `next_3d` | no |
| memory_utilization_message | Custom message for the Memory Utilization monitor | string | `` | no |
| memory_utilization_silenced | Groups to mute for GCP Cloud SQL Memory Utilization monitor | map | `<map>` | no |
| memory_utilization_threshold_critical | Memory Utilization in percentage (critical threshold) | string | `90` | no |
| memory_utilization_threshold_warning | Memory Utilization in percentage (warning threshold) | string | `80` | no |
| memory_utilization_time_aggregator | Time aggregator for the Memory Utilization monitor | string | `avg` | no |
| memory_utilization_timeframe | Timeframe for the Memory Utilization monitor | string | `last_5m` | no |
| message | Message sent when a monitor is triggered | string | - | yes |
| project_id | ID of the GCP Project | string | - | yes |

View File

@ -43,6 +43,12 @@ variable "cpu_utilization_message" {
default = ""
}
# Aggregation function that prefixes the cpu_utilization monitor query; it is
# applied over the window given by var.cpu_utilization_timeframe.
variable "cpu_utilization_time_aggregator" {
  type        = "string"
  default     = "avg"
  description = "Time aggregator for the CPU Utilization monitor"
}
variable "cpu_utilization_timeframe" {
description = "Timeframe for the CPU Utilization monitor"
type = "string"
@ -83,6 +89,12 @@ variable "disk_utilization_message" {
default = ""
}
# Aggregation function that prefixes the disk_utilization monitor query; it is
# applied over the window given by var.disk_utilization_timeframe.
variable "disk_utilization_time_aggregator" {
  type        = "string"
  default     = "avg"
  description = "Time aggregator for the Disk Utilization monitor"
}
variable "disk_utilization_timeframe" {
description = "Timeframe for the Disk Utilization monitor"
type = "string"
@ -123,6 +135,12 @@ variable "disk_utilization_forecast_message" {
default = ""
}
# Aggregation function that prefixes the disk_utilization_forecast monitor
# query; it wraps the forecast() expression and is applied over the window
# given by var.disk_utilization_forecast_timeframe.
variable "disk_utilization_forecast_time_aggregator" {
  type        = "string"
  default     = "max"
  description = "Time aggregator for the Disk Utilization Forecast monitor"
}
variable "disk_utilization_forecast_timeframe" {
description = "Timeframe for the Disk Utilization Forecast monitor"
type = "string"
@ -198,6 +216,11 @@ variable "memory_utilization_message" {
default = ""
}
# Aggregation function that prefixes the memory_utilization monitor query; it
# is applied over the window given by var.memory_utilization_timeframe.
# The type is declared explicitly so this variable matches the other
# *_time_aggregator variables instead of relying on inference from the default.
variable "memory_utilization_time_aggregator" {
  description = "Time aggregator for the Memory Utilization monitor"
  type        = "string"
  default     = "avg"
}
variable "memory_utilization_timeframe" {
description = "Timeframe for the Memory Utilization monitor"
default = "last_5m"
@ -234,6 +257,11 @@ variable "memory_utilization_forecast_message" {
default = ""
}
# Aggregation function that prefixes the memory_utilization_forecast monitor
# query; it wraps the forecast() expression and is applied over the window
# given by var.memory_utilization_forecast_timeframe.
# The type is declared explicitly so this variable matches the other
# *_time_aggregator variables instead of relying on inference from the default.
variable "memory_utilization_forecast_time_aggregator" {
  description = "Time aggregator for the Memory Utilization Forecast monitor"
  type        = "string"
  default     = "max"
}
variable "memory_utilization_forecast_timeframe" {
description = "Timeframe for the Memory Utilization Forecast monitor"
default = "next_3d"
@ -307,6 +335,12 @@ variable "failover_unavailable_message" {
default = ""
}
# Aggregation function that prefixes the failover_unavailable monitor query; it
# is applied over the window given by var.failover_unavailable_timeframe.
variable "failover_unavailable_time_aggregator" {
  description = "Time aggregator for the Failover Unavailable monitor"
  type        = "string"
  default     = "max"
}
variable "failover_unavailable_timeframe" {
description = "Timeframe for the Failover Unavailable monitor"
type = "string"

View File

@ -21,7 +21,7 @@ resource "datadog_monitor" "cpu_utilization" {
type = "metric alert"
query = <<EOF
avg(${var.cpu_utilization_timeframe}):
${var.cpu_utilization_time_aggregator}(${var.cpu_utilization_timeframe}):
avg:gcp.cloudsql.database.cpu.utilization{${data.template_file.filter.rendered}}
by {database_id} * 100
> ${var.cpu_utilization_threshold_critical}
@ -66,7 +66,7 @@ resource "datadog_monitor" "disk_utilization" {
type = "metric alert"
query = <<EOF
avg(${var.disk_utilization_timeframe}):
${var.disk_utilization_time_aggregator}(${var.disk_utilization_timeframe}):
avg:gcp.cloudsql.database.disk.utilization{${data.template_file.filter.rendered}}
by {database_id} * 100
> ${var.disk_utilization_threshold_critical}
@ -111,7 +111,7 @@ resource "datadog_monitor" "disk_utilization_forecast" {
type = "metric alert"
query = <<EOF
max(${var.disk_utilization_forecast_timeframe}):
${var.disk_utilization_forecast_time_aggregator}(${var.disk_utilization_forecast_timeframe}):
forecast(
avg:gcp.cloudsql.database.disk.utilization{${data.template_file.filter.rendered}} by {database_id} * 100,
'${var.disk_utilization_forecast_algorithm}',
@ -161,7 +161,7 @@ resource "datadog_monitor" "memory_utilization" {
type = "metric alert"
query = <<EOF
avg(${var.memory_utilization_timeframe}):
${var.memory_utilization_time_aggregator}(${var.memory_utilization_timeframe}):
avg:gcp.cloudsql.database.memory.utilization{${data.template_file.filter.rendered}}
by {database_id} * 100
> ${var.memory_utilization_threshold_critical}
@ -206,7 +206,7 @@ resource "datadog_monitor" "memory_utilization_forecast" {
type = "query alert"
query = <<EOF
max(${var.memory_utilization_forecast_timeframe}):
${var.memory_utilization_forecast_time_aggregator}(${var.memory_utilization_forecast_timeframe}):
forecast(
avg:gcp.cloudsql.database.memory.utilization{${data.template_file.filter.rendered}} by {database_id} * 100,
'${var.memory_utilization_forecast_algorithm}',
@ -256,7 +256,7 @@ resource "datadog_monitor" "failover_unavailable" {
type = "metric alert"
query = <<EOF
max(${var.failover_unavailable_timeframe}):
${var.failover_unavailable_time_aggregator}(${var.failover_unavailable_timeframe}):
avg:gcp.cloudsql.database.available_for_failover{${data.template_file.filter.rendered}}
by {database_id}
<= ${var.failover_unavailable_threshold_critical}

View File

@ -43,6 +43,7 @@ Creates DataDog monitors with the following checks:
| queries_changing_anomaly_threshold_critical | Queries Changing critical threshold | string | `1` | no |
| queries_changing_anomaly_threshold_critical_recovery | Queries Changing critical recovery threshold | string | `0.99` | no |
| queries_changing_anomaly_threshold_warning | Queries Changing warning threshold | string | `0.5` | no |
| queries_changing_anomaly_time_aggregator | Time aggregator for the Queries Changing monitor | string | `avg` | no |
| queries_changing_anomaly_timeframe | Timeframe for the Queries Changing monitor | string | `last_1h` | no |
| questions_changing_anomaly_alert_window | Alert window. | string | `last_30m` | no |
| questions_changing_anomaly_count_default_zero | Count default zero. | string | `false` | no |
@ -57,12 +58,14 @@ Creates DataDog monitors with the following checks:
| questions_changing_anomaly_threshold_critical | Questions Changing critical threshold | string | `1` | no |
| questions_changing_anomaly_threshold_critical_recovery | Questions Changing critical recovery threshold | string | `0.99` | no |
| questions_changing_anomaly_threshold_warning | Questions Changing warning threshold | string | `0.5` | no |
| questions_changing_anomaly_time_aggregator | Time aggregator for the Questions Changing monitor | string | `avg` | no |
| questions_changing_anomaly_timeframe | Timeframe for the Questions Changing monitor | string | `last_1h` | no |
| replication_lag_extra_tags | Extra tags for GCP Cloud SQL Replication Lag monitor | list | `<list>` | no |
| replication_lag_message | Custom message for the Replication Lag monitor | string | `` | no |
| replication_lag_silenced | Groups to mute for GCP Cloud SQL Replication Lag monitor | map | `<map>` | no |
| replication_lag_threshold_critical | Seconds behind the master (critical threshold) | string | `180` | no |
| replication_lag_threshold_warning | Seconds behind the master (warning threshold) | string | `90` | no |
| replication_lag_time_aggregator | Time aggregator for the Replication Lag monitor | string | `min` | no |
| replication_lag_timeframe | Timeframe for the Replication Lag monitor | string | `last_10m` | no |
## Outputs

View File

@ -43,6 +43,12 @@ variable "replication_lag_message" {
default = ""
}
# Aggregation function that prefixes the replication_lag monitor query; it is
# applied over the window given by var.replication_lag_timeframe.
variable "replication_lag_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Time aggregator for the Replication Lag monitor"
}
variable "replication_lag_timeframe" {
description = "Timeframe for the Replication Lag monitor"
type = "string"
@ -83,6 +89,12 @@ variable "queries_changing_anomaly_message" {
default = ""
}
# Aggregation function that prefixes the queries_changing_anomaly monitor
# query; it wraps the anomalies() expression and is applied over the window
# given by var.queries_changing_anomaly_timeframe.
variable "queries_changing_anomaly_time_aggregator" {
  description = "Time aggregator for the Queries Changing monitor"
  type        = "string"
  default     = "avg"
}
variable "queries_changing_anomaly_timeframe" {
description = "Timeframe for the Queries Changing mon monitor"
type = "string"
@ -171,6 +183,12 @@ variable "questions_changing_anomaly_message" {
default = ""
}
# Aggregation function that prefixes the questions_changing_anomaly monitor
# query; it wraps the anomalies() expression and is applied over the window
# given by var.questions_changing_anomaly_timeframe.
variable "questions_changing_anomaly_time_aggregator" {
  type        = "string"
  default     = "avg"
  description = "Time aggregator for the Questions Changing monitor"
}
variable "questions_changing_anomaly_timeframe" {
description = "Timeframe for the Questions Changing monitor"
type = "string"

View File

@ -21,7 +21,7 @@ resource "datadog_monitor" "replication_lag" {
type = "metric alert"
query = <<EOF
min(${var.replication_lag_timeframe}):
${var.replication_lag_time_aggregator}(${var.replication_lag_timeframe}):
avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{${data.template_file.filter.rendered}}
by {database_id}
> ${var.replication_lag_threshold_critical}
@ -67,7 +67,7 @@ resource "datadog_monitor" "queries_changing_anomaly" {
type = "query alert"
query = <<EOF
avg(${var.queries_changing_anomaly_timeframe}):
${var.queries_changing_anomaly_time_aggregator}(${var.queries_changing_anomaly_timeframe}):
anomalies(
avg:gcp.cloudsql.database.mysql.queries{${data.template_file.filter.rendered}} by {database_id}.as_count(),
'${var.queries_changing_anomaly_detection_algorithm}',
@ -121,7 +121,7 @@ resource "datadog_monitor" "questions_changing_anomaly" {
type = "query alert"
query = <<EOF
avg(${var.questions_changing_anomaly_timeframe}):
${var.questions_changing_anomaly_time_aggregator}(${var.questions_changing_anomaly_timeframe}):
anomalies(
avg:gcp.cloudsql.database.mysql.questions{${data.template_file.filter.rendered}} by {database_id},
'${var.questions_changing_anomaly_detection_algorithm}',