MON-224 time aggregator for all monitors

2018-08-02 11:31:55 +02:00 · 2018-08-02 11:31:55 +02:00 · 4c4c24a34f
commit 4c4c24a34f
parent 82d3ee2f91
6 changed files with 70 additions and 9 deletions
--- a/cloud/gcp/cloud-sql/instance/README.md
+++ b/cloud/gcp/cloud-sql/instance/README.md
@ -32,6 +32,7 @@ Creates DataDog monitors with the following checks:
 | cpu_utilization_silenced | Groups to mute for GCP Cloud SQL CPU Utilization monitor | map | `<map>` | no |
 | cpu_utilization_threshold_critical | CPU Utilization in percentage (critical threshold) | string | `90` | no |
 | cpu_utilization_threshold_warning | CPU Utilization in percentage (warning threshold) | string | `80` | no |
+| cpu_utilization_time_aggregator | Time aggregator for the CPU Utilization monitor | string | `avg` | no |
 | cpu_utilization_timeframe | Timeframe for the CPU Utilization monitor | string | `last_15m` | no |
 | delay | Delay in seconds for the metric evaluation | string | `900` | no |
 | disk_utilization_extra_tags | Extra tags for GCP Cloud SQL CPU Utilization monitor | list | `<list>` | no |
@ -46,17 +47,20 @@ Creates DataDog monitors with the following checks:
 | disk_utilization_forecast_silenced | Groups to mute for GCP Cloud SQL Disk Utilization Forecast monitor | map | `<map>` | no |
 | disk_utilization_forecast_threshold_critical | Disk Utilization Forecast in percentage (critical threshold) | string | `80` | no |
 | disk_utilization_forecast_threshold_critical_recovery | Disk Utilization Forecast in percentage (recovery threshold) | string | `72` | no |
+| disk_utilization_forecast_time_aggregator | Time aggregator for the Disk Utilization Forecast monitor | string | `max` | no |
 | disk_utilization_forecast_timeframe | Timeframe for the Disk Utilization Forecast monitor | string | `next_1w` | no |
 | disk_utilization_message | Custom message for the Disk Utilization monitor | string | `` | no |
 | disk_utilization_silenced | Groups to mute for GCP Cloud SQL Disk Utilization monitor | map | `<map>` | no |
 | disk_utilization_threshold_critical | Disk Utilization in percentage (critical threshold) | string | `90` | no |
 | disk_utilization_threshold_warning | Disk Utilization in percentage (warning threshold) | string | `80` | no |
+| disk_utilization_time_aggregator | Time aggregator for the Disk Utilization monitor | string | `avg` | no |
 | disk_utilization_timeframe | Timeframe for the Disk Utilization monitor | string | `last_5m` | no |
 | environment | Architecture environment | string | - | yes |
 | failover_unavailable_extra_tags | Extra tags for GCP Cloud SQL Failover Unavailable monitor | list | `<list>` | no |
 | failover_unavailable_message | Custom message for the Failover Unavailable monitor | string | `` | no |
 | failover_unavailable_silenced | Groups to mute for GCP Cloud SQL Failover Unavailable monitor | map | `<map>` | no |
 | failover_unavailable_threshold_critical | Failover Unavailable critical threshold | string | `0` | no |
+| failover_unavailable_time_aggregator | Time aggregator for the Failover Unavailable monitor | string | `max` | no |
 | failover_unavailable_timeframe | Timeframe for the Failover Unavailable monitor | string | `last_5m` | no |
 | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
 | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
@ -72,11 +76,13 @@ Creates DataDog monitors with the following checks:
 | memory_utilization_forecast_silenced | Groups to mute for GCP Cloud SQL Memory Utilization Forecast monitor | map | `<map>` | no |
 | memory_utilization_forecast_threshold_critical | Memory Utilization Forecast in percentage (warning threshold) | string | `90` | no |
 | memory_utilization_forecast_threshold_critical_recovery | Memory Utilization Forecast in percentage (recovery threshold) | string | `81` | no |
+| memory_utilization_forecast_time_aggregator | Time aggregator for the Memory Utilization Forecast monitor | string | `max` | no |
 | memory_utilization_forecast_timeframe | Timeframe for the Memory Utilization Forecast monitor | string | `next_3d` | no |
 | memory_utilization_message | Custom message for the Memory Utilization monitor | string | `` | no |
 | memory_utilization_silenced | Groups to mute for GCP Cloud SQL Memory Utilization monitor | map | `<map>` | no |
 | memory_utilization_threshold_critical | Memory Utilization in percentage (critical threshold) | string | `90` | no |
 | memory_utilization_threshold_warning | Memory Utilization in percentage (warning threshold) | string | `80` | no |
+| memory_utilization_time_aggregator | Time aggregator for the Memory Utilization monitor | string | `avg` | no |
 | memory_utilization_timeframe | Timeframe for the Memory Utilization monitor | string | `last_5m` | no |
 | message | Message sent when a monitor is triggered | string | - | yes |
 | project_id | ID of the GCP Project | string | - | yes |
--- a/cloud/gcp/cloud-sql/instance/inputs.tf
+++ b/cloud/gcp/cloud-sql/instance/inputs.tf
@ -43,6 +43,12 @@ variable "cpu_utilization_message" {
  default     = ""
 }

+variable "cpu_utilization_time_aggregator" {
+  description = "Time aggregator for the CPU Utilization monitor"
+  type        = "string"
+  default     = "avg"
+}
+
 variable "cpu_utilization_timeframe" {
  description = "Timeframe for the CPU Utilization monitor"
  type        = "string"
@ -83,6 +89,12 @@ variable "disk_utilization_message" {
  default     = ""
 }

+variable "disk_utilization_time_aggregator" {
+  description = "Time aggregator for the Disk Utilization monitor"
+  type        = "string"
+  default     = "avg"
+}
+
 variable "disk_utilization_timeframe" {
  description = "Timeframe for the Disk Utilization monitor"
  type        = "string"
@ -123,6 +135,12 @@ variable "disk_utilization_forecast_message" {
  default     = ""
 }

+variable "disk_utilization_forecast_time_aggregator" {
+  description = "Time aggregator for the Disk Utilization Forecast monitor"
+  type        = "string"
+  default     = "max"
+}
+
 variable "disk_utilization_forecast_timeframe" {
  description = "Timeframe for the Disk Utilization Forecast monitor"
  type        = "string"
@ -198,6 +216,11 @@ variable "memory_utilization_message" {
  default     = ""
 }

+variable "memory_utilization_time_aggregator" {
+  description = "Time aggregator for the Memory Utilization monitor"
+  default     = "avg"
+}
+
 variable "memory_utilization_timeframe" {
  description = "Timeframe for the Memory Utilization monitor"
  default     = "last_5m"
@ -234,6 +257,11 @@ variable "memory_utilization_forecast_message" {
  default     = ""
 }

+variable "memory_utilization_forecast_time_aggregator" {
+  description = "Time aggregator for the Memory Utilization Forecast monitor"
+  default     = "max"
+}
+
 variable "memory_utilization_forecast_timeframe" {
  description = "Timeframe for the Memory Utilization Forecast monitor"
  default     = "next_3d"
@ -307,6 +335,12 @@ variable "failover_unavailable_message" {
  default     = ""
 }

+variable "failover_unavailable_time_aggregator" {
+  description = "Time aggregator for the Failover Unavailable monitor"
+  type        = "string"
+  default     = "max"
+}
+
 variable "failover_unavailable_timeframe" {
  description = "Timeframe for the Failover Unavailable monitor"
  type        = "string"
--- a/cloud/gcp/cloud-sql/instance/monitors-cloud-sql-instance.tf
+++ b/cloud/gcp/cloud-sql/instance/monitors-cloud-sql-instance.tf
@ -21,7 +21,7 @@ resource "datadog_monitor" "cpu_utilization" {
  type = "metric alert"

  query = <<EOF
-  avg(${var.cpu_utilization_timeframe}):
+  ${var.cpu_utilization_time_aggregator}(${var.cpu_utilization_timeframe}):
    avg:gcp.cloudsql.database.cpu.utilization{${data.template_file.filter.rendered}}
    by {database_id} * 100
  > ${var.cpu_utilization_threshold_critical}
@ -66,7 +66,7 @@ resource "datadog_monitor" "disk_utilization" {
  type = "metric alert"

  query = <<EOF
-  avg(${var.disk_utilization_timeframe}):
+  ${var.disk_utilization_time_aggregator}(${var.disk_utilization_timeframe}):
    avg:gcp.cloudsql.database.disk.utilization{${data.template_file.filter.rendered}}
    by {database_id} * 100
    > ${var.disk_utilization_threshold_critical}
@ -111,7 +111,7 @@ resource "datadog_monitor" "disk_utilization_forecast" {
  type = "metric alert"

  query = <<EOF
-  max(${var.disk_utilization_forecast_timeframe}):
+  ${var.disk_utilization_forecast_time_aggregator}(${var.disk_utilization_forecast_timeframe}):
    forecast(
      avg:gcp.cloudsql.database.disk.utilization{${data.template_file.filter.rendered}} by {database_id} * 100,
      '${var.disk_utilization_forecast_algorithm}',
@ -161,7 +161,7 @@ resource "datadog_monitor" "memory_utilization" {
  type = "metric alert"

  query = <<EOF
-  avg(${var.memory_utilization_timeframe}):
+  ${var.memory_utilization_time_aggregator}(${var.memory_utilization_timeframe}):
    avg:gcp.cloudsql.database.memory.utilization{${data.template_file.filter.rendered}}
    by {database_id} * 100
  > ${var.memory_utilization_threshold_critical}
@ -206,7 +206,7 @@ resource "datadog_monitor" "memory_utilization_forecast" {
  type = "query alert"

  query = <<EOF
-  max(${var.memory_utilization_forecast_timeframe}):
+  ${var.memory_utilization_forecast_time_aggregator}(${var.memory_utilization_forecast_timeframe}):
    forecast(
      avg:gcp.cloudsql.database.memory.utilization{${data.template_file.filter.rendered}} by {database_id} * 100,
      '${var.memory_utilization_forecast_algorithm}',
@ -256,7 +256,7 @@ resource "datadog_monitor" "failover_unavailable" {
  type = "metric alert"

  query = <<EOF
-  max(${var.failover_unavailable_timeframe}):
+  ${var.failover_unavailable_time_aggregator}(${var.failover_unavailable_timeframe}):
    avg:gcp.cloudsql.database.available_for_failover{${data.template_file.filter.rendered}}
    by {database_id}
  <= ${var.failover_unavailable_threshold_critical}
--- a/cloud/gcp/cloud-sql/mysql/README.md
+++ b/cloud/gcp/cloud-sql/mysql/README.md
@ -43,6 +43,7 @@ Creates DataDog monitors with the following checks:
 | queries_changing_anomaly_threshold_critical | Queries Changing critical threshold | string | `1` | no |
 | queries_changing_anomaly_threshold_critical_recovery | Queries Changing critical recovery threshold | string | `0.99` | no |
 | queries_changing_anomaly_threshold_warning | Queries Changing warning threshold | string | `0.5` | no |
+| queries_changing_anomaly_time_aggregator | Time aggregator for the Queries Changing monitor | string | `avg` | no |
 | queries_changing_anomaly_timeframe | Timeframe for the Queries Changing mon monitor | string | `last_1h` | no |
 | questions_changing_anomaly_alert_window | Alert window. | string | `last_30m` | no |
 | questions_changing_anomaly_count_default_zero | Count default zero. | string | `false` | no |
@ -57,12 +58,14 @@ Creates DataDog monitors with the following checks:
 | questions_changing_anomaly_threshold_critical | Questions Changing critical threshold | string | `1` | no |
 | questions_changing_anomaly_threshold_critical_recovery | Questions Changing critical recovery threshold | string | `0.99` | no |
 | questions_changing_anomaly_threshold_warning | Questions Changing warning threshold | string | `0.5` | no |
+| questions_changing_anomaly_time_aggregator | Time aggregator for the Questions Changing monitor | string | `avg` | no |
 | questions_changing_anomaly_timeframe | Timeframe for the Questions Changing monitor | string | `last_1h` | no |
 | replication_lag_extra_tags | Extra tags for GCP Cloud SQL SQL Replication monitor | list | `<list>` | no |
 | replication_lag_message | Custom message for the Replication Lag monitor | string | `` | no |
 | replication_lag_silenced | Groups to mute for GCP Cloud SQL Replication Lag monitor | map | `<map>` | no |
 | replication_lag_threshold_critical | Seconds behind the master (critical threshold) | string | `180` | no |
 | replication_lag_threshold_warning | Seconds behind the master (warning threshold) | string | `90` | no |
+| replication_lag_time_aggregator | Time aggregator for the Replication Lag monitor | string | `min` | no |
 | replication_lag_timeframe | Timeframe for the Replication Lag monitor | string | `last_10m` | no |

 ## Outputs
--- a/cloud/gcp/cloud-sql/mysql/inputs.tf
+++ b/cloud/gcp/cloud-sql/mysql/inputs.tf
@ -43,6 +43,12 @@ variable "replication_lag_message" {
  default     = ""
 }

+variable "replication_lag_time_aggregator" {
+  description = "Time aggregator for the Replication Lag monitor"
+  type        = "string"
+  default     = "min"
+}
+
 variable "replication_lag_timeframe" {
  description = "Timeframe for the Replication Lag monitor"
  type        = "string"
@ -83,6 +89,12 @@ variable "queries_changing_anomaly_message" {
  default     = ""
 }

+variable "queries_changing_anomaly_time_aggregator" {
+  description = "Time aggregator for the Queries Changing monitor"
+  type        = "string"
+  default     = "avg"
+}
+
 variable "queries_changing_anomaly_timeframe" {
  description = "Timeframe for the Queries Changing mon monitor"
  type        = "string"
@ -171,6 +183,12 @@ variable "questions_changing_anomaly_message" {
  default     = ""
 }

+variable "questions_changing_anomaly_time_aggregator" {
+  description = "Time aggregator for the Questions Changing monitor"
+  type        = "string"
+  default     = "avg"
+}
+
 variable "questions_changing_anomaly_timeframe" {
  description = "Timeframe for the Questions Changing monitor"
  type        = "string"
--- a/cloud/gcp/cloud-sql/mysql/monitors-cloudsql-mysql.tf
+++ b/cloud/gcp/cloud-sql/mysql/monitors-cloudsql-mysql.tf
@ -21,7 +21,7 @@ resource "datadog_monitor" "replication_lag" {
  type = "metric alert"

  query = <<EOF
-  min(${var.replication_lag_timeframe}):
+  ${var.replication_lag_time_aggregator}(${var.replication_lag_timeframe}):
    avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{${data.template_file.filter.rendered}}
    by {database_id}
  > ${var.replication_lag_threshold_critical}
@ -67,7 +67,7 @@ resource "datadog_monitor" "queries_changing_anomaly" {
  type = "query alert"

  query = <<EOF
-    avg(${var.queries_changing_anomaly_timeframe}):
+    ${var.queries_changing_anomaly_time_aggregator}(${var.queries_changing_anomaly_timeframe}):
      anomalies(
        avg:gcp.cloudsql.database.mysql.queries{${data.template_file.filter.rendered}} by {database_id}.as_count(),
        '${var.queries_changing_anomaly_detection_algorithm}',
@ -121,7 +121,7 @@ resource "datadog_monitor" "questions_changing_anomaly" {
  type = "query alert"

  query = <<EOF
-    avg(${var.questions_changing_anomaly_timeframe}):
+    ${var.questions_changing_anomaly_time_aggregator}(${var.questions_changing_anomaly_timeframe}):
      anomalies(
        avg:gcp.cloudsql.database.mysql.questions{${data.template_file.filter.rendered}} by {database_id},
        '${var.questions_changing_anomaly_detection_algorithm}',