From 4c474be541eb5f3a3f14bf2a8bd7716803651ecf Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 16:42:58 +0100 Subject: [PATCH] MON-80 Add monitors and update variables --- cloud/azure/iothubs/inputs.tf | 44 +++++++++++----- cloud/azure/iothubs/monitors-iothubs.tf | 69 ++++++++++++++++++++----- 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 5de7dab..38b1b44 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,38 +1,54 @@ -variable "critical_escalation_group" {} - -variable "warning_escalation_group" {} - variable "environment" {} variable "stack" {} variable "client_name" {} -## IOT hubs variable "delay" { default = 600 } -variable "warning_jobs_failed" { - default = 5 +## IOT hubs +variable "jobs_failed_threshold_warning" { + default = 0 } -variable "critical_jobs_failed" { +variable "jobs_failed_threshold_critical" { default = 10 } -variable "warning_listjobs_failed" { - default = 5 +variable "jobs_failed_message" {} + +variable "listjobs_failed_threshold_warning" { + default = 0 } -variable "critical_listjobs_failed" { +variable "listjobs_failed_threshold_critical" { default = 10 } -variable "warning_queryjobs_failed" { - default = 5 +variable "listjobs_failed_message" {} + +variable "queryjobs_failed_threshold_warning" { + default = 0 } -variable "critical_queryjobs_failed" { +variable "queryjobs_failed_threshold_critical" { + default = 10 +} + +variable "queryjobs_failed_message" {} + +variable "status_message" {} + +variable "total_devices_message" {} + +variable "c2d_methods_failed_message" {} + +variable "c2d_methods_failed_threshold_warning" { + default = 0 +} + +variable "c2d_methods_failed_threshold_critical" { default = 10 } \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index e333808..12f3d9a 100644 --- 
a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,13 +1,13 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_jobs_failed}" - critical = "${var.critical_jobs_failed}" + warning = "${var.jobs_failed_threshold_warning}" + critical = "${var.jobs_failed_threshold_critical}" } notify_no_data = false @@ -24,14 +24,14 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > 
${var.critical_listjobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_listjobs_failed}" - critical = "${var.critical_listjobs_failed}" + warning = "${var.listjobs_failed_threshold_warning}" + critical = "${var.listjobs_failed_threshold_critical}" } notify_no_data = false @@ -48,14 +48,14 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.queryjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_queryjobs_failed}" - critical = "${var.critical_queryjobs_failed}" + warning = "${var.queryjobs_failed_threshold_warning}" + critical = "${var.queryjobs_failed_threshold_critical}" }
notify_no_data = false @@ -72,11 +72,54 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.status_message}" query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" type = "query alert" + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "total_devices" { + name = "[${var.environment}] Total devices is wrong on {{name}} " + message = "${var.total_devices_message}" + + query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{*} by {name,resource_group} == 0" + type = "query alert" + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_c2d_methods_failed" { + name = "[${var.environment}] Too many c2d methods failure on {{name}} " + message = "${var.c2d_methods_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_methods_failed_threshold_warning}" + critical = "${var.c2d_methods_failed_threshold_critical}" + } + 
notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 60