From e4e929ec1d6bb380da5eed54bd324ab33fc513ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 12:47:19 +0100 Subject: [PATCH 01/93] MON-78 Add datadog monitor for stream analytics --- cloud/azure/stream-analytics/inputs.tf | 44 +++++++++ .../monitors-stream-analytics.tf | 92 +++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 cloud/azure/stream-analytics/inputs.tf create mode 100644 cloud/azure/stream-analytics/monitors-stream-analytics.tf diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf new file mode 100644 index 0000000..e9bc507 --- /dev/null +++ b/cloud/azure/stream-analytics/inputs.tf @@ -0,0 +1,44 @@ +variable "hno_escalation_group" {} +variable "ho_escalation_group" {} + +variable "environment" {} + +variable "notify_no_data" { + default = "false" +} + +variable "delay" { + default = "600" +} + +variable "su_utilization_warning" { + default = 60 +} + +variable "su_utilization_critical" { + default = 80 +} + +variable "failed_function_requests_warning" { + default = 0 +} + +variable "failed_function_requests_critical" { + default = 10 +} + +variable "conversion_errors_warning" { + default = 0 +} + +variable "conversion_errors_critical" { + default = 10 +} + +variable "runtime_errors_warning" { + default = 0 +} + +variable "runtime_errors_critical" { + default = 0 +} diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf new file mode 100644 index 0000000..f18d7f1 --- /dev/null +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -0,0 +1,92 @@ +resource "datadog_monitor" "SU_utilization" { + name = "[${var.environment} SU utilization at more than ${var.su_utilization_critical}% on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = 
"avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.su_utilization_warning}" + critical = "${var.su_utilization_critical}" + } +} + +resource "datadog_monitor" "failed_function_requests" { + name = "[${var.environment} More than ${var.failed_function_requests_critical} failed function requests on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.failed_function_requests_warning}" + critical = "${var.failed_function_requests_critical}" + } +} + +resource "datadog_monitor" "conversion_errors" { + name = "[${var.environment} More than ${var.conversion_errors_critical} conversion errors on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + 
evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.conversion_errors_warning}" + critical = "${var.conversion_errors_critical}" + } +} + +resource "datadog_monitor" "runtime_errors" { + name = "[${var.environment} More than ${var.runtime_errors_critical} runtime errors on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.runtime_errors_warning}" + critical = "${var.runtime_errors_critical}" + } +} + From 17fa260daf594ab65043310053f0f534d49bff7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 13:08:17 +0100 Subject: [PATCH 02/93] MON-78 Corrected bad warning value for runtime_errors --- cloud/azure/stream-analytics/inputs.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index e9bc507..4ea5ee6 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -40,5 +40,5 @@ variable "runtime_errors_warning" { } variable "runtime_errors_critical" { - default = 0 + default = 10 } From daabb7244af225ccffc2580fbb2b441586163bba Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 14:30:05 +0100 Subject: [PATCH 
03/93] MON-80 Add inputs and monitors files --- cloud/azure/iothubs/inputs.tf | 36 ++++++++++ cloud/azure/iothubs/monitors-iothubs.tf | 90 +++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 cloud/azure/iothubs/inputs.tf create mode 100644 cloud/azure/iothubs/monitors-iothubs.tf diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf new file mode 100644 index 0000000..ddc3456 --- /dev/null +++ b/cloud/azure/iothubs/inputs.tf @@ -0,0 +1,36 @@ +variable "hno_escalation_group" {} + +variable "ho_escalation_group" {} + +variable "environment" {} + +variable "subscription_id" {} + +## IOT hubs +variable "delay" { + default = 600 +} + +variable "warning_jobs_failed" { + default = 5 +} + +variable "critical_jobs_failed" { + default = 10 +} + +variable "warning_listjobs_failed" { + default = 5 +} + +variable "critical_listjobs_failed" { + default = 10 +} + +variable "warning_queryjobs_failed" { + default = 5 +} + +variable "critical_queryjobs_failed" { + default = 10 +} \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf new file mode 100644 index 0000000..5f584db --- /dev/null +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -0,0 +1,90 @@ +resource "datadog_monitor" "too_many_jobs_failed" { + name = "[${var.environment}] Too many jobs failed on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" + type = "query alert" + + thresholds { + warning = "${var.warning_jobs_failed}" + critical = "${var.critical_jobs_failed}" + } + + notify_no_data = false + 
evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_list_jobs_failed" { + name = "[${var.environment}] Too many list_jobs failure on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}" + type = "query alert" + + thresholds { + warning = "${var.warning_listjobs_failed}" + critical = "${var.critical_listjobs_failed}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_query_jobs_failed" { + name = "[${var.environment}] Too many query_jobs failed on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}" + type = "query alert" + + thresholds { + warning = "${var.warning_queryjobs_failed}" + critical = "${var.critical_queryjobs_failed}" + } + + 
notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "status" { + name = "[${var.environment}] Status is not ok on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" + type = "query alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} \ No newline at end of file From 7f0a0e91cf6fdd3cb6ea5d33abcbf2dbdb41c0a2 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 15:21:40 +0100 Subject: [PATCH 04/93] MON-80 Rename variable for message alerting --- cloud/azure/iothubs/inputs.tf | 8 +++++--- cloud/azure/iothubs/monitors-iothubs.tf | 8 ++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index ddc3456..5de7dab 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,10 +1,12 @@ -variable "hno_escalation_group" {} +variable "critical_escalation_group" {} -variable "ho_escalation_group" {} +variable "warning_escalation_group" {} variable "environment" {} -variable "subscription_id" {} +variable "stack" {} + +variable "client_name" {} ## IOT hubs variable "delay" { diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 5f584db..e333808 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,6 +1,6 @@ resource "datadog_monitor" 
"too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" type = "query alert" @@ -24,7 +24,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}" type = "query alert" @@ -48,7 +48,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} 
\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}" type = "query alert" @@ -72,7 +72,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" type = "query alert" From 4c474be541eb5f3a3f14bf2a8bd7716803651ecf Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 16:42:58 +0100 Subject: [PATCH 05/93] MON-80 Add monitors and update variables --- cloud/azure/iothubs/inputs.tf | 44 +++++++++++----- cloud/azure/iothubs/monitors-iothubs.tf | 69 ++++++++++++++++++++----- 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 5de7dab..38b1b44 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,38 +1,54 @@ -variable "critical_escalation_group" {} - -variable "warning_escalation_group" {} - variable "environment" {} variable "stack" {} variable "client_name" {} -## IOT hubs variable "delay" { default = 600 } -variable "warning_jobs_failed" { - default = 5 +## IOT hubs +variable "jobs_failed_threshold_warning" { + default = 0 } -variable "critical_jobs_failed" { +variable "jobs_failed_threshold_critical" { default = 10 } 
-variable "warning_listjobs_failed" { - default = 5 +variable "jobs_failed_message" {} + +variable "listjobs_failed_threshold_warning" { + default = 0 } -variable "critical_listjobs_failed" { +variable "listjobs_failed_threshold_critical" { default = 10 } -variable "warning_queryjobs_failed" { - default = 5 +variable "listjobs_failed_message" {} + +variable "queryjobs_failed_threshold_warning" { + default = 0 } -variable "critical_queryjobs_failed" { +variable "queryjobs_failed_threshold_critical" { + default = 10 +} + +variable "queryjobs_failed_message" {} + +variable "status_message" {} + +variable "total_devices_message" {} + +variable "c2d_methods_failed_message" {} + +variable "c2d_methods_failed_threshold_warning" { + default = 0 +} + +variable "c2d_methods_failed_threshold_critical" { default = 10 } \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index e333808..12f3d9a 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,13 +1,13 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > 
${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_jobs_failed}" - critical = "${var.critical_jobs_failed}" + warning = "${var.jobs_failed_threshold_warning}" + critical = "${var.jobs_failed_threshold_critical}" } notify_no_data = false @@ -24,14 +24,14 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_listjobs_failed}" - critical = "${var.critical_listjobs_failed}" + warning = "${var.listjobs_failed_threshold_warning}" + critical = "${var.listjobs_failed_threshold_critical}" } notify_no_data = false @@ -48,14 +48,14 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = 
"${var.queryjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_queryjobs_failed}" - critical = "${var.critical_queryjobs_failed}" + warning = "${var.queryjobs_failed_threshold_warning}" + critical = "${var.queryjobs_failed_threshold_warning}" } notify_no_data = false @@ -72,11 +72,54 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.status_message}" query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" type = "query alert" + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "total_devices" { + name = "[${var.environment}] Total devices is wrong on {{name}} " + message = "${var.total_devices_message}" + + query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{*} by {name,resource_group} == 0" + type = 
"query alert" + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_c2d_methods_failed" { + name = "[${var.environment}] Too many c2d methods failure on {{name}} " + message = "${var.c2d_methods_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_methods_failed_threshold_warning}" + critical = "${var.c2d_methods_failed_threshold_critical}" + } + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 60 From effaaf0e12d6446510700e4bc71765c8a0b37441 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 17:13:42 +0100 Subject: [PATCH 06/93] MON-80 Add c2d and d2c monitors --- cloud/azure/iothubs/inputs.tf | 46 +++++++++++- cloud/azure/iothubs/monitors-iothubs.tf | 98 ++++++++++++++++++++++++- 2 files changed, 140 insertions(+), 4 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 38b1b44..093b3a3 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -43,12 +43,52 @@ variable "status_message" {} variable "total_devices_message" {} -variable "c2d_methods_failed_message" {} - variable "c2d_methods_failed_threshold_warning" { default = 0 } variable "c2d_methods_failed_threshold_critical" { default = 10 -} \ No newline at end of file +} + +variable "c2d_methods_failed_message" {} + +variable "c2d_twin_read_failed_threshold_warning" { + default = 0 +} + 
+variable "c2d_twin_read_failed_threshold_critical" { + default = 10 +} + +variable "c2d_twin_read_failed_message" {} + +variable "c2d_twin_update_failed_threshold_warning" { + default = 0 +} + +variable "c2d_twin_update_failed_threshold_critical" { + default = 10 +} + +variable "c2d_twin_update_failed_message" {} + +variable "d2c_twin_read_failed_threshold_warning" { + default = 0 +} + +variable "d2c_twin_read_failed_threshold_critical" { + default = 10 +} + +variable "d2c_twin_read_failed_message" {} + +variable "d2c_twin_update_failed_threshold_warning" { + default = 0 +} + +variable "d2c_twin_update_failed_threshold_critical" { + default = 10 +} + +variable "d2c_twin_update_failed_message" {} \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 12f3d9a..8d44dde 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -130,4 +130,100 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 -} \ No newline at end of file +} + +resource "datadog_monitor" "too_many_c2d_twin_read_failed" { + name = "[${var.environment}] Too many c2d twin read failure on {{name}} " + message = "${var.c2d_twin_read_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_twin_read_failed_threshold_warning}" + critical = "${var.c2d_twin_read_failed_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + 
include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_c2d_twin_update_failed" { + name = "[${var.environment}] Too many c2d twin update failure on {{name}} " + message = "${var.c2d_twin_update_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_twin_update_failed_threshold_warning}" + critical = "${var.c2d_twin_update_failed_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_twin_read_failed" { + name = "[${var.environment}] Too many d2c twin read failure on {{name}} " + message = "${var.d2c_twin_read_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_twin_read_failed_threshold_warning}" + critical = "${var.d2c_twin_read_failed_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + 
require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_twin_update_failed" { + name = "[${var.environment}] Too many d2c twin update failure on {{name}} " + message = "${var.d2c_twin_update_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_twin_update_failed_threshold_warning}" + critical = "${var.d2c_twin_update_failed_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} From 5136dd5c4d1e5b3bea7685b95f8361d21a02dc34 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 17:30:16 +0100 Subject: [PATCH 07/93] MON-80 Add subscription_id --- cloud/azure/iothubs/inputs.tf | 2 ++ cloud/azure/iothubs/monitors-iothubs.tf | 40 ++++++++++++------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 093b3a3..e705d8f 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -4,6 +4,8 @@ variable "stack" {} variable "client_name" {} +variable "subscription_id" {} + variable "delay" { default = 600 } diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 8d44dde..a4ec018 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" 
"too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -12,7 +12,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -26,7 +26,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + 
avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -36,7 +36,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -50,7 +50,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " message = "${var.queryjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -60,7 +60,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -74,12 +74,12 @@ resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " message = "${var.status_message}" - query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" + query = 
"avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1" type = "query alert" notify_no_data = true evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -93,12 +93,12 @@ resource "datadog_monitor" "total_devices" { name = "[${var.environment}] Total devices is wrong on {{name}} " message = "${var.total_devices_message}" - query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{*} by {name,resource_group} == 0" + query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{subscription_id:${var.subscription_id}} by {name,resource_group} == 0" type = "query alert" notify_no_data = true evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -112,7 +112,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { name = "[${var.environment}] Too many c2d methods failure on {{name}} " message = "${var.c2d_methods_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" type = "query alert" thresholds { @@ -122,7 +122,7 @@ resource "datadog_monitor" 
"too_many_c2d_methods_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -136,7 +136,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { name = "[${var.environment}] Too many c2d twin read failure on {{name}} " message = "${var.c2d_twin_read_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" type = "query alert" thresholds { @@ -146,7 +146,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -160,7 +160,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { name = "[${var.environment}] Too many c2d twin update failure on {{name}} " message = "${var.c2d_twin_update_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{*} by {name,resource_group}.as_count() ) 
) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" type = "query alert" thresholds { @@ -170,7 +170,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -184,7 +184,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { name = "[${var.environment}] Too many d2c twin read failure on {{name}} " message = "${var.d2c_twin_read_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" type = "query alert" thresholds { @@ -194,7 +194,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { notify_no_data = false evaluation_delay = 
"${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -208,7 +208,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { name = "[${var.environment}] Too many d2c twin update failure on {{name}} " message = "${var.d2c_twin_update_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" type = "query alert" thresholds { @@ -218,7 +218,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true From 193352c212277fac91d51ad336b704f7cde8d54c Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 18:09:03 +0100 Subject: [PATCH 08/93] MON-80 Add IOT Hub in Names --- cloud/azure/iothubs/monitors-iothubs.tf | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index a4ec018..f111897 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,5 +1,5 @@ resource 
"datadog_monitor" "too_many_jobs_failed" { - name = "[${var.environment}] Too many jobs failed on {{name}} " + name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.jobs_failed_message}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" @@ -23,7 +23,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { } resource "datadog_monitor" "too_many_list_jobs_failed" { - name = "[${var.environment}] Too many list_jobs failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" message = "${var.listjobs_failed_message}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" @@ -47,7 +47,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { } resource "datadog_monitor" "too_many_query_jobs_failed" { - name = "[${var.environment}] Too many query_jobs failed on {{name}} " + name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" message = "${var.queryjobs_failed_message}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( 
avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" @@ -55,7 +55,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { thresholds { warning = "${var.queryjobs_failed_threshold_warning}" - critical = "${var.queryjobs_failed_threshold_warning}" + critical = "${var.queryjobs_failed_threshold_critical}" } notify_no_data = false @@ -71,7 +71,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { } resource "datadog_monitor" "status" { - name = "[${var.environment}] Status is not ok on {{name}} " + name = "[${var.environment}] IOT Hub Status is not ok on {{name}}" message = "${var.status_message}" query = "avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1" @@ -90,7 +90,7 @@ resource "datadog_monitor" "status" { } resource "datadog_monitor" "total_devices" { - name = "[${var.environment}] Total devices is wrong on {{name}} " + name = "[${var.environment}] IOT Hub Total devices is wrong on {{name}}" message = "${var.total_devices_message}" query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{subscription_id:${var.subscription_id}} by {name,resource_group} == 0" @@ -109,7 +109,7 @@ resource "datadog_monitor" "total_devices" { } resource "datadog_monitor" "too_many_c2d_methods_failed" { - name = "[${var.environment}] Too many c2d methods failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many c2d methods failure on {{name}}" message = "${var.c2d_methods_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by 
{name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" @@ -133,7 +133,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { - name = "[${var.environment}] Too many c2d twin read failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}" message = "${var.c2d_twin_read_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" @@ -157,7 +157,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { - name = "[${var.environment}] Too many c2d twin update failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}" message = "${var.c2d_twin_update_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" @@ -181,7 +181,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { } resource "datadog_monitor" "too_many_d2c_twin_read_failed" 
{ - name = "[${var.environment}] Too many d2c twin read failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}" message = "${var.d2c_twin_read_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" @@ -205,7 +205,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { - name = "[${var.environment}] Too many d2c twin update failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}" message = "${var.d2c_twin_update_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" From 113d4aabd25fa1172dea821ef6b6f7688011f960 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 11:12:26 +0100 Subject: [PATCH 09/93] MON-80 Add monitors for telemetry --- cloud/azure/iothubs/inputs.tf | 44 ++++++++- cloud/azure/iothubs/monitors-iothubs.tf | 121 +++++++++++++++++++++++- 2 files changed, 161 insertions(+), 4 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index e705d8f..5ae0587 100644 --- 
a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -93,4 +93,46 @@ variable "d2c_twin_update_failed_threshold_critical" { default = 10 } -variable "d2c_twin_update_failed_message" {} \ No newline at end of file +variable "d2c_twin_update_failed_message" {} + +variable "d2c_telemetry_egress_dropped_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_dropped_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_dropped_message" {} + +variable "d2c_telemetry_egress_orphaned_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_orphaned_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_orphaned_message" {} + +variable "d2c_telemetry_egress_invalid_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_invalid_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_invalid_message" {} + +variable "d2c_telemetry_egress_fallback_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_fallback_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_fallback_message" {} + +variable "d2c_telemetry_ingress_nosent_message" {} diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index f111897..4c59099 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 
> ${var.jobs_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -26,7 +26,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -50,7 +50,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" message = "${var.queryjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by 
{resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -227,3 +227,118 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 } + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}" + message = "${var.d2c_telemetry_egress_dropped_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_dropped_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_dropped_threshold_warning}" + critical = "${var.d2c_telemetry_egress_dropped_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress 
orphaned on {{name}}" + message = "${var.d2c_telemetry_egress_orphaned_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_orphaned_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_orphaned_threshold_warning}" + critical = "${var.d2c_telemetry_egress_orphaned_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}" + message = "${var.d2c_telemetry_egress_invalid_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_invalid_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_invalid_threshold_warning}" + critical = "${var.d2c_telemetry_egress_invalid_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}" + message = "${var.d2c_telemetry_egress_fallback_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{subscription_id:${var.subscription_id}} by {name,resource_group} > 
${var.d2c_telemetry_egress_fallback_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_fallback_threshold_warning}" + critical = "${var.d2c_telemetry_egress_fallback_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}" + message = "${var.d2c_telemetry_ingress_nosent_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() - avg:azure.devices_iothubs.d2c.telemetry.ingress.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() > 0" + type = "query alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} From 34ef735a076884ef27474431a0df695b9228858e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 14:18:15 +0100 Subject: [PATCH 10/93] MON-78: Changed host.identifier for name to identify the streamanalytics object with issues --- .../azure/stream-analytics/monitors-stream-analytics.tf | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index f18d7f1..ea2920f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,5 +1,5 @@
resource "datadog_monitor" "SU_utilization" { - name = "[${var.environment} SU utilization at more than ${var.su_utilization_critical}% on {{host.identifier}}]" + name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}" @@ -22,7 +22,7 @@ resource "datadog_monitor" "SU_utilization" { } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment} More than ${var.failed_function_requests_critical} failed function requests on {{host.identifier}}]" + name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" @@ -45,7 +45,7 @@ resource "datadog_monitor" "failed_function_requests" { } resource "datadog_monitor" "conversion_errors" { - name = "[${var.environment} More than ${var.conversion_errors_critical} conversion errors on {{host.identifier}}]" + name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" @@ -68,7 +68,7 @@ resource "datadog_monitor" "conversion_errors" { } resource "datadog_monitor" "runtime_errors" { - name = "[${var.environment} More 
than ${var.runtime_errors_critical} runtime errors on {{host.identifier}}]" + name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" @@ -89,4 +89,3 @@ resource "datadog_monitor" "runtime_errors" { critical = "${var.runtime_errors_critical}" } } - From cf3309ce753a146901fce7d1bcb4871639f8d410 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 11:47:37 +0100 Subject: [PATCH 11/93] MON-80 Add README.md --- cloud/azure/iothubs/README.md | 109 ++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 cloud/azure/iothubs/README.md diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md new file mode 100644 index 0000000..d53bf2b --- /dev/null +++ b/cloud/azure/iothubs/README.md @@ -0,0 +1,109 @@ +Azure IoT Hubs DataDog monitors +============================ + +How to use this module +---------------------- + +``` +module "iothubs" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" + + jobs_failed_message = "${module.datadog-message-alerting.alerting-message}" + listjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" + queryjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" + status_message = "${module.datadog-message-alerting.alerting-message}" + total_devices_message = "${module.datadog-message-alerting.alerting-message}" + c2d_methods_failed_message = "${module.datadog-message-alerting.alerting-message}" + c2d_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" + c2d_twin_update_failed_message = 
"${module.datadog-message-alerting.alerting-message}" + d2c_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" + d2c_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_dropped_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_orphaned_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_invalid_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_fallback_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_ingress_nosent_message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + stack = "${var.stack}" + client_name = "${var.client_name}" + subscription_id = "${var.subscription_id}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Service status check +* Jobs failed average check +* Query Jobs failed average check +* List Jobs failed average check +* Total devices count check +* C2D methods failed average check +* C2D twin read failed average check +* C2D twin update failed average check +* D2C twin read failed average check +* D2C twin update failed average check +* D2C telemetry egress dropped count check +* D2C telemetry egress orphaned count check +* D2C telemetry egress invalid count check +* D2C telemetry egress fallback count check +* D2C telemetry ingress no sent count check + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| c2d_methods_failed_message | | string | - | yes | +| c2d_methods_failed_threshold_critical | | string | `10` | no | +| c2d_methods_failed_threshold_warning | | string | `0` | no | +| c2d_twin_read_failed_message | | string | - | yes | +| c2d_twin_read_failed_threshold_critical | | string | `10` | no | +| c2d_twin_read_failed_threshold_warning | | string | `0` | no | +| 
c2d_twin_update_failed_message | | string | - | yes | +| c2d_twin_update_failed_threshold_critical | | string | `10` | no | +| c2d_twin_update_failed_threshold_warning | | string | `0` | no | +| client_name | | string | - | yes | +| d2c_telemetry_egress_dropped_message | | string | - | yes | +| d2c_telemetry_egress_dropped_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_dropped_threshold_warning | | string | `500` | no | +| d2c_telemetry_egress_fallback_message | | string | - | yes | +| d2c_telemetry_egress_fallback_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_fallback_threshold_warning | | string | `500` | no | +| d2c_telemetry_egress_invalid_message | | string | - | yes | +| d2c_telemetry_egress_invalid_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_invalid_threshold_warning | | string | `500` | no | +| d2c_telemetry_egress_orphaned_message | | string | - | yes | +| d2c_telemetry_egress_orphaned_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_orphaned_threshold_warning | | string | `500` | no | +| d2c_telemetry_ingress_nosent_message | | string | - | yes | +| d2c_twin_read_failed_message | | string | - | yes | +| d2c_twin_read_failed_threshold_critical | | string | `10` | no | +| d2c_twin_read_failed_threshold_warning | | string | `0` | no | +| d2c_twin_update_failed_message | | string | - | yes | +| d2c_twin_update_failed_threshold_critical | | string | `10` | no | +| d2c_twin_update_failed_threshold_warning | | string | `0` | no | +| delay | | string | `600` | no | +| environment | | string | - | yes | +| jobs_failed_message | | string | - | yes | +| jobs_failed_threshold_critical | | string | `10` | no | +| jobs_failed_threshold_warning | # IOT hubs | string | `0` | no | +| listjobs_failed_message | | string | - | yes | +| listjobs_failed_threshold_critical | | string | `10` | no | +| listjobs_failed_threshold_warning | | string | `0` | no | +| queryjobs_failed_message 
| | string | - | yes | +| queryjobs_failed_threshold_critical | | string | `10` | no | +| queryjobs_failed_threshold_warning | | string | `0` | no | +| stack | | string | - | yes | +| status_message | | string | - | yes | +| subscription_id | | string | - | yes | +| total_devices_message | | string | - | yes | + +Related documentation +--------------------- + +DataDog documentation: https://docs.datadoghq.com/integrations/azure_iot_hub/ + +Azure IOT Hubs metrics documentation: https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health \ No newline at end of file From 51b3b5010da96533a605c94f2d9e6d44ea05f495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 15:22:52 +0100 Subject: [PATCH 12/93] MON-78 Changed variable names --- cloud/azure/stream-analytics/inputs.tf | 8 ++++++-- .../stream-analytics/monitors-stream-analytics.tf | 10 +++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 4ea5ee6..529e669 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -1,5 +1,5 @@ -variable "hno_escalation_group" {} -variable "ho_escalation_group" {} +variable "critical_escalation_group" {} +variable "warning_escalation_group" {} variable "environment" {} @@ -7,6 +7,10 @@ variable "notify_no_data" { default = "false" } +variable "filter_tags" { + default = "*" +} + variable "delay" { default = "600" } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index ea2920f..4e64044 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,8 +1,8 @@ resource "datadog_monitor" "SU_utilization" { name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" - message = 
"{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -23,7 +23,7 @@ resource "datadog_monitor" "SU_utilization" { resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on {{name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" type = "query alert" @@ -46,7 +46,7 @@ resource "datadog_monitor" "failed_function_requests" { resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > 
${var.conversion_errors_critical}" type = "query alert" @@ -69,7 +69,7 @@ resource "datadog_monitor" "conversion_errors" { resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" type = "query alert" From 54a90b3972a2a2a374f5a5726350f38ad2fdf52d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 15:34:57 +0100 Subject: [PATCH 13/93] MON-78 Removed upper case resource name --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 4e64044..68043f8 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,4 +1,4 @@ -resource "datadog_monitor" "SU_utilization" { +resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" From 9261bde1588268650f9f1295489daf756257ff8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 15:51:00 +0100 Subject: [PATCH 14/93] MON-78: Remove escalation variables, add message variable --- cloud/azure/stream-analytics/inputs.tf | 3 +-- 
cloud/azure/stream-analytics/monitors-stream-analytics.tf | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 529e669..d240169 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -1,5 +1,4 @@ -variable "critical_escalation_group" {} -variable "warning_escalation_group" {} +variable "message" {} variable "environment" {} diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 68043f8..6cf42c5 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,6 +1,6 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_critical}" type = "query alert" @@ -23,7 +23,7 @@ resource "datadog_monitor" "su_utilization" { resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" type = "query alert" @@ -46,7 +46,7 @@ resource "datadog_monitor" "failed_function_requests" { resource "datadog_monitor" 
"conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" type = "query alert" @@ -69,7 +69,7 @@ resource "datadog_monitor" "conversion_errors" { resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" type = "query alert" From 0b03cade41951578a3f6363b0733d31eee4e93e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 16:35:44 +0100 Subject: [PATCH 15/93] MON-78 Changing naming convention for variables --- cloud/azure/stream-analytics/inputs.tf | 16 +++++----- .../monitors-stream-analytics.tf | 32 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index d240169..2d0619a 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -14,34 +14,34 @@ variable "delay" { default = "600" } -variable "su_utilization_warning" { +variable "su_utilization_threshold_warning" { default = 60 } -variable "su_utilization_critical" { +variable "su_utilization_threshold_critical" { default = 80 } -variable "failed_function_requests_warning" { +variable "function_requests_threshold_warning" { default = 0 } -variable 
"failed_function_requests_critical" { +variable "function_requests_threshold_critical" { default = 10 } -variable "conversion_errors_warning" { +variable "conversion_errors_threshold_warning" { default = 0 } -variable "conversion_errors_critical" { +variable "conversion_errors_threshold_critical" { default = 10 } -variable "runtime_errors_warning" { +variable "runtime_errors_threshold_warning" { default = 0 } -variable "runtime_errors_critical" { +variable "runtime_errors_threshold_critical" { default = 10 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6cf42c5..55ac674 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,8 +1,8 @@ resource "datadog_monitor" "su_utilization" { - name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" + name = "[${var.environment}] SU utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -16,16 +16,16 @@ resource "datadog_monitor" "su_utilization" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.su_utilization_warning}" - critical = "${var.su_utilization_critical}" + warning = "${var.su_utilization_threshold_warning}" + critical = "${var.su_utilization_threshold_critical}" } } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed 
function requests on {{name}}" + name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.function_requests_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -39,16 +39,16 @@ resource "datadog_monitor" "failed_function_requests" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.failed_function_requests_warning}" - critical = "${var.failed_function_requests_critical}" + warning = "${var.function_requests_threshold_warning}" + critical = "${var.function_requests_threshold_critical}" } } resource "datadog_monitor" "conversion_errors" { - name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" + name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -62,16 +62,16 @@ resource "datadog_monitor" "conversion_errors" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.conversion_errors_warning}" - critical = "${var.conversion_errors_critical}" + warning = "${var.conversion_errors_threshold_warning}" + critical = "${var.conversion_errors_threshold_critical}" } } resource "datadog_monitor" 
"runtime_errors" { - name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" + name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -85,7 +85,7 @@ resource "datadog_monitor" "runtime_errors" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.runtime_errors_warning}" - critical = "${var.runtime_errors_critical}" + warning = "${var.runtime_errors_threshold_warning}" + critical = "${var.runtime_errors_threshold_critical}" } } From 0706a50badd6a4b442fe3afa6ab82712197572b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 17:01:21 +0100 Subject: [PATCH 16/93] MON-78: Changed monitor name for better clarity --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 55ac674..ed4c51f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,5 +1,5 @@ resource "datadog_monitor" "su_utilization" { - name = "[${var.environment}] SU utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" + name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by 
{name,resource_group} > ${var.su_utilization_threshold_critical}" From 1f059622ed932ee209847dff647d30abc19ebdd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 17:46:21 +0100 Subject: [PATCH 17/93] MON-78 Changed filter to reach proper resources --- cloud/azure/stream-analytics/inputs.tf | 4 ++-- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 2d0619a..1c3ff2e 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -6,8 +6,8 @@ variable "notify_no_data" { default = "false" } -variable "filter_tags" { - default = "*" +variable "use_filter_tags" { + default = "true" } variable "delay" { diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index ed4c51f..6903b6a 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -25,7 +25,7 @@ resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.function_requests_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.function_requests_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -48,7 +48,7 @@ resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -71,7 +71,7 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group}} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From aaabb129b5ae66cc9b2e2f940bac0fc7e9f8ee91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 17:52:40 +0100 Subject: [PATCH 18/93] MON-78 Forgot a } --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6903b6a..e95825e 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From 686765bcaa83e795f9608aad0f39c681e589477c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 18:00:06 +0100 Subject: [PATCH 19/93] MON-78 Corrected typo in query for runtime_errors --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index e95825e..6ca7717 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -71,7 +71,7 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group}} > ${var.runtime_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From f916fbfc81ffdfe273eafc6bcab98432faf1b0f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 12:00:56 +0100 Subject: [PATCH 20/93] MON-78: Readme --- cloud/azure/stream-analytics/README.md | 39 ++++++++++++++++++++++++++ cloud/azure/stream-analytics/inputs.tf | 24 +++++++++++----- 2 files changed, 56 insertions(+), 7 deletions(-) create mode 100644 cloud/azure/stream-analytics/README.md diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md new file mode 100644 index 0000000..83d0af4 --- /dev/null +++ b/cloud/azure/stream-analytics/README.md @@ -0,0 +1,39 @@ +Azure Stream Analytics DataDog monitors +======================================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-redis" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/stream-analytics?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" +} +``` + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| conversion_errors_threshold_critical | | string | `10` | no | +| conversion_errors_threshold_warning | | string | `0` | no | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| function_requests_threshold_critical | | string | `10` | no | +| function_requests_threshold_warning | | string | `0` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| notify_no_data | | string | `false` | no | +| runtime_errors_threshold_critical | | string | `10` | no | +| 
runtime_errors_threshold_warning | | string | `0` | no | +| su_utilization_threshold_critical | | string | `80` | no | +| su_utilization_threshold_warning | Monitor specific | string | `60` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 1c3ff2e..29db469 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -1,19 +1,29 @@ -variable "message" {} +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} -variable "environment" {} +variable "message" { + description = "Message sent when a monitor is triggered" +} + +# Global DataDog +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} variable "notify_no_data" { default = "false" } -variable "use_filter_tags" { - default = "true" -} - variable "delay" { - default = "600" + description = "Delay in seconds for the metric evaluation" + default = 600 } +# Monitor specific variable "su_utilization_threshold_warning" { default = 60 } From 1a278fc81c90e853c0493132cd4f3e3f89858334 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 12:04:35 +0100 Subject: [PATCH 21/93] MON-78: Fixup use filter tag usage --- .../monitors-stream-analytics.tf | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6ca7717..0972bd4 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,8 +1,16 @@ +data 
"template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}" + } +} + resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -25,7 +33,7 @@ resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.function_requests_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.function_requests_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -48,7 +56,7 @@ resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -71,7 +79,7 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From 41997c9afe58583177acb7915036c5cd8cbdd910 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 14:41:14 +0100 Subject: [PATCH 22/93] MON-78 Add EOF on querys --- .../monitors-stream-analytics.tf | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 0972bd4..8824410 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -10,7 +10,11 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = < ${var.su_utilization_threshold_critical} + EOF type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -33,7 +37,11 @@ resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.function_requests_threshold_critical}" + query = < ${var.function_requests_threshold_critical} + EOF 
type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -56,7 +64,11 @@ resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" + query = < ${var.conversion_errors_threshold_critical} + EOF type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -79,7 +91,11 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" + query = < ${var.runtime_errors_threshold_critical} + EOF type = "query alert" notify_no_data = "${var.notify_no_data}" From c1563c331898b4ca8b2b08792e27d35e94affed2 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 14:25:24 +0100 Subject: [PATCH 23/93] MON-80 use only one message and add inputs descriptions --- cloud/azure/iothubs/README.md | 97 ++++++----------- cloud/azure/iothubs/inputs.tf | 76 +++++++------- cloud/azure/iothubs/monitors-iothubs.tf | 133 ++++++++++++++++++------ 3 files changed, 178 insertions(+), 128 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index d53bf2b..3d6bb91 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -1,4 +1,4 @@ -Azure Redis DataDog monitors +Azure IOT Hubs DataDog monitors ============================ How to use this module @@ -8,22 +8,8 @@ How to use this module module "iothubs" { source = 
"git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" - jobs_failed_message = "${module.datadog-message-alerting.alerting-message}" - listjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" - queryjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" - status_message = "${module.datadog-message-alerting.alerting-message}" - total_devices_message = "${module.datadog-message-alerting.alerting-message}" - c2d_methods_failed_message = "${module.datadog-message-alerting.alerting-message}" - c2d_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" - c2d_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}" - d2c_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" - d2c_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_dropped_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_orphaned_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_invalid_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_fallback_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_ingress_nosent_message = "${module.datadog-message-alerting.alerting-message}" - + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" stack = "${var.stack}" client_name = "${var.client_name}" @@ -56,54 +42,39 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| c2d_methods_failed_message | | string | - | yes | -| c2d_methods_failed_threshold_critical | | string | `10` | no | -| c2d_methods_failed_threshold_warning | | string | `0` | no | -| c2d_twin_read_failed_message | | string | - | yes | -| c2d_twin_read_failed_threshold_critical 
| | string | `10` | no | -| c2d_twin_read_failed_threshold_warning | | string | `0` | no | -| c2d_twin_update_failed_message | | string | - | yes | -| c2d_twin_update_failed_threshold_critical | | string | `10` | no | -| c2d_twin_update_failed_threshold_warning | | string | `0` | no | -| client_name | | string | - | yes | -| d2c_telemetry_egress_dropped_message | | string | - | yes | -| d2c_telemetry_egress_dropped_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_dropped_threshold_warning | | string | `500` | no | -| d2c_telemetry_egress_fallback_message | | string | - | yes | -| d2c_telemetry_egress_fallback_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_fallback_threshold_warning | | string | `500` | no | -| d2c_telemetry_egress_invalid_message | | string | - | yes | -| d2c_telemetry_egress_invalid_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_invalid_threshold_warning | | string | `500` | no | -| d2c_telemetry_egress_orphaned_message | | string | - | yes | -| d2c_telemetry_egress_orphaned_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_orphaned_threshold_warning | | string | `500` | no | -| d2c_telemetry_ingress_nosent_message | | string | - | yes | -| d2c_twin_read_failed_message | | string | - | yes | -| d2c_twin_read_failed_threshold_critical | | string | `10` | no | -| d2c_twin_read_failed_threshold_warning | | string | `0` | no | -| d2c_twin_update_failed_message | | string | - | yes | -| d2c_twin_update_failed_threshold_critical | | string | `10` | no | -| d2c_twin_update_failed_threshold_warning | | string | `0` | no | -| delay | | string | `600` | no | -| environment | | string | - | yes | -| jobs_failed_message | | string | - | yes | -| jobs_failed_threshold_critical | | string | `10` | no | -| jobs_failed_threshold_warning | # IOT hubs | string | `0` | no | -| listjobs_failed_message | | string | - | yes | -| listjobs_failed_threshold_critical | | string | `10` | 
no | -| listjobs_failed_threshold_warning | | string | `0` | no | -| queryjobs_failed_message | | string | - | yes | -| queryjobs_failed_threshold_critical | | string | `10` | no | -| queryjobs_failed_threshold_warning | | string | `0` | no | -| stack | | string | - | yes | -| status_message | | string | - | yes | -| subscription_id | | string | - | yes | -| total_devices_message | | string | - | yes | +| c2d_methods_failed_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| c2d_methods_failed_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | +| c2d_twin_read_failed_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| client_name | Client Name | string | - | yes | +| d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no | +| d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_fallback_threshold_warning | D2C Telemetry Fallback Failed limit (warning threshold) | string | `500` | no | +| d2c_telemetry_egress_invalid_threshold_critical | D2C Telemetry Invalid Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_invalid_threshold_warning | D2C Telemetry Invalid Failed limit (warning threshold) | string | `500` | no | +| 
d2c_telemetry_egress_orphaned_threshold_critical | D2C Telemetry Orphaned Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_orphaned_threshold_warning | D2C Telemetry Orphaned Failed limit (warning threshold) | string | `500` | no | +| d2c_twin_read_failed_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| d2c_twin_read_failed_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| d2c_twin_update_failed_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| d2c_twin_update_failed_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture Environment | string | - | yes | +| jobs_failed_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| jobs_failed_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| message | Message sent when an alert is triggered | string | - | yes | +| queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| subscription_id | Subscription ID used to tag monitors | string | - | yes | Related documentation --------------------- -DataDog documentation: https://docs.datadoghq.com/integrations/azure_iot_hub/ +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub) -Azure IOT Hubs metrics documentation: 
https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health \ No newline at end of file +Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) \ No newline at end of file diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 5ae0587..cc591cd 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,138 +1,144 @@ -variable "environment" {} +variable "environment" { + description = "Architecture Environment" + type = "string" +} -variable "stack" {} +variable "client_name" { + description = "Client Name" + type = "string" +} -variable "client_name" {} - -variable "subscription_id" {} +variable "subscription_id" { + description = "Subscription ID used to tag monitors" + type = "string" +} variable "delay" { + description = "Delay in seconds for the metric evaluation" default = 600 } +variable "message" { + description = "Message sent when an alert is triggered" +} + ## IOT hubs variable "jobs_failed_threshold_warning" { + description = "Jobs Failed rate limit (warning threshold)" default = 0 } variable "jobs_failed_threshold_critical" { + description = "Jobs Failed rate limit (critical threshold)" default = 10 } -variable "jobs_failed_message" {} - variable "listjobs_failed_threshold_warning" { + description = "ListJobs Failed rate limit (warning threshold)" default = 0 } variable "listjobs_failed_threshold_critical" { + description = "ListJobs Failed rate limit (critical threshold)" default = 10 } -variable "listjobs_failed_message" {} - variable "queryjobs_failed_threshold_warning" { + description = "QueryJobs Failed rate limit (warning threshold)" default = 0 } variable "queryjobs_failed_threshold_critical" { + description = "QueryJobs Failed rate limit (critical threshold)" default = 10 } -variable "queryjobs_failed_message" {} - -variable 
"status_message" {} - -variable "total_devices_message" {} - variable "c2d_methods_failed_threshold_warning" { + description = "C2D Methods Failed rate limit (warning threshold)" default = 0 } variable "c2d_methods_failed_threshold_critical" { + description = "C2D Methods Failed rate limit (critical threshold)" default = 10 } -variable "c2d_methods_failed_message" {} - variable "c2d_twin_read_failed_threshold_warning" { + description = "C2D Twin Read Failed rate limit (warning threshold)" default = 0 } variable "c2d_twin_read_failed_threshold_critical" { + description = "C2D Twin Read Failed rate limit (critical threshold)" default = 10 } -variable "c2d_twin_read_failed_message" {} - variable "c2d_twin_update_failed_threshold_warning" { + description = "C2D Twin Update Failed rate limit (warning threshold)" default = 0 } variable "c2d_twin_update_failed_threshold_critical" { + description = "C2D Twin Update Failed rate limit (critical threshold)" default = 10 } -variable "c2d_twin_update_failed_message" {} - variable "d2c_twin_read_failed_threshold_warning" { + description = "D2C Twin Read Failed rate limit (warning threshold)" default = 0 } variable "d2c_twin_read_failed_threshold_critical" { + description = "D2C Twin Read Failed rate limit (critical threshold)" default = 10 } -variable "d2c_twin_read_failed_message" {} - variable "d2c_twin_update_failed_threshold_warning" { + description = "D2C Twin Update Failed rate limit (warning threshold)" default = 0 } variable "d2c_twin_update_failed_threshold_critical" { + description = "D2C Twin Update Failed rate limit (critical threshold)" default = 10 } -variable "d2c_twin_update_failed_message" {} - variable "d2c_telemetry_egress_dropped_threshold_warning" { + description = "D2C Telemetry Dropped Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_dropped_threshold_critical" { + description = "D2C Telemetry Dropped Failed limit (critical threshold)" default = 1000 } -variable 
"d2c_telemetry_egress_dropped_message" {} - variable "d2c_telemetry_egress_orphaned_threshold_warning" { + description = "D2C Telemetry Orphaned Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_orphaned_threshold_critical" { + description = "D2C Telemetry Orphaned Failed limit (critical threshold)" default = 1000 } -variable "d2c_telemetry_egress_orphaned_message" {} - variable "d2c_telemetry_egress_invalid_threshold_warning" { + description = "D2C Telemetry Invalid Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_invalid_threshold_critical" { + description = "D2C Telemetry Invalid Failed limit (critical threshold)" default = 1000 } -variable "d2c_telemetry_egress_invalid_message" {} - variable "d2c_telemetry_egress_fallback_threshold_warning" { + description = "D2C Telemetry Fallback Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_fallback_threshold_critical" { + description = "D2C Telemetry Fallback Failed limit (critical threshold)" default = 1000 } - -variable "d2c_telemetry_egress_fallback_message" {} - -variable "d2c_telemetry_ingress_nosent_message" {} diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 4c59099..f4a7073 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,8 +1,14 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" - message = "${var.jobs_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > 
${var.jobs_failed_threshold_critical}" + query = < ${var.jobs_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -24,9 +30,15 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" - message = "${var.listjobs_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" + query = < ${var.listjobs_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -48,9 +60,15 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" - message = "${var.queryjobs_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" + query = < ${var.queryjobs_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -72,9 +90,11 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] IOT Hub Status is not ok on {{name}}" - 
message = "${var.status_message}" + message = "${var.message}" - query = "avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1" + query = < ${var.c2d_methods_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -134,9 +162,15 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { resource "datadog_monitor" "too_many_c2d_twin_read_failed" { name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}" - message = "${var.c2d_twin_read_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" + query = < ${var.c2d_twin_read_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -158,9 +192,15 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { resource "datadog_monitor" "too_many_c2d_twin_update_failed" { name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}" - message = "${var.c2d_twin_update_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" + query = < ${var.c2d_twin_update_failed_threshold_critical} + EOF 
type = "query alert" thresholds { @@ -182,9 +222,15 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { resource "datadog_monitor" "too_many_d2c_twin_read_failed" { name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}" - message = "${var.d2c_twin_read_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" + query = < ${var.d2c_twin_read_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -206,9 +252,15 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { resource "datadog_monitor" "too_many_d2c_twin_update_failed" { name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}" - message = "${var.d2c_twin_update_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" + query = < ${var.d2c_twin_update_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -230,9 +282,13 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { name = "[${var.environment}] IOT Hub Too many d2c telemetry 
egress dropped on {{name}}" - message = "${var.d2c_telemetry_egress_dropped_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_dropped_threshold_critical}" + query = < ${var.d2c_telemetry_egress_dropped_threshold_critical} + EOF type = "query alert" thresholds { @@ -254,9 +310,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}" - message = "${var.d2c_telemetry_egress_orphaned_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_orphaned_threshold_critical}" + query = < ${var.d2c_telemetry_egress_orphaned_threshold_critical} + EOF type = "query alert" thresholds { @@ -278,9 +338,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}" - message = "${var.d2c_telemetry_egress_invalid_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_invalid_threshold_critical}" + query = < ${var.d2c_telemetry_egress_invalid_threshold_critical} + EOF type = "query alert" thresholds { @@ -302,9 +366,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}" - message = 
"${var.d2c_telemetry_egress_fallback_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_fallback_threshold_critical}" + query = < ${var.d2c_telemetry_egress_fallback_threshold_critical} + EOF type = "query alert" thresholds { @@ -326,9 +394,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}" - message = "${var.d2c_telemetry_ingress_nosent_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() - avg:azure.devices_iothubs.d2c.telemetry.ingress.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() > 0" + query = < 0 + EOF type = "query alert" notify_no_data = false From 9186c6915042ad8f969e05e94ce5db7a5a6fc188 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 15:37:13 +0100 Subject: [PATCH 24/93] MON-80 Now support use_filter_tags --- cloud/azure/iothubs/inputs.tf | 5 +++++ cloud/azure/iothubs/monitors-iothubs.tf | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index cc591cd..d04d03b 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -8,6 +8,11 @@ variable "client_name" { type = "string" } +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} + variable "subscription_id" { description = "Subscription ID used to tag monitors" type = "string" diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index f4a7073..1ee29a3 100644 --- 
a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,3 +1,11 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,subscription_id:%s,env:%s", var.subscription_id,var.environment) : var.subscription_id}" + } +} + resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.message}" From 0b896d784b0db61fd975fe4876ca896e25c4c3ad Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 15:41:41 +0100 Subject: [PATCH 25/93] MON-78 Add Stream Analytics on several names to be more specific --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 8824410..6e6f651 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -34,7 +34,7 @@ resource "datadog_monitor" "su_utilization" { } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" + name = "[${var.environment}] Stream Analytics : More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" query = < Date: Fri, 3 Nov 2017 20:21:44 +0100 Subject: [PATCH 26/93] MON-78 add subscription_id and tags --- cloud/azure/stream-analytics/README.md | 3 +- cloud/azure/stream-analytics/inputs.tf | 33 +++++++++++++++---- .../monitors-stream-analytics.tf | 18 +++++++--- 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index 
83d0af4..f115e70 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -9,8 +9,8 @@ module "datadog-monitors-azure-redis" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/stream-analytics?ref={revision}" message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" + subscription_id = "${var.subscription_id}" } ``` @@ -31,6 +31,7 @@ Inputs | runtime_errors_threshold_warning | | string | `0` | no | | su_utilization_threshold_critical | | string | `80` | no | | su_utilization_threshold_warning | Monitor specific | string | `60` | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 29db469..8160547 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -8,14 +8,28 @@ variable "message" { description = "Message sent when a monitor is triggered" } -# Global DataDog -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" +variable "subscription_id" { + description = "Azure account id used as filter for monitors" + type = "string" } -variable "notify_no_data" { - default = "false" +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "storage" +} + +# Global DataDog + + +variable "message" { + description = "Message sent when a Redis monitor is triggered" } variable "delay" { @@ -23,7 +37,12 @@ variable "delay" { default = 600 } -# Monitor specific +variable "use_filter_tags" { + description = "Filter the 
data with service tags if true" + default = "true" +} + +# Azure Stream Analytics specific variable "su_utilization_threshold_warning" { default = 60 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6e6f651..e464dd4 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,12 +2,12 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}" + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" } } resource "datadog_monitor" "su_utilization" { - name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" + name = "[${var.environment}] Stram Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" query = < Date: Fri, 3 Nov 2017 20:28:05 +0100 Subject: [PATCH 27/93] MON-78 add subscription_id and tags --- cloud/azure/stream-analytics/README.md | 9 ++++----- cloud/azure/stream-analytics/inputs.tf | 10 ++++++++-- .../stream-analytics/monitors-stream-analytics.tf | 11 +++++++---- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index f115e70..28e3e2b 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -19,14 +19,13 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| conversion_errors_threshold_critical | | string | `10` | no | -| conversion_errors_threshold_warning | | string | `0` | no | +| 
conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | +| conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| function_requests_threshold_critical | | string | `10` | no | -| function_requests_threshold_warning | | string | `0` | no | +| function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| notify_no_data | | string | `false` | no | | runtime_errors_threshold_critical | | string | `10` | no | | runtime_errors_threshold_warning | | string | `0` | no | | su_utilization_threshold_critical | | string | `80` | no | diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 8160547..16807c8 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -26,8 +26,6 @@ variable "service" { } # Global DataDog - - variable "message" { description = "Message sent when a Redis monitor is triggered" } @@ -44,33 +42,41 @@ variable "use_filter_tags" { # Azure Stream Analytics specific variable "su_utilization_threshold_warning" { + description = "Streaming Unit utilization rate limit (warning threshold)" default = 60 } variable "su_utilization_threshold_critical" { + description = "Streaming Unit utilization rate limit (critical threshold)" default = 80 } variable "function_requests_threshold_warning" { + description = "Failed Function Request rate limit (warning threshold)" default = 0 } variable "function_requests_threshold_critical" { + description = "Failed Function Request rate limit (critical 
threshold)" default = 10 } variable "conversion_errors_threshold_warning" { + description = "Conversion errors limit (warning threshold)" default = 0 } variable "conversion_errors_threshold_critical" { + description = "Conversion errors limit (critical threshold)" default = 10 } variable "runtime_errors_threshold_warning" { + description = "Runtime errors limit (warning threshold)" default = 0 } variable "runtime_errors_threshold_critical" { + description = "Runtime errors limit (critical threshold)" default = 10 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index e464dd4..0ecb513 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -41,8 +41,9 @@ resource "datadog_monitor" "failed_function_requests" { query = < ${var.function_requests_threshold_critical} + avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / + avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + ) * 100 > ${var.function_requests_threshold_critical} EOF type = "query alert" @@ -66,7 +67,8 @@ resource "datadog_monitor" "failed_function_requests" { resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" - message = "${var.message}" + # Hard Coded Message while we don't know how to configure warning and critical thresholds + message = "@FR-CloudPublic-run@fr.clara.net" query = < Date: Fri, 3 Nov 2017 20:35:35 +0100 Subject: [PATCH 28/93] MON-80 add tags --- cloud/azure/iothubs/README.md | 6 +- cloud/azure/iothubs/inputs.tf | 34 +++++---- cloud/azure/iothubs/monitors-iothubs.tf | 96 ++++++++++++++++--------- 3 files changed, 85 
insertions(+), 51 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 3d6bb91..a0e4be5 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -9,10 +9,7 @@ module "iothubs" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" - stack = "${var.stack}" - client_name = "${var.client_name}" subscription_id = "${var.subscription_id}" } ``` @@ -48,7 +45,6 @@ Inputs | c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | | c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | | c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| client_name | Client Name | string | - | yes | | d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no | | d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no | | d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no | @@ -77,4 +73,4 @@ Related documentation DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub) -Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) \ No newline at end of file +Azure IOT Hubs metrics documentation: 
[https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index d04d03b..1efabc3 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,23 +1,26 @@ +# Global Terraform variable "environment" { description = "Architecture Environment" type = "string" } -variable "client_name" { - description = "Client Name" - type = "string" -} - -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" -} - variable "subscription_id" { - description = "Subscription ID used to tag monitors" - type = "string" + description = "Azure account id used as filter for monitors" + type = "string" } +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "storage" + +# Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" default = 600 @@ -27,7 +30,12 @@ variable "message" { description = "Message sent when an alert is triggered" } -## IOT hubs +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} + +# Azure IOT hubs specific variable "jobs_failed_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" default = 0 diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 1ee29a3..4398f5f 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_appservices:enabled,subscription_id:%s,env:%s", var.subscription_id,var.environment) : var.subscription_id}" + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "${data.template_file.filter.rendered}"}" } } @@ -12,9 +12,9 @@ resource "datadog_monitor" "too_many_jobs_failed" { query = < ${var.jobs_failed_threshold_critical} EOF type = "query alert" @@ -34,6 +34,8 @@ resource "datadog_monitor" "too_many_jobs_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_list_jobs_failed" { @@ -42,9 +44,9 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { query = < ${var.listjobs_failed_threshold_critical} EOF type = "query alert" @@ -64,6 +66,8 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_query_jobs_failed" { @@ -72,9 +76,9 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { query = < ${var.queryjobs_failed_threshold_critical} EOF type = "query alert" @@ -94,6 +98,8 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "status" { @@ -101,7 +107,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.c2d_methods_failed_threshold_critical} EOF type = "query alert" @@ -166,6 +176,8 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = 
["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { @@ -174,9 +186,9 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { query = < ${var.c2d_twin_read_failed_threshold_critical} EOF type = "query alert" @@ -196,6 +208,8 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { @@ -204,9 +218,9 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { query = < ${var.c2d_twin_update_failed_threshold_critical} EOF type = "query alert" @@ -226,6 +240,8 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_read_failed" { @@ -234,9 +250,9 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { query = < ${var.d2c_twin_read_failed_threshold_critical} EOF type = "query alert" @@ -256,6 +272,8 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { @@ -264,9 +282,9 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { query = < ${var.d2c_twin_update_failed_threshold_critical} EOF type = "query alert" @@ -286,6 +304,8 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } 
resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { @@ -294,7 +314,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { query = < ${var.d2c_telemetry_egress_dropped_threshold_critical} EOF type = "query alert" @@ -314,6 +334,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { @@ -322,7 +344,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { query = < ${var.d2c_telemetry_egress_orphaned_threshold_critical} EOF type = "query alert" @@ -342,6 +364,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { @@ -350,7 +374,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { query = < ${var.d2c_telemetry_egress_invalid_threshold_critical} EOF type = "query alert" @@ -370,6 +394,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { @@ -378,7 +404,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { query = < ${var.d2c_telemetry_egress_fallback_threshold_critical} EOF type = "query alert" @@ -398,6 +424,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = 
["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { @@ -406,8 +434,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { query = < 0 EOF type = "query alert" @@ -422,4 +450,6 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } From 2593e5fac4e9da58531e7195af53e5c62802c424 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 3 Nov 2017 20:47:29 +0100 Subject: [PATCH 29/93] MON-80 update readme --- cloud/azure/iothubs/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index a0e4be5..362e226 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -64,8 +64,10 @@ Inputs | listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | | listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | | message | Message sent when an alert is triggered | string | - | yes | +| provider | What is the monitored provider | string | azure | no | | queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | | queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| service | What is the monitored service | string | storage | no | | subscription_id | Subscription ID used to tag monitors | string | - | yes | Related documentation From 31f033c35d49109f39bc2e00337c78003fec721e Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 3 Nov 2017 20:51:18 +0100 Subject: [PATCH 30/93] MON-78 update readme --- cloud/azure/stream-analytics/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index 28e3e2b..dca299b 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -26,10 +26,12 @@ Inputs | function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | | function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | | message | Message sent when a monitor is triggered | string | - | yes | +| provider | What is the monitored provider | string | azure | no | | runtime_errors_threshold_critical | | string | `10` | no | | runtime_errors_threshold_warning | | string | `0` | no | | su_utilization_threshold_critical | | string | `80` | no | | su_utilization_threshold_warning | Monitor specific | string | `60` | no | +| service | What is the monitored service | string | storage | no | | subscription_id | Azure account id used as filter for monitors | string | - | yes | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | From e0fa47008ae60aa0ba97eb6b2c33d40b0c2e596a Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 6 Nov 2017 10:30:00 +0100 Subject: [PATCH 31/93] MON-80 Update variables' names --- cloud/azure/iothubs/README.md | 57 ++++----- cloud/azure/iothubs/inputs.tf | 125 ++++++++++---------- cloud/azure/iothubs/monitors-iothubs.tf | 147 +++++++++++++----------- 3 files changed, 173 insertions(+), 156 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 362e226..339b357 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -1,5 +1,5 @@ Azure IOT Hubs DataDog monitors -============================ +=============================== How to use this module ---------------------- @@ -39,36 +39,37 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| 
c2d_methods_failed_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | -| c2d_methods_failed_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | -| c2d_twin_read_failed_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no | -| d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_fallback_threshold_warning | D2C Telemetry Fallback Failed limit (warning threshold) | string | `500` | no | -| d2c_telemetry_egress_invalid_threshold_critical | D2C Telemetry Invalid Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_invalid_threshold_warning | D2C Telemetry Invalid Failed limit (warning threshold) | string | `500` | no | -| d2c_telemetry_egress_orphaned_threshold_critical | D2C Telemetry Orphaned Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_orphaned_threshold_warning | D2C Telemetry Orphaned Failed limit (warning threshold) | string | `500` | no | -| d2c_twin_read_failed_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| d2c_twin_read_failed_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) 
| string | `0` | no | -| d2c_twin_update_failed_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| d2c_twin_update_failed_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | +| dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | +| dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | | environment | Architecture Environment | string | - | yes | -| jobs_failed_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | -| jobs_failed_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | -| listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | -| listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| 
failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | +| fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | +| invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | | message | Message sent when an alert is triggered | string | - | yes | -| provider | What is the monitored provider | string | azure | no | -| queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | -| queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | -| service | What is the monitored service | string | storage | no | -| 
subscription_id | Subscription ID used to tag monitors | string | - | yes | +| orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | +| orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | +| provider | Cloud provider which the monitor and its based metric depend on | string | `azure` | no | +| service | Service monitored by this set of monitors | string | `storage` | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation --------------------- diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 1efabc3..01c77fb 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -6,24 +6,25 @@ variable "environment" { variable "subscription_id" { description = "Azure account id used as filter for monitors" - type = "string" + type = "string" } variable "provider" { description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" + type = "string" + default = "azure" } variable "service" { description = "Service monitored by this set of monitors" - type = "string" - default = "storage" + type = "string" + default = "storage" +} # Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" - default = 600 + default = 600 } variable "message" { @@ -36,122 +37,122 @@ variable "use_filter_tags" { } # Azure IOT hubs specific -variable "jobs_failed_threshold_warning" { +variable "failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "jobs_failed_threshold_critical" { +variable "failed_jobs_rate_threshold_critical" { description = "Jobs Failed rate limit (critical threshold)" - 
default = 10 + default = 10 } -variable "listjobs_failed_threshold_warning" { +variable "failed_listjobs_rate_threshold_warning" { description = "ListJobs Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "listjobs_failed_threshold_critical" { +variable "failed_listjobs_rate_threshold_critical" { description = "ListJobs Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "queryjobs_failed_threshold_warning" { +variable "failed_queryjobs_rate_threshold_warning" { description = "QueryJobs Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "queryjobs_failed_threshold_critical" { +variable "failed_queryjobs_rate_threshold_critical" { description = "QueryJobs Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "c2d_methods_failed_threshold_warning" { +variable "failed_c2d_methods_rate_threshold_warning" { description = "C2D Methods Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "c2d_methods_failed_threshold_critical" { +variable "failed_c2d_methods_rate_threshold_critical" { description = "C2D Methods Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "c2d_twin_read_failed_threshold_warning" { +variable "failed_c2d_twin_read_rate_threshold_warning" { description = "C2D Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "c2d_twin_read_failed_threshold_critical" { +variable "failed_c2d_twin_read_rate_threshold_critical" { description = "C2D Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "c2d_twin_update_failed_threshold_warning" { +variable "failed_c2d_twin_update_rate_threshold_warning" { description = "C2D Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "c2d_twin_update_failed_threshold_critical" { +variable "failed_c2d_twin_update_rate_threshold_critical" { description 
= "C2D Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "d2c_twin_read_failed_threshold_warning" { +variable "failed_d2c_twin_read_rate_threshold_warning" { description = "D2C Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "d2c_twin_read_failed_threshold_critical" { +variable "failed_d2c_twin_read_rate_threshold_critical" { description = "D2C Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "d2c_twin_update_failed_threshold_warning" { +variable "failed_d2c_twin_update_rate_threshold_warning" { description = "D2C Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "d2c_twin_update_failed_threshold_critical" { +variable "failed_d2c_twin_update_rate_threshold_critical" { description = "D2C Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "d2c_telemetry_egress_dropped_threshold_warning" { - description = "D2C Telemetry Dropped Failed limit (warning threshold)" - default = 500 +variable "dropped_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Dropped limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_dropped_threshold_critical" { - description = "D2C Telemetry Dropped Failed limit (critical threshold)" - default = 1000 +variable "dropped_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Dropped limit (critical threshold)" + default = 1000 } -variable "d2c_telemetry_egress_orphaned_threshold_warning" { - description = "D2C Telemetry Orphaned Failed limit (warning threshold)" - default = 500 +variable "orphaned_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Orphaned limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_orphaned_threshold_critical" { - description = "D2C Telemetry Orphaned Failed limit (critical threshold)" - default = 1000 
+variable "orphaned_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Orphaned limit (critical threshold)" + default = 1000 } -variable "d2c_telemetry_egress_invalid_threshold_warning" { - description = "D2C Telemetry Invalid Failed limit (warning threshold)" - default = 500 +variable "invalid_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Invalid limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_invalid_threshold_critical" { - description = "D2C Telemetry Invalid Failed limit (critical threshold)" - default = 1000 +variable "invalid_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Invalid limit (critical threshold)" + default = 1000 } -variable "d2c_telemetry_egress_fallback_threshold_warning" { - description = "D2C Telemetry Fallback Failed limit (warning threshold)" - default = 500 +variable "fallback_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Fallback limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_fallback_threshold_critical" { - description = "D2C Telemetry Fallback Failed limit (critical threshold)" - default = 1000 +variable "fallback_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Fallback limit (critical threshold)" + default = 1000 } diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 4398f5f..d7fb7e3 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -15,13 +15,14 @@ resource "datadog_monitor" "too_many_jobs_failed" { avg:azure.devices_iothubs.jobs.failed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 
> ${var.jobs_failed_threshold_critical} + ) * 100 > ${var.failed_jobs_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.jobs_failed_threshold_warning}" - critical = "${var.jobs_failed_threshold_critical}" + warning = "${var.failed_jobs_rate_threshold_warning}" + critical = "${var.failed_jobs_rate_threshold_critical}" } notify_no_data = false @@ -35,7 +36,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_list_jobs_failed" { @@ -47,13 +48,14 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { avg:azure.devices_iothubs.jobs.list_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() ) - ) * 100 > ${var.listjobs_failed_threshold_critical} + ) * 100 > ${var.failed_listjobs_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.listjobs_failed_threshold_warning}" - critical = "${var.listjobs_failed_threshold_critical}" + warning = "${var.failed_listjobs_rate_threshold_warning}" + critical = "${var.failed_listjobs_rate_threshold_critical}" } notify_no_data = false @@ -67,7 +69,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" 
"too_many_query_jobs_failed" { @@ -79,13 +81,14 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { avg:azure.devices_iothubs.jobs.query_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() ) - ) * 100 > ${var.queryjobs_failed_threshold_critical} + ) * 100 > ${var.failed_queryjobs_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.queryjobs_failed_threshold_warning}" - critical = "${var.queryjobs_failed_threshold_critical}" + warning = "${var.failed_queryjobs_rate_threshold_warning}" + critical = "${var.failed_queryjobs_rate_threshold_critical}" } notify_no_data = false @@ -99,7 +102,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "status" { @@ -109,7 +112,8 @@ resource "datadog_monitor" "status" { query = < ${var.c2d_methods_failed_threshold_critical} + ) * 100 > ${var.failed_c2d_methods_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.c2d_methods_failed_threshold_warning}" - critical = "${var.c2d_methods_failed_threshold_critical}" + warning = "${var.failed_c2d_methods_rate_threshold_warning}" + critical = "${var.failed_c2d_methods_rate_threshold_critical}" } notify_no_data = false @@ -177,7 +183,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = 
["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { @@ -189,13 +195,14 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { avg:azure.devices_iothubs.c2d.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.c2d_twin_read_failed_threshold_critical} + ) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.c2d_twin_read_failed_threshold_warning}" - critical = "${var.c2d_twin_read_failed_threshold_critical}" + warning = "${var.failed_c2d_twin_read_rate_threshold_warning}" + critical = "${var.failed_c2d_twin_read_rate_threshold_critical}" } notify_no_data = false @@ -209,7 +216,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { @@ -221,13 +228,14 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { avg:azure.devices_iothubs.c2d.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 
100 > ${var.c2d_twin_update_failed_threshold_critical} + ) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.c2d_twin_update_failed_threshold_warning}" - critical = "${var.c2d_twin_update_failed_threshold_critical}" + warning = "${var.failed_c2d_twin_update_rate_threshold_warning}" + critical = "${var.failed_c2d_twin_update_rate_threshold_critical}" } notify_no_data = false @@ -241,7 +249,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_read_failed" { @@ -253,13 +261,14 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { avg:azure.devices_iothubs.d2c.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.d2c_twin_read_failed_threshold_critical} + ) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_twin_read_failed_threshold_warning}" - critical = "${var.d2c_twin_read_failed_threshold_critical}" + warning = "${var.failed_d2c_twin_read_rate_threshold_warning}" + critical = "${var.failed_d2c_twin_read_rate_threshold_critical}" } notify_no_data = false @@ -273,7 +282,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + 
tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { @@ -285,13 +294,14 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { avg:azure.devices_iothubs.d2c.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.d2c_twin_update_failed_threshold_critical} + ) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_twin_update_failed_threshold_warning}" - critical = "${var.d2c_twin_update_failed_threshold_critical}" + warning = "${var.failed_d2c_twin_update_rate_threshold_warning}" + critical = "${var.failed_d2c_twin_update_rate_threshold_critical}" } notify_no_data = false @@ -305,7 +315,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { @@ -315,13 +325,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { query = < ${var.d2c_telemetry_egress_dropped_threshold_critical} + ) > ${var.dropped_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_dropped_threshold_warning}" - critical = "${var.d2c_telemetry_egress_dropped_threshold_critical}" + warning = "${var.dropped_d2c_telemetry_egress_threshold_warning}" + critical = 
"${var.dropped_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -335,7 +346,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { @@ -345,13 +356,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { query = < ${var.d2c_telemetry_egress_orphaned_threshold_critical} + ) > ${var.orphaned_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_orphaned_threshold_warning}" - critical = "${var.d2c_telemetry_egress_orphaned_threshold_critical}" + warning = "${var.orphaned_d2c_telemetry_egress_threshold_warning}" + critical = "${var.orphaned_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -365,7 +377,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { @@ -375,13 +387,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { query = < ${var.d2c_telemetry_egress_invalid_threshold_critical} + ) > ${var.invalid_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_invalid_threshold_warning}" - critical = "${var.d2c_telemetry_egress_invalid_threshold_critical}" + warning = "${var.invalid_d2c_telemetry_egress_threshold_warning}" + critical = 
"${var.invalid_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -395,7 +408,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { @@ -405,13 +418,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { query = < ${var.d2c_telemetry_egress_fallback_threshold_critical} + ) > ${var.fallback_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_fallback_threshold_warning}" - critical = "${var.d2c_telemetry_egress_fallback_threshold_critical}" + warning = "${var.fallback_d2c_telemetry_egress_threshold_warning}" + critical = "${var.fallback_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -425,7 +439,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { @@ -438,7 +452,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { avg:azure.devices_iothubs.d2c.telemetry.ingress.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) > 0 EOF - type = "query alert" + + type = "query alert" notify_no_data = false evaluation_delay = "${var.delay}" @@ -451,5 +466,5 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = 
["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } From 279778ed888f891f8a30d033b004275757c904ff Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 15:12:54 +0100 Subject: [PATCH 32/93] MON-80 Normalize monitors --- cloud/azure/iothubs/README.md | 11 ++--- cloud/azure/iothubs/inputs.tf | 26 +++-------- cloud/azure/iothubs/monitors-iothubs.tf | 62 ++++++++++++------------- 3 files changed, 42 insertions(+), 57 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 339b357..5187715 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -8,9 +8,8 @@ How to use this module module "iothubs" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" - message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" - subscription_id = "${var.subscription_id}" + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" } ``` @@ -61,15 +60,13 @@ Inputs | failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | | fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | | invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning 
threshold) | string | `500` | no | | message | Message sent when an alert is triggered | string | - | yes | | orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | | orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | -| provider | Cloud provider which the monitor and its based metric depend on | string | `azure` | no | -| service | Service monitored by this set of monitors | string | `storage` | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation --------------------- diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 01c77fb..1b1348f 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -4,23 +4,6 @@ variable "environment" { type = "string" } -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - default = "storage" -} - # Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" @@ -31,11 +14,16 @@ variable "message" { description = "Message sent when an alert is triggered" } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + # Azure IOT hubs specific variable 
"failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index d7fb7e3..6e1f926 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "${data.template_file.filter.rendered}"}" + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_iothub:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } @@ -18,7 +18,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { ) * 100 > ${var.failed_jobs_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_jobs_rate_threshold_warning}" @@ -36,7 +36,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_list_jobs_failed" { @@ -51,7 +51,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { ) * 100 > ${var.failed_listjobs_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_listjobs_rate_threshold_warning}" @@ -69,7 +69,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_query_jobs_failed" { @@ -84,7 +84,7 @@ resource "datadog_monitor" 
"too_many_query_jobs_failed" { ) * 100 > ${var.failed_queryjobs_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_queryjobs_rate_threshold_warning}" @@ -102,7 +102,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "status" { @@ -113,7 +113,7 @@ resource "datadog_monitor" "status" { avg(last_5m):avg:azure.devices_iothubs.status{${data.template_file.filter.rendered}} by {name,resource_group} < 1 EOF - type = "query alert" + type = "metric alert" notify_no_data = true evaluation_delay = "${var.delay}" @@ -126,7 +126,7 @@ resource "datadog_monitor" "status" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "total_devices" { @@ -137,7 +137,7 @@ resource "datadog_monitor" "total_devices" { avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{${data.template_file.filter.rendered}} by {name,resource_group} == 0 EOF - type = "query alert" + type = "metric alert" notify_no_data = true evaluation_delay = "${var.delay}" @@ -150,7 +150,7 @@ resource "datadog_monitor" "total_devices" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_c2d_methods_failed" { @@ -165,7 +165,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { ) * 100 > ${var.failed_c2d_methods_rate_threshold_critical} EOF - type = "query alert" + 
type = "metric alert" thresholds { warning = "${var.failed_c2d_methods_rate_threshold_warning}" @@ -183,7 +183,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { @@ -198,7 +198,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { ) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_c2d_twin_read_rate_threshold_warning}" @@ -216,7 +216,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { @@ -231,7 +231,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { ) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_c2d_twin_update_rate_threshold_warning}" @@ -249,7 +249,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_twin_read_failed" { @@ -264,7 +264,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { ) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = 
"${var.failed_d2c_twin_read_rate_threshold_warning}" @@ -282,7 +282,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { @@ -297,7 +297,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { ) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_d2c_twin_update_rate_threshold_warning}" @@ -315,7 +315,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { @@ -328,7 +328,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { ) > ${var.dropped_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.dropped_d2c_telemetry_egress_threshold_warning}" @@ -346,7 +346,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { @@ -359,7 +359,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { ) > ${var.orphaned_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = 
"${var.orphaned_d2c_telemetry_egress_threshold_warning}" @@ -377,7 +377,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { @@ -390,7 +390,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { ) > ${var.invalid_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.invalid_d2c_telemetry_egress_threshold_warning}" @@ -408,7 +408,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { @@ -421,7 +421,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { ) > ${var.fallback_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.fallback_d2c_telemetry_egress_threshold_warning}" @@ -439,7 +439,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { @@ -453,7 +453,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { ) > 0 EOF - type = "query alert" + type = "metric alert" notify_no_data = false evaluation_delay = "${var.delay}" @@ -466,5 
+466,5 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } From 8afae8b5f44cf60a04a4a6c22e6da5414553a129 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 15:51:23 +0100 Subject: [PATCH 33/93] MON-78 Normalize monitors & add status monitor --- cloud/azure/stream-analytics/inputs.tf | 32 +++------ .../monitors-stream-analytics.tf | 69 ++++++++++++------- 2 files changed, 53 insertions(+), 48 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 16807c8..ae1186a 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -4,27 +4,6 @@ variable "environment" { type = "string" } -variable "message" { - description = "Message sent when a monitor is triggered" -} - -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - default = "storage" -} - # Global DataDog variable "message" { description = "Message sent when a Redis monitor is triggered" @@ -35,11 +14,16 @@ variable "delay" { default = 600 } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + # Azure Stream Analytics specific variable 
"su_utilization_threshold_warning" { description = "Streaming Unit utilization rate limit (warning threshold)" @@ -56,7 +40,7 @@ variable "function_requests_threshold_warning" { default = 0 } -variable "function_requests_threshold_critical" { +variable "failed_function_requests_threshold_critical" { description = "Failed Function Request rate limit (critical threshold)" default = 10 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 0ecb513..f72af1f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,12 +2,35 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_streamanalytics:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } +resource "datadog_monitor" "status" { + name = "[${var.environment}] Stream Analytics Status is not ok on {{name}}" + message = "${var.message}" + + query = < ${var.su_utilization_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -32,22 +55,22 @@ resource "datadog_monitor" "su_utilization" { critical = "${var.su_utilization_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment}] Stream Analytics more than 
${var.function_requests_threshold_critical} failed function requests on {{name}}" + name = "[${var.environment}] Stream Analytics more than ${var.failed_function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" query = < ${var.function_requests_threshold_critical} + ) * 100 > ${var.failed_function_requests_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 60 notify_audit = false @@ -59,27 +82,26 @@ resource "datadog_monitor" "failed_function_requests" { no_data_timeframe = 20 thresholds { warning = "${var.function_requests_threshold_warning}" - critical = "${var.function_requests_threshold_critical}" + critical = "${var.failed_function_requests_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" - # Hard Coded Message while we don't know how to configure warning and critical thresholds - message = "@FR-CloudPublic-run@fr.clara.net" + message = "${var.message}" query = < ${var.conversion_errors_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -92,24 +114,23 @@ resource "datadog_monitor" "conversion_errors" { critical = "${var.conversion_errors_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", 
"provider:azure"] } resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] Stream Analytics more than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" - # Hard Coded Message while we don't know how to configure warning and critical thresholds - message = "@FR-CloudPublic-run@fr.clara.net" + message = "${var.message}" query = < ${var.runtime_errors_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -122,5 +143,5 @@ resource "datadog_monitor" "runtime_errors" { critical = "${var.runtime_errors_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } From d2e1aa5efddea62258790c9b0afff8dea0d51cf4 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 30 Oct 2017 11:40:16 +0100 Subject: [PATCH 34/93] MON-77 Azure Event Hub monitors --- cloud/azure/eventhub/inputs.tf | 31 ++++++++ cloud/azure/eventhub/monitors-eventhub.tf | 86 +++++++++++++++++++++++ cloud/azure/eventhub/outputs.tf | 11 +++ 3 files changed, 128 insertions(+) create mode 100644 cloud/azure/eventhub/inputs.tf create mode 100644 cloud/azure/eventhub/monitors-eventhub.tf create mode 100644 cloud/azure/eventhub/outputs.tf diff --git a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf new file mode 100644 index 0000000..a1c7ec4 --- /dev/null +++ b/cloud/azure/eventhub/inputs.tf @@ -0,0 +1,31 @@ +variable "environment" {} + +variable "down_message" {} + +variable "failed_requests_message" {} + +variable "errors_message" {} + +variable "delay" { + default = 600 +} + +variable "failed_requests_rate_thresold_critical" { + default = 5 +} + +variable 
"failed_requests_rate_thresold_warning" { + default = 3 +} + +variable "errors_rate_thresold_critical" { + default = 5 +} + +variable "errors_rate_thresold_warning" { + default = 3 +} + +variable "use_filter_tags" { + default = "true" +} diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf new file mode 100644 index 0000000..7c22418 --- /dev/null +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -0,0 +1,86 @@ +resource "datadog_monitor" "eventhub_status" { + name = "[${var.environment}] Event Hub status" + message = "${var.down_message}" + + query = < ${var.failed_requests_rate_thresold_critical} + EOF + type = "query alert" + + thresholds { + critical = "${var.failed_requests_rate_thresold_critical}" + warning = "${var.failed_requests_rate_thresold_warning}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "eventhub_errors" { + name = "[${var.environment}] Event Hub errors" + message = "${var.errors_message}" + + query = < ${var.errors_rate_thresold_critical} + EOF + type = "query alert" + + thresholds { + critical = "${var.errors_rate_thresold_critical}" + warning = "${var.errors_rate_thresold_warning}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} diff --git a/cloud/azure/eventhub/outputs.tf b/cloud/azure/eventhub/outputs.tf new file mode 100644 index 0000000..b9d1822 --- /dev/null +++ b/cloud/azure/eventhub/outputs.tf @@ -0,0 +1,11 @@ +output "status_monitor_id" { + value = "${datadog_monitor.eventhub_failed_requests.id}" +} + +output "failed_requests_monitor_id" { + 
value = "${datadog_monitor.eventhub_status.id}" +} + +output "errors_monitor_id" { + value = "${datadog_monitor.eventhub_errors.id}" +} From 15549efc52e50e03a0b2d5165bdf41a121607947 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 30 Oct 2017 17:49:02 +0100 Subject: [PATCH 35/93] MON-77 Use data template for tag filter --- cloud/azure/eventhub/monitors-eventhub.tf | 31 +++++++++++++++-------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 7c22418..71b97b3 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -1,9 +1,18 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}" + } +} + + resource "datadog_monitor" "eventhub_status" { name = "[${var.environment}] Event Hub status" message = "${var.down_message}" query = < ${var.failed_requests_rate_thresold_critical} EOF type = "query alert" @@ -56,14 +65,14 @@ resource "datadog_monitor" "eventhub_errors" { query = < ${var.errors_rate_thresold_critical} EOF type = "query alert" From 3330aeb9dcb6574deea57d671fcf5faa9cfa528e Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 30 Oct 2017 18:00:12 +0100 Subject: [PATCH 36/93] MON-77 Fix tag filters --- cloud/azure/eventhub/monitors-eventhub.tf | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 71b97b3..2b67590 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -12,7 +12,7 @@ resource "datadog_monitor" "eventhub_status" { message = "${var.down_message}" query = < ${var.failed_requests_rate_thresold_critical} EOF type = "query alert" @@ -65,14 +65,14 @@ resource 
"datadog_monitor" "eventhub_errors" { query = < ${var.errors_rate_thresold_critical} EOF type = "query alert" From 1768c1621f7fbcce64464c5aa4f19bd217fae538 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 30 Oct 2017 19:05:40 +0100 Subject: [PATCH 37/93] MON-77 Change monitor type to to fix it --- cloud/azure/eventhub/monitors-eventhub.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 2b67590..7600215 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -14,7 +14,7 @@ resource "datadog_monitor" "eventhub_status" { query = < Date: Tue, 31 Oct 2017 08:51:34 +0100 Subject: [PATCH 38/93] MON-77 Some documentation & lower thresold levels --- cloud/azure/eventhub/README.md | 53 +++++++++++++++++++++++ cloud/azure/eventhub/inputs.tf | 26 ++++++----- cloud/azure/eventhub/monitors-eventhub.tf | 12 ++--- 3 files changed, 74 insertions(+), 17 deletions(-) create mode 100644 cloud/azure/eventhub/README.md diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md new file mode 100644 index 0000000..a148377 --- /dev/null +++ b/cloud/azure/eventhub/README.md @@ -0,0 +1,53 @@ +Event Hub Datadog monitor +========================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-eventhub" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/eventhub?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a Datadog monitor with the following checks : + +* Service status check +* Failed request ratio +* Erroneous requests ratio + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `600` | 
no | +| environment | Architecture environment | string | - | yes | +| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | +| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | +| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | +| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| message | Message sent when an alert is triggered | string | - | yes | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + +Outputs +------- + +| Name | Description | +|------|-------------| +| errors_monitor_id | Id of the `errors` monitor | +| failed_requests_monitor_id | Id of the `failed requests` monitor | +| status_monitor_id | Id of the `status` monitor | + +Related documentation +--------------------- + +Datadog documentation : [https://docs.datadoghq.com/integrations/azure_event_hub/](https://docs.datadoghq.com/integrations/azure_event_hub/) + +Azure metrics documentation : [https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-metrics-azure-monitor](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-metrics-azure-monitor) diff --git a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf index a1c7ec4..a67caae 100644 --- a/cloud/azure/eventhub/inputs.tf +++ b/cloud/azure/eventhub/inputs.tf @@ -1,31 +1,35 @@ variable "environment" {} -variable "down_message" {} - -variable "failed_requests_message" {} - -variable "errors_message" {} +variable "message" { + description = "Message sent when an alert is triggered" +} variable "delay" { + description = "Delay in seconds for the metric evaluation" default = 600 } variable "failed_requests_rate_thresold_critical" { - default = 5 + description = "Failed requests ratio (percentage) to trigger the critical 
alert" + default = 3 } variable "failed_requests_rate_thresold_warning" { - default = 3 + description = "Failed requests ratio (percentage) to trigger a warning alert" + default = 1 } variable "errors_rate_thresold_critical" { - default = 5 -} - -variable "errors_rate_thresold_warning" { + description = "Errors ratio (percentage) to trigger the critical alert" default = 3 } +variable "errors_rate_thresold_warning" { + description = "Errors ratio (percentage) to trigger a warning alert" + default = 1 +} + variable "use_filter_tags" { + description = "Filter the data with service tags if true" default = "true" } diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 7600215..efe1351 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -9,7 +9,7 @@ data "template_file" "filter" { resource "datadog_monitor" "eventhub_status" { name = "[${var.environment}] Event Hub status" - message = "${var.down_message}" + message = "${var.message}" query = < Date: Fri, 3 Nov 2017 20:41:57 +0100 Subject: [PATCH 39/93] MON-77 add tags and subscription_id --- cloud/azure/eventhub/README.md | 5 +++- cloud/azure/eventhub/inputs.tf | 34 +++++++++++++++++++---- cloud/azure/eventhub/monitors-eventhub.tf | 11 ++++++-- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md index a148377..6e40955 100644 --- a/cloud/azure/eventhub/README.md +++ b/cloud/azure/eventhub/README.md @@ -9,8 +9,8 @@ module "datadog-monitors-azure-eventhub" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/eventhub?ref={revision}" message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" + subscription_id = "${var.subscription_id}" } ``` @@ -29,12 +29,15 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | 
string | `600` | no | | environment | Architecture environment | string | - | yes | +| provider | What is the monitored provider | string | - | yes | +| service | What is the monitored service | string | - | yes | | errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | | errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | | failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | | failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | | message | Message sent when an alert is triggered | string | - | yes | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | Outputs ------- diff --git a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf index a67caae..d520dc2 100644 --- a/cloud/azure/eventhub/inputs.tf +++ b/cloud/azure/eventhub/inputs.tf @@ -1,5 +1,27 @@ -variable "environment" {} +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} +variable "subscription_id" { + description = "Azure account id used as filter for monitors" + type = "string" +} + +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "storage" +} + +# Global DataDog variable "message" { description = "Message sent when an alert is triggered" } @@ -9,6 +31,11 @@ variable "delay" { default = 600 } +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} + variable "failed_requests_rate_thresold_critical" { 
description = "Failed requests ratio (percentage) to trigger the critical alert" default = 3 @@ -28,8 +55,3 @@ variable "errors_rate_thresold_warning" { description = "Errors ratio (percentage) to trigger a warning alert" default = 1 } - -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" -} diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index efe1351..89a3d8a 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -2,11 +2,10 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}" + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" } } - resource "datadog_monitor" "eventhub_status" { name = "[${var.environment}] Event Hub status" message = "${var.message}" @@ -26,6 +25,8 @@ resource "datadog_monitor" "eventhub_status" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "eventhub_failed_requests" { @@ -57,6 +58,8 @@ resource "datadog_monitor" "eventhub_failed_requests" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "eventhub_errors" { @@ -91,5 +94,7 @@ resource "datadog_monitor" "eventhub_errors" { locked = false require_full_window = true new_host_delay = "${var.delay}" - no_data_timeframe = 20 + no_data_timeframe = 20o + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } From 205f3e963596dee548183e5a34ec081ab5e6df08 
Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 3 Nov 2017 20:48:42 +0100 Subject: [PATCH 40/93] MON-77 update readme --- cloud/azure/eventhub/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md index 6e40955..f4db2d6 100644 --- a/cloud/azure/eventhub/README.md +++ b/cloud/azure/eventhub/README.md @@ -29,15 +29,15 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| provider | What is the monitored provider | string | - | yes | -| service | What is the monitored service | string | - | yes | | errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | | errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | | failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | | failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | | message | Message sent when an alert is triggered | string | - | yes | +| provider | What is the monitored provider | string | azure | no | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | | subscription_id | Azure account id used as filter for monitors | string | - | yes | +| service | What is the monitored service | string | storage | no | Outputs ------- From 5df915df51e3f0d17badc0c38b9e6e76770e80fe Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 16:36:18 +0100 Subject: [PATCH 41/93] MON-77 Fix unattended char --- cloud/azure/eventhub/monitors-eventhub.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf 
index 89a3d8a..733e141 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -94,7 +94,7 @@ resource "datadog_monitor" "eventhub_errors" { locked = false require_full_window = true new_host_delay = "${var.delay}" - no_data_timeframe = 20o + no_data_timeframe = 20 tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } From 6c10a32ff3303db46f8da3b746a9f1df3a0b35ae Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 16:50:04 +0100 Subject: [PATCH 42/93] MON-77 Normalize monitors --- cloud/azure/eventhub/README.md | 6 ++---- cloud/azure/eventhub/inputs.tf | 26 ++++++----------------- cloud/azure/eventhub/monitors-eventhub.tf | 18 ++++++++-------- 3 files changed, 18 insertions(+), 32 deletions(-) diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md index f4db2d6..b2573da 100644 --- a/cloud/azure/eventhub/README.md +++ b/cloud/azure/eventhub/README.md @@ -33,11 +33,9 @@ Inputs | errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | | failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | | failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | -| provider | What is the monitored provider | string | azure | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | -| service | What is the monitored service | string | storage | no | Outputs ------- diff --git 
a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf index d520dc2..b41fdf5 100644 --- a/cloud/azure/eventhub/inputs.tf +++ b/cloud/azure/eventhub/inputs.tf @@ -4,23 +4,6 @@ variable "environment" { type = "string" } -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - default = "storage" -} - # Global DataDog variable "message" { description = "Message sent when an alert is triggered" @@ -31,11 +14,16 @@ variable "delay" { default = 600 } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + variable "failed_requests_rate_thresold_critical" { description = "Failed requests ratio (percentage) to trigger the critical alert" default = 3 diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 733e141..ff52507 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -2,12 +2,12 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } resource "datadog_monitor" "eventhub_status" { - name = "[${var.environment}] Event Hub status" + name = "[${var.environment}] Event Hub status is not ok on {{name}}" message = "${var.message}" query = < ${var.failed_requests_rate_thresold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { critical = "${var.failed_requests_rate_thresold_critical}" @@ -59,11 +59,11 @@ resource "datadog_monitor" "eventhub_failed_requests" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"] } resource "datadog_monitor" "eventhub_errors" { - name = "[${var.environment}] Event Hub errors" + name = "[${var.environment}] Event Hub too much errors on {{name}}" message = "${var.message}" query = < ${var.errors_rate_thresold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { critical = "${var.errors_rate_thresold_critical}" @@ -96,5 +96,5 @@ resource "datadog_monitor" "eventhub_errors" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"] } From 6e6147088cbb58c322031c3f2169001025a0cae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Mon, 30 Oct 2017 11:34:42 +0100 Subject: [PATCH 43/93] MON-76: Azure Redis - DataDog Monitors --- cloud/azure/redis/inputs.tf | 31 +++++++++++++++++ cloud/azure/redis/monitors-azure-redis.tf | 42 +++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 cloud/azure/redis/inputs.tf create mode 100644 cloud/azure/redis/monitors-azure-redis.tf diff --git a/cloud/azure/redis/inputs.tf 
b/cloud/azure/redis/inputs.tf new file mode 100644 index 0000000..70eba23 --- /dev/null +++ b/cloud/azure/redis/inputs.tf @@ -0,0 +1,31 @@ +# Global Terraform +variable "client_name" { + type = "string" +} + +variable "environment" { + type = "string" +} + +variable "stack" { + type = "string" +} + +# Global DataDog +variable "critical_escalation_group" { +} + +variable "warning_escalation_group" { +} + +variable "delay" { + default = 600 +} + +# Azure Redis specific +variable "evictedkeys_threshold_warning" { + default = 0 +} +variable "evictedkeys_threshold_critical" { + default = 100 +} diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf new file mode 100644 index 0000000..ec562b9 --- /dev/null +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -0,0 +1,42 @@ +resource "datadog_monitor" "status" { + name = "[${var.environment}] Redis {{name}} is down" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.cache_redis.status{*} by {name,resource_group} != 1" + type = "query alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "evictedkeys" { + name = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{*} by {name,resource_group} > ${var.evictedkeys_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.evictedkeys_threshold_warning}" + critical = "${var.evictedkeys_threshold_critical}" + } + + notify_no_data = 
false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} From 9112ce02a390dc783a85ee6a92b65239b6f35d6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Mon, 30 Oct 2017 16:37:03 +0100 Subject: [PATCH 44/93] MON-76: Uses the generic message --- cloud/azure/redis/inputs.tf | 5 +---- cloud/azure/redis/monitors-azure-redis.tf | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 70eba23..3f9460f 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -12,10 +12,7 @@ variable "stack" { } # Global DataDog -variable "critical_escalation_group" { -} - -variable "warning_escalation_group" { +variable "message" { } variable "delay" { diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index ec562b9..8b47249 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -1,6 +1,6 @@ resource "datadog_monitor" "status" { name = "[${var.environment}] Redis {{name}} is down" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.cache_redis.status{*} by {name,resource_group} != 1" type = "query alert" @@ -19,7 +19,7 @@ resource "datadog_monitor" "status" { resource "datadog_monitor" "evictedkeys" { name = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{*} by {name,resource_group} > 
${var.evictedkeys_threshold_critical}" type = "query alert" From 9f1051097e4b42b37f3814a7cb6d139f537ba280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Mon, 30 Oct 2017 17:44:30 +0100 Subject: [PATCH 45/93] MON-76: More monitors --- cloud/azure/redis/inputs.tf | 20 +++++--- cloud/azure/redis/monitors-azure-redis.tf | 58 +++++++++++++++++++++-- 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 3f9460f..f13b4cb 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -7,10 +7,6 @@ variable "environment" { type = "string" } -variable "stack" { - type = "string" -} - # Global DataDog variable "message" { } @@ -20,9 +16,21 @@ variable "delay" { } # Azure Redis specific -variable "evictedkeys_threshold_warning" { +variable "evictedkeys_limit_threshold_warning" { default = 0 } -variable "evictedkeys_threshold_critical" { +variable "evictedkeys_limit_threshold_critical" { default = 100 } +variable "percent_processor_time_threshold_critical" { + default = 80 +} +variable "percent_processor_time_threshold_warning" { + default = 60 +} +variable "server_load_rate_threshold_critical" { + default = 90 +} +variable "server_load_rate_threshold_warning" { + default = 70 +} diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 8b47249..d4b21b5 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -7,7 +7,7 @@ resource "datadog_monitor" "status" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -21,17 +21,65 @@ resource "datadog_monitor" "evictedkeys" { name = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{*} by {name,resource_group} > 
${var.evictedkeys_threshold_critical}" + query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{*} by {name,resource_group} > ${var.evictedkeys_limit_threshold_critical}" type = "query alert" thresholds { - warning = "${var.evictedkeys_threshold_warning}" - critical = "${var.evictedkeys_threshold_critical}" + warning = "${var.evictedkeys_limit_threshold_warning}" + critical = "${var.evictedkeys_limit_threshold_critical}" } notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "percent_processor_time" { + name = "[${var.environment}] Redis processor time {{value}}% on {{name}}" + message = "${var.message}" + + query = "avg(last_5m):avg:azure.cache_redis.percent_processor_time{*} by {name,resource_group} > ${var.percent_processor_time_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.percent_processor_time_threshold_warning}" + critical = "${var.percent_processor_time_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "server_load" { + name = "[${var.environment}] Redis processor server load {{value}}% on {{name}}" + message = "${var.message}" + + query = "avg(last_5m):avg:azure.cache_redis.server_load{*} by {name,resource_group} > ${var.server_load_rate_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.server_load_rate_threshold_critical}" + critical = "${var.server_load_rate_threshold_warning}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 notify_audit = false 
timeout_h = 0 include_tags = true From 386ad343a54753b73956e831c28cce39f14088ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 10:34:48 +0100 Subject: [PATCH 46/93] MON-76: Filter tags option --- cloud/azure/redis/inputs.tf | 12 ++++++++++++ cloud/azure/redis/monitors-azure-redis.tf | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index f13b4cb..a96cc51 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -9,28 +9,40 @@ variable "environment" { # Global DataDog variable "message" { + description = "Message sent when a Redis monitor is triggered" } variable "delay" { + description = "Delay in seconds for the metric evaluation" default = 600 } +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} + # Azure Redis specific variable "evictedkeys_limit_threshold_warning" { default = 0 } + variable "evictedkeys_limit_threshold_critical" { default = 100 } + variable "percent_processor_time_threshold_critical" { default = 80 } + variable "percent_processor_time_threshold_warning" { default = 60 } + variable "server_load_rate_threshold_critical" { default = 90 } + variable "server_load_rate_threshold_warning" { default = 70 } diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index d4b21b5..6931afe 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -1,8 +1,16 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}" + } +} + resource "datadog_monitor" "status" { name = "[${var.environment}] Redis {{name}} is down" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.status{*} by {name,resource_group} != 1" + query = "avg(last_5m):avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {name,resource_group} != 1" type = "query alert" notify_no_data = false @@ -21,7 +29,7 @@ resource "datadog_monitor" "evictedkeys" { name = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{*} by {name,resource_group} > ${var.evictedkeys_limit_threshold_critical}" + query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.evictedkeys_limit_threshold_critical}" type = "query alert" thresholds { @@ -45,7 +53,7 @@ resource "datadog_monitor" "percent_processor_time" { name = "[${var.environment}] Redis processor time {{value}}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.percent_processor_time{*} by {name,resource_group} > ${var.percent_processor_time_threshold_critical}" + query = "avg(last_5m):avg:azure.cache_redis.percent_processor_time{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.percent_processor_time_threshold_critical}" type = "query alert" thresholds { @@ -69,7 +77,7 @@ resource "datadog_monitor" "server_load" { name = "[${var.environment}] Redis processor server load {{value}}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.server_load{*} by {name,resource_group} > ${var.server_load_rate_threshold_critical}" + query = "avg(last_5m):avg:azure.cache_redis.server_load{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.server_load_rate_threshold_critical}" type = "query alert" 
thresholds { From 8aab6d99b025ff3bf4375a8cc7310ac65edfe749 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 10:38:20 +0100 Subject: [PATCH 47/93] MON-76: Multiple line queries for better readibility --- cloud/azure/redis/monitors-azure-redis.tf | 26 +++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 6931afe..b3ad63a 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -10,7 +10,9 @@ resource "datadog_monitor" "status" { name = "[${var.environment}] Redis {{name}} is down" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {name,resource_group} != 1" + query = < ${var.evictedkeys_limit_threshold_critical} +EOF type = "query alert" thresholds { @@ -53,7 +59,11 @@ resource "datadog_monitor" "percent_processor_time" { name = "[${var.environment}] Redis processor time {{value}}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.percent_processor_time{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.percent_processor_time_threshold_critical}" + query = < ${var.percent_processor_time_threshold_critical} +EOF type = "query alert" thresholds { @@ -77,12 +87,16 @@ resource "datadog_monitor" "server_load" { name = "[${var.environment}] Redis processor server load {{value}}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.server_load{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.server_load_rate_threshold_critical}" + query = < ${var.server_load_rate_threshold_critical} +EOF type = "query alert" thresholds { - warning = "${var.server_load_rate_threshold_critical}" - critical = "${var.server_load_rate_threshold_warning}" + warning = 
"${var.server_load_rate_threshold_warning}" + critical = "${var.server_load_rate_threshold_critical}" } notify_no_data = false From 505e0df14c86116c1c30a91d64fbc6f352bc2a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 10:53:58 +0100 Subject: [PATCH 48/93] MON-76: Add Readme --- cloud/azure/redis/README.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 cloud/azure/redis/README.md diff --git a/cloud/azure/redis/README.md b/cloud/azure/redis/README.md new file mode 100644 index 0000000..b5acaaa --- /dev/null +++ b/cloud/azure/redis/README.md @@ -0,0 +1,32 @@ +Azure Redis DataDog monitors +============================ + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-redis" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/redis?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + client_name = "${var.client_name}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Service status check +* Evicted keys count check +* Processor time (percent) threshold +* Server CPU load threshold + +Related documentation +--------------------- + +DataDog documentation: https://docs.datadoghq.com/integrations/azure_redis_cache/ + +Azure Redis metrics documentation: https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor From 814ee2838da545e1fd75592038e4f7e7fe2dd4f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:04:38 +0100 Subject: [PATCH 49/93] MON-76: Readme update with inputs --- cloud/azure/redis/README.md | 17 +++++++++++++++ cloud/azure/redis/inputs.tf | 26 +++++++++++++++-------- cloud/azure/redis/monitors-azure-redis.tf | 12 +++++++---- 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/cloud/azure/redis/README.md 
b/cloud/azure/redis/README.md index b5acaaa..d885193 100644 --- a/cloud/azure/redis/README.md +++ b/cloud/azure/redis/README.md @@ -24,6 +24,23 @@ Creates a DataDog monitors with the following checks : * Processor time (percent) threshold * Server CPU load threshold +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| client_name | Client name | string | - | yes | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | +| evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | +| percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | +| server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + Related documentation --------------------- diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index a96cc51..89385e8 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -1,10 +1,12 @@ # Global Terraform variable "client_name" { - type = "string" + description = "Client name" + type = "string" } variable "environment" { - type = "string" + description = "Architecture environment" + type = "string" } # Global DataDog @@ -14,7 +16,7 @@ variable "message" { variable "delay" { description = "Delay in seconds for the metric evaluation" - default = 600 + default 
= 600 } variable "use_filter_tags" { @@ -24,25 +26,31 @@ variable "use_filter_tags" { # Azure Redis specific variable "evictedkeys_limit_threshold_warning" { - default = 0 + description = "Evicted keys limit (warning threshold)" + default = 0 } variable "evictedkeys_limit_threshold_critical" { - default = 100 + description = "Evicted keys limit (critical threshold)" + default = 100 } variable "percent_processor_time_threshold_critical" { - default = 80 + description = "Processor time percent (critical threshold)" + default = 80 } variable "percent_processor_time_threshold_warning" { - default = 60 + description = "Processor time percent (warning threshold)" + default = 60 } variable "server_load_rate_threshold_critical" { - default = 90 + description = "Server CPU load rate (critical threshold)" + default = 90 } variable "server_load_rate_threshold_warning" { - default = 70 + description = "Server CPU load rate (warning threshold)" + default = 70 } diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index b3ad63a..8287dad 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -13,7 +13,8 @@ resource "datadog_monitor" "status" { query = < ${var.evictedkeys_limit_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { warning = "${var.evictedkeys_limit_threshold_warning}" @@ -64,7 +66,8 @@ resource "datadog_monitor" "percent_processor_time" { avg:azure.cache_redis.percent_processor_time{${data.template_file.filter.rendered}} by {name,resource_group} ) > ${var.percent_processor_time_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { warning = "${var.percent_processor_time_threshold_warning}" @@ -92,7 +95,8 @@ resource "datadog_monitor" "server_load" { avg:azure.cache_redis.server_load{${data.template_file.filter.rendered}} by {name,resource_group} ) > ${var.server_load_rate_threshold_critical} EOF - type = 
"query alert" + + type = "query alert" thresholds { warning = "${var.server_load_rate_threshold_warning}" From c624b041a42121fc631fd9dcd27497e6351fe9e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:09:46 +0100 Subject: [PATCH 50/93] MON-76: Uses the right DD tag for Azure Redis --- cloud/azure/redis/README.md | 4 ++-- cloud/azure/redis/monitors-azure-redis.tf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cloud/azure/redis/README.md b/cloud/azure/redis/README.md index d885193..8520c6b 100644 --- a/cloud/azure/redis/README.md +++ b/cloud/azure/redis/README.md @@ -44,6 +44,6 @@ Inputs Related documentation --------------------- -DataDog documentation: https://docs.datadoghq.com/integrations/azure_redis_cache/ +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_redis_cache/](https://docs.datadoghq.com/integrations/azure_redis_cache/) -Azure Redis metrics documentation: https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor +Azure Redis metrics documentation: [https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor](https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor) diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 8287dad..92652e9 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}" + filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_redis:enabled,env:%s", var.environment) : "*"}" } } From 0a4345dfa39618213c71e8200153b03fd0bf5645 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 3 Nov 2017 20:56:04 +0100 Subject: [PATCH 51/93] MON-76 add subscription_id and tags, remove client_name --- cloud/azure/redis/README.md | 7 ++++--- cloud/azure/redis/inputs.tf | 24 +++++++++++++++++------ cloud/azure/redis/monitors-azure-redis.tf | 10 +++++++++- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/cloud/azure/redis/README.md b/cloud/azure/redis/README.md index 8520c6b..45538d1 100644 --- a/cloud/azure/redis/README.md +++ b/cloud/azure/redis/README.md @@ -9,9 +9,8 @@ module "datadog-monitors-azure-redis" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/redis?ref={revision}" message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" - client_name = "${var.client_name}" + subscription_id = "${var.subscription_id}" } ``` @@ -29,7 +28,6 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| client_name | Client name | string | - | yes | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | | evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | @@ -37,9 +35,12 @@ Inputs | message | Message sent when a Redis monitor is triggered | string | - | yes | | percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | | percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| provider | What is the monitored provider | string | azure | no | | server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | | server_load_rate_threshold_warning | Server 
CPU load rate (warning threshold) | string | `70` | no | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | +| service | What is the monitored service | string | storage | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | Related documentation --------------------- diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 89385e8..7c57d63 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -1,12 +1,24 @@ # Global Terraform -variable "client_name" { - description = "Client name" - type = "string" -} - variable "environment" { description = "Architecture environment" - type = "string" + type = "string" +} + +variable "subscription_id" { + description = "Azure account id used as filter for monitors" + type = "string" +} + +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "storage" } # Global DataDog diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 92652e9..950e9a1 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_redis:enabled,env:%s", var.environment) : "*"}" + filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" } } @@ -26,6 +26,8 @@ EOF require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "evictedkeys" { @@ -55,6 +57,8 @@ EOF require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "percent_processor_time" { @@ -84,6 +88,8 @@ EOF require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "server_load" { @@ -113,4 +119,6 @@ EOF require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } From 753da1173437df811d43b65b651ccb1a98d63122 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 17:12:16 +0100 Subject: [PATCH 52/93] MON-76 Normalize monitors --- cloud/azure/redis/README.md | 9 +++----- cloud/azure/redis/inputs.tf | 26 ++++++----------------- cloud/azure/redis/monitors-azure-redis.tf | 20 ++++++++--------- 3 files changed, 20 insertions(+), 35 deletions(-) diff --git a/cloud/azure/redis/README.md b/cloud/azure/redis/README.md index 45538d1..4cd7a51 100644 --- a/cloud/azure/redis/README.md +++ b/cloud/azure/redis/README.md @@ -8,9 +8,8 @@ How to use this module module "datadog-monitors-azure-redis" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/redis?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" + message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" - subscription_id = 
"${var.subscription_id}" } ``` @@ -32,15 +31,13 @@ Inputs | environment | Architecture environment | string | - | yes | | evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | | evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | | percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | | percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | -| provider | What is the monitored provider | string | azure | no | | server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | | server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | -| service | What is the monitored service | string | storage | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | Related documentation --------------------- diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 7c57d63..49750fa 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -4,23 +4,6 @@ variable "environment" { type = "string" } -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - 
default = "storage" -} - # Global DataDog variable "message" { description = "Message sent when a Redis monitor is triggered" @@ -31,11 +14,16 @@ variable "delay" { default = 600 } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + # Azure Redis specific variable "evictedkeys_limit_threshold_warning" { description = "Evicted keys limit (warning threshold)" diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 950e9a1..57b3a6c 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_redis:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } @@ -14,9 +14,9 @@ resource "datadog_monitor" "status" { avg(last_5m):avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {name,resource_group} != 1 EOF - type = "query alert" + type = "metric alert" - notify_no_data = false + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 notify_audit = false @@ -27,7 +27,7 @@ EOF new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] } resource "datadog_monitor" "evictedkeys" { @@ -40,7 +40,7 @@ resource "datadog_monitor" "evictedkeys" { ) > ${var.evictedkeys_limit_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.evictedkeys_limit_threshold_warning}" @@ -58,7 +58,7 @@ EOF new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] } resource "datadog_monitor" "percent_processor_time" { @@ -71,7 +71,7 @@ resource "datadog_monitor" "percent_processor_time" { ) > ${var.percent_processor_time_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.percent_processor_time_threshold_warning}" @@ -89,7 +89,7 @@ EOF new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] } resource "datadog_monitor" "server_load" { @@ -102,7 +102,7 @@ resource "datadog_monitor" "server_load" { ) > ${var.server_load_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { 
warning = "${var.server_load_rate_threshold_warning}" @@ -120,5 +120,5 @@ EOF new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] } From 71d78bacdec6dd85a13fcee5c911088386aed37a Mon Sep 17 00:00:00 2001 From: Kevin Pecquet Date: Mon, 30 Oct 2017 15:48:26 +0100 Subject: [PATCH 53/93] MON-75 SQL DB monitors init --- cloud/azure/sql-database/README.md | 44 +++++++ cloud/azure/sql-database/inputs.tf | 49 ++++++++ .../monitors-sql-database-basics.tf | 109 ++++++++++++++++++ 3 files changed, 202 insertions(+) create mode 100644 cloud/azure/sql-database/README.md create mode 100644 cloud/azure/sql-database/inputs.tf create mode 100644 cloud/azure/sql-database/monitors-sql-database-basics.tf diff --git a/cloud/azure/sql-database/README.md b/cloud/azure/sql-database/README.md new file mode 100644 index 0000000..5fb0387 --- /dev/null +++ b/cloud/azure/sql-database/README.md @@ -0,0 +1,44 @@ +Azure SQL Database DataDog monitors +============================ + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-storage" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/sql-database?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + client_name = "${var.client_name}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* CPU High +* Free disk space low +* DTU Consumption high +* SQL deadlocks + +Inputs +------ + +| Name | Type | Default | Required | +|------|:----:|:-------:|:--------:| +| client_name | Client name | string | - | yes | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| message | Message sent when a monitor is 
triggered | string | - | yes | +| use_filter_tags | Filter the data with service tags if true | string | `false` | no | +| dd_azure_sqldb | string | `disabled` | yes | +| cpu_threshold_warning | string | `85` | no | +| cpu_threshold_critical | string | `90` | no | +| diskspace_threshold_warning | string | `80` | no | +| diskspace_threshold_critical | string | `90` | no | +| dtu_threshold_warning | string | `85` | no | +| dtu_threshold_critical | string | `90` | no | +| deadlock_threshold_critical | string | `1` | no | diff --git a/cloud/azure/sql-database/inputs.tf b/cloud/azure/sql-database/inputs.tf new file mode 100644 index 0000000..77599b9 --- /dev/null +++ b/cloud/azure/sql-database/inputs.tf @@ -0,0 +1,49 @@ +variable "subscription_id" { + default = "" +} + +variable "message" { + description = "Message sent when a SQL DB monitor is triggered" +} + +variable "environment" {} + +variable "use_filter_tags" { + default = "false" +} + +variable "cpu_threshold_warning" { + default = "" +} + +variable "cpu_threshold_critical" { + default = "90" +} + +variable "diskspace_threshold_warning" { + default = "80" +} + +variable "diskspace_threshold_critical" { + default = "90" +} + +variable "dtu_threshold_warning" { + default = "85" +} + +variable "dtu_threshold_critical" { + default = "90" +} + +variable "deadlock_threshold_critical" { + default = "1" +} + +variable "delay" { + default = "600" +} + +variable "dd_azure_sqldb" { + default = "disabled" +} diff --git a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf new file mode 100644 index 0000000..413e4020 --- /dev/null +++ b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -0,0 +1,109 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_sqldb:enabled,env:%s",var.environment) : "*"}" + } +} + +resource "datadog_monitor" "sql-database_cpu_90_15min" { + name = "[${var.environment}] SQL Database CPU high > 90% for 15 min on {{name}}" + message = "${message}" + + count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + + query = "avg(last_15m):avg:azure.sql_servers_databases.cpu_percent{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.cpu_threshold_critical}" + type = "query alert" + + thresholds { + critical = "${var.cpu_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "sql-database_free_space_low" { + name = "[${var.environment}] SQL Database free space < 10 % on {{name}}" + message = "${message}" + + type = "query alert" + query = "avg(last_15m):avg:azure.sql_servers_databases.storage_percent{${data.template_file.filter.rendered}} by {name,resource_group} > 90" + + count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + + thresholds { + warning = "${var.diskspace_threshold_warning}" + critical = "${var.diskspace_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "sql-database_dtu_consumption_high" { + name = "[${var.environment}] DTU Consumption on {{name}} > 90" + message = "${message}" + + type = "query alert" + query = "avg(last_15m):azure.sql_servers_databases.dtu_consumption_percent{${data.template_file.filter.rendered}} by {name,resource_group} > 90" + + count = "${var.dd_azure_sqldb == "enabled" ? 
1 : 0 }" + + thresholds { + warning = "${var.dtu_threshold_warning}" + critical = "${var.dtu_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "sql-database_deadlocks_count" { + name = "[${var.environment}] SQL Deadlocks too high on {{name}}" + message = "${message}" + + type = "query alert" + query = "sum(last_5m):avg:azure.sql_servers_databases.deadlock{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() > ${var.deadlock_threshold_critical}" + + count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + + thresholds { + critical = "${var.deadlock_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} From 220dfe019dec2656687f8eddb25a18146159cb06 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 7 Nov 2017 11:58:54 +0100 Subject: [PATCH 54/93] MON-75 Add some descriptions and add EOF on queries --- cloud/azure/sql-database/README.md | 29 +++++---- cloud/azure/sql-database/inputs.tf | 65 +++++++++++++------ .../monitors-sql-database-basics.tf | 36 ++++++---- 3 files changed, 84 insertions(+), 46 deletions(-) diff --git a/cloud/azure/sql-database/README.md b/cloud/azure/sql-database/README.md index 5fb0387..f135036 100644 --- a/cloud/azure/sql-database/README.md +++ b/cloud/azure/sql-database/README.md @@ -27,18 +27,19 @@ Creates a DataDog monitors with the following checks : Inputs ------ -| Name | Type | Default | Required | -|------|:----:|:-------:|:--------:| -| client_name | Client name | string | - | yes | +| Name | Description | Type | Default | 
Required | +|------|-------------|:----:|:-----:|:-----:| +| cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | +| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `` | no | +| deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | -| environment | Architecture environment | string | - | yes | -| message | Message sent when a monitor is triggered | string | - | yes | -| use_filter_tags | Filter the data with service tags if true | string | `false` | no | -| dd_azure_sqldb | string | `disabled` | yes | -| cpu_threshold_warning | string | `85` | no | -| cpu_threshold_critical | string | `90` | no | -| diskspace_threshold_warning | string | `80` | no | -| diskspace_threshold_critical | string | `90` | no | -| dtu_threshold_warning | string | `85` | no | -| dtu_threshold_critical | string | `90` | no | -| deadlock_threshold_critical | string | `1` | no | +| diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | +| diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | +| dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | +| dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | +| environment | Architecture Environment | string | - | yes | +| message | Message sent when an alert is triggered | string | - | yes | +| provider | Cloud provider which the monitor and its based metric depend on | string | `azure` | no | +| service | Service monitored by this set of monitors | string | `sql-database` | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | diff --git a/cloud/azure/sql-database/inputs.tf 
b/cloud/azure/sql-database/inputs.tf index 77599b9..d3abe46 100644 --- a/cloud/azure/sql-database/inputs.tf +++ b/cloud/azure/sql-database/inputs.tf @@ -1,49 +1,74 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + variable "subscription_id" { - default = "" + description = "Azure account id used as filter for monitors" + type = "string" +} + +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "sql-database" +} + +# Global DataDog +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 } variable "message" { - description = "Message sent when a SQL DB monitor is triggered" + description = "Message sent when an alert is triggered" } -variable "environment" {} - variable "use_filter_tags" { - default = "false" + description = "Filter the data with service tags if true" + default = "true" } +# Azure SQL Database specific + variable "cpu_threshold_warning" { - default = "" + description = "CPU usage in percent (warning threshold)" + default = "" } variable "cpu_threshold_critical" { - default = "90" + description = "CPU usage in percent (critical threshold)" + default = "90" } variable "diskspace_threshold_warning" { - default = "80" + description = "Disk space used in percent (warning threshold)" + default = "80" } variable "diskspace_threshold_critical" { - default = "90" + description = "Disk space used in percent (critical threshold)" + default = "90" } variable "dtu_threshold_warning" { - default = "85" + description = "Amount of DTU used (warning threshold)" + default = "85" } variable "dtu_threshold_critical" { - default = "90" + description = "Amount of DTU used (critical threshold)" + default = "90" } variable "deadlock_threshold_critical" { - default = 
"1" -} - -variable "delay" { - default = "600" -} - -variable "dd_azure_sqldb" { - default = "disabled" + description = "Amount of Deadlocks (critical threshold)" + default = "1" } diff --git a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf index 413e4020..5448b8a 100644 --- a/cloud/azure/sql-database/monitors-sql-database-basics.tf +++ b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -10,10 +10,13 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" { name = "[${var.environment}] SQL Database CPU high > 90% for 15 min on {{name}}" message = "${message}" - count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + query = < ${var.cpu_threshold_critical} + EOF - query = "avg(last_15m):avg:azure.sql_servers_databases.cpu_percent{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.cpu_threshold_critical}" - type = "query alert" + type = "query alert" thresholds { critical = "${var.cpu_threshold_critical}" @@ -35,10 +38,13 @@ resource "datadog_monitor" "sql-database_free_space_low" { name = "[${var.environment}] SQL Database free space < 10 % on {{name}}" message = "${message}" - type = "query alert" - query = "avg(last_15m):avg:azure.sql_servers_databases.storage_percent{${data.template_file.filter.rendered}} by {name,resource_group} > 90" + type = "query alert" - count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + query = < ${var.diskspace_threshold_critical} + EOF thresholds { warning = "${var.diskspace_threshold_warning}" @@ -61,10 +67,13 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" { name = "[${var.environment}] DTU Consumption on {{name}} > 90" message = "${message}" - type = "query alert" - query = "avg(last_15m):azure.sql_servers_databases.dtu_consumption_percent{${data.template_file.filter.rendered}} by {name,resource_group} > 90" + type = "query alert" - count = "${var.dd_azure_sqldb == "enabled" ? 
1 : 0 }" + query = < ${var.dtu_threshold_critical} + EOF thresholds { warning = "${var.dtu_threshold_warning}" @@ -87,10 +96,13 @@ resource "datadog_monitor" "sql-database_deadlocks_count" { name = "[${var.environment}] SQL Deadlocks too high on {{name}}" message = "${message}" - type = "query alert" - query = "sum(last_5m):avg:azure.sql_servers_databases.deadlock{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() > ${var.deadlock_threshold_critical}" + type = "query alert" - count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + query = < ${var.deadlock_threshold_critical} + EOF thresholds { critical = "${var.deadlock_threshold_critical}" From 6c5bdaa042c11f2a4217b3b3179f1001d407ea56 Mon Sep 17 00:00:00 2001 From: Marc-Antoine ADELISE Date: Mon, 30 Oct 2017 16:32:09 +0100 Subject: [PATCH 55/93] MON-74: Added first Azure App Services resources --- cloud/azure/app-services/inputs.tf | 107 ++++++++++++++++++ .../app-services/monitors-app_services.tf | 49 ++++++++ 2 files changed, 156 insertions(+) create mode 100644 cloud/azure/app-services/inputs.tf create mode 100644 cloud/azure/app-services/monitors-app_services.tf diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf new file mode 100644 index 0000000..830fcc2 --- /dev/null +++ b/cloud/azure/app-services/inputs.tf @@ -0,0 +1,107 @@ +variable "filter_tags" { + default = "*" +} + +################################### +### RESPONSE TIME VARIABLES ### +################################### +variable "response_time_appserv_eval_delay" { + default = 600 +} + +variable "response_time_critical_threshold" { + default = 0.8 + description = "Alerting threshold in seconds" +} + +variable "response_time_threshold_warning" { + default = 0.4 + description = "Warning threshold in seconds" +} + +variable "response_time_last_time_window_code" { + default = "1h" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "response_time_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "response_time_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." +} + +variable "response_time_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." +} + +variable "response_time_notify_no_data" { + default = true + description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +} + +variable "response_time_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "response_time_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." +} + +################################### +### MEMORY USAGE VARIABLES ### +################################### +variable "memory_usage_appserv_eval_delay" { + default = 600 +} + +variable "memory_usage_threshold_critical" { + default = 52430000 + description = "Alerting threshold in Mib" +} + +variable "memory_usage_threshold_warning" { + default = 33550000 + description = "Warning threshold in MiB" +} + +variable "memory_usage_last_time_window_code" { + default = "5m" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "memory_usage_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "memory_usage_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." +} + +variable "memory_usage_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." +} + +variable "memory_usage_notify_no_data" { + default = true + description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +} + +variable "memory_usage_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "memory_usage_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
+} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf new file mode 100644 index 0000000..7bf1f99 --- /dev/null +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -0,0 +1,49 @@ +# Monitoring App Services response time +resource "datadog_monitor" "appservices_reponse_time" { + name = "[${var.environment}] App Services response time {{value}}s is above ${var.reponse_time_threshold_critical}s" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.reponse_time_escalation_message}" + + query = "avg(last_${var.reponse_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.reponse_time_threshold_critical}" + + evaluation_delay = "${var.reponse_time_appserv_eval_delay}" + + thresholds { + warning = "${var.reponse_time_threshold_warning}" + critical = "${var.reponse_time_threshold_critical}" + } + + notify_no_data = "${var.reponse_time_notify_no_data}" + renotify_interval = "${var.reponse_time_renotify_interval}" + + timeout_h = "${var.reponse_time_timeout_h}" + include_tags = "${var.reponse_time_include_tags}" + + tags = "${var.reponse_time_tags}" +} + +# Monitoring App Services memory usage +resource "datadog_monitor" "appservices_memory_usage" { + name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.memory_usage_escalation_message}" + + query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{*} >= ${var.memory_usage_threshold_critical}" + + evaluation_delay = "${var.memory_usage_appserv_eval_delay}" + + thresholds { + 
warning = "${var.memory_usage_threshold_warning}" + critical = "${var.memory_usage_threshold_critical}" + } + + notify_no_data = "${var.memory_usage_notify_no_data}" + renotify_interval = "${var.memory_usage_renotify_interval}" + + timeout_h = "${var.memory_usage_timeout_h}" + include_tags = "${var.memory_usage_include_tags}" + + tags = "${var.memory_usage_tags}" +} From 81df985f3297bcf3e993fef42f3a98146339bce0 Mon Sep 17 00:00:00 2001 From: Marc-Antoine ADELISE Date: Tue, 31 Oct 2017 10:08:19 +0100 Subject: [PATCH 56/93] MON-74: Response time, memory usage, http 404 status code and non 2xx http response status code percentage monitoring. --- cloud/azure/app-services/inputs.tf | 158 ++++++++++++++++-- .../app-services/monitors-app_services.tf | 81 +++++++-- 2 files changed, 215 insertions(+), 24 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 830fcc2..666a394 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -1,7 +1,13 @@ -variable "filter_tags" { +variable "environment" {} + +variable "use_filter_tags" { default = "*" } +variable "critical_escalation_group" { + default = "HO_Dummy" +} + ################################### ### RESPONSE TIME VARIABLES ### ################################### @@ -9,7 +15,7 @@ variable "response_time_appserv_eval_delay" { default = 600 } -variable "response_time_critical_threshold" { +variable "response_time_threshold_critical" { default = 0.8 description = "Alerting threshold in seconds" } @@ -24,6 +30,11 @@ variable "response_time_last_time_window_code" { description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } +variable "response_time_require_full_window" { + default = false + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + variable "response_time_tags" { default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" @@ -39,10 +50,10 @@ variable "response_time_include_tags" { description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } -variable "response_time_notify_no_data" { - default = true - description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -} +# variable "response_time_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } variable "response_time_renotify_interval" { default = 0 @@ -76,6 +87,11 @@ variable "memory_usage_last_time_window_code" { description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } +variable "memory_usage_require_full_window" { + default = false + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + variable "memory_usage_tags" { default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API" @@ -91,10 +107,10 @@ variable "memory_usage_include_tags" { description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } -variable "memory_usage_notify_no_data" { - default = true - description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -} +# variable "memory_usage_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } variable "memory_usage_renotify_interval" { default = 0 @@ -105,3 +121,125 @@ variable "memory_usage_escalation_message" { default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." } + +################################# +### HTTP 404 status pages ### +################################# +variable "http_404_errors_count_rate_limit" { + default = 30 +} + +variable "http_404_errors_count_rate_appserv_eval_delay" { + default = 600 +} + +variable "http_404_errors_count_rate_threshold_critical" { + default = 30 + description = "Alerting threshold (number of requests)" +} + +variable "http_404_errors_count_rate_threshold_warning" { + default = 10 + description = "Warning threshold (number of requests)" +} + +variable "http_404_errors_count_rate_last_time_window_code" { + default = "5m" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "http_404_errors_count_rate_require_full_window" { + default = true + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + +variable "http_404_errors_count_rate_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "http_404_errors_count_rate_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." +} + +variable "http_404_errors_count_rate_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." +} + +# variable "http_404_errors_count_rate_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } + +variable "http_404_errors_count_rate_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "http_404_errors_count_rate_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
+} + +################################# +### HTTP 202 status pages ### +################################# +variable "http_2xx_status_rate_limit" { + default = 30 +} + +variable "http_2xx_status_rate_appserv_eval_delay" { + default = 600 +} + +variable "http_2xx_status_rate_threshold_critical" { + default = 0.9 + description = "Alerting threshold (percentage)" +} + +variable "http_2xx_status_rate_threshold_warning" { + default = 0.95 + description = "Warning threshold (percentage)" +} + +variable "http_2xx_status_rate_last_time_window_code" { + default = "5m" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "http_2xx_status_rate_require_full_window" { + default = true + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + +variable "http_2xx_status_rate_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "http_2xx_status_rate_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." +} + +variable "http_2xx_status_rate_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." 
+} + +# variable "http_2xx_status_rate_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } + +variable "http_2xx_status_rate_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "http_2xx_status_rate_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." +} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 7bf1f99..892b2c4 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -1,30 +1,31 @@ # Monitoring App Services response time -resource "datadog_monitor" "appservices_reponse_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.reponse_time_threshold_critical}s" +resource "datadog_monitor" "appservices_response_time" { + name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" type = "query alert" message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" - escalation_message = "${var.reponse_time_escalation_message}" + escalation_message = "${var.response_time_escalation_message}" - query = "avg(last_${var.reponse_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.reponse_time_threshold_critical}" + query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.response_time_threshold_critical}" - evaluation_delay = "${var.reponse_time_appserv_eval_delay}" + 
evaluation_delay = "${var.response_time_appserv_eval_delay}" thresholds { - warning = "${var.reponse_time_threshold_warning}" - critical = "${var.reponse_time_threshold_critical}" + warning = "${var.response_time_threshold_warning}" + critical = "${var.response_time_threshold_critical}" } - notify_no_data = "${var.reponse_time_notify_no_data}" - renotify_interval = "${var.reponse_time_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.response_time_renotify_interval}" + require_full_window = "${var.response_time_require_full_window}" - timeout_h = "${var.reponse_time_timeout_h}" - include_tags = "${var.reponse_time_include_tags}" + timeout_h = "${var.response_time_timeout_h}" + include_tags = "${var.response_time_include_tags}" - tags = "${var.reponse_time_tags}" + tags = "${var.response_time_tags}" } # Monitoring App Services memory usage -resource "datadog_monitor" "appservices_memory_usage" { +resource "datadog_monitor" "appservices_memory_usage_count" { name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" type = "query alert" message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" @@ -39,11 +40,63 @@ resource "datadog_monitor" "appservices_memory_usage" { critical = "${var.memory_usage_threshold_critical}" } - notify_no_data = "${var.memory_usage_notify_no_data}" + notify_no_data = true # Will notify when no data is received renotify_interval = "${var.memory_usage_renotify_interval}" + require_full_window = "${var.memory_usage_require_full_window}" timeout_h = "${var.memory_usage_timeout_h}" include_tags = "${var.memory_usage_include_tags}" tags = "${var.memory_usage_tags}" } + +# Monitoring App Services 404 errors rate +resource "datadog_monitor" "appservices_http_404_errors_count" { + name = "[${var.environment}] App Services {{value}} 
HTTP errors > ${var.http_404_errors_count_rate_limit} limit" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.http_404_errors_count_rate_escalation_message}" + + query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{*}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" + + evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" + + thresholds { + warning = "${var.http_404_errors_count_rate_threshold_warning}" + critical = "${var.http_404_errors_count_rate_threshold_critical}" + } + + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" + require_full_window = true + + timeout_h = "${var.http_404_errors_count_rate_timeout_h}" + include_tags = "${var.http_404_errors_count_rate_include_tags}" + + tags = "${var.http_404_errors_count_rate_tags}" +} + +# Monitoring App Services HTTP 2xx status pages rate +resource "datadog_monitor" "appservices_http_2xx_status_rate" { + name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.http_2xx_status_rate_escalation_message}" + + query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" + evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}" + + thresholds { + warning = "${var.http_2xx_status_rate_threshold_warning}" + critical = "${var.http_2xx_status_rate_threshold_critical}" + } + + notify_no_data = true 
# Will notify when no data is received + renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" + require_full_window = true + + timeout_h = "${var.http_2xx_status_rate_timeout_h}" + include_tags = "${var.http_2xx_status_rate_include_tags}" + + tags = "${var.http_2xx_status_rate_tags}" +} From 58bbe0bc7bd08c92c26719b839d185f2e682c54f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:18:52 +0100 Subject: [PATCH 57/93] MON-74: fmt --- cloud/azure/app-services/inputs.tf | 72 +++++++++---------- .../app-services/monitors-app_services.tf | 18 ++--- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 666a394..dc26017 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -16,37 +16,37 @@ variable "response_time_appserv_eval_delay" { } variable "response_time_threshold_critical" { - default = 0.8 + default = 0.8 description = "Alerting threshold in seconds" } variable "response_time_threshold_warning" { - default = 0.4 + default = 0.4 description = "Warning threshold in seconds" } variable "response_time_last_time_window_code" { - default = "1h" + default = "1h" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "response_time_require_full_window" { - default = false + default = false description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "response_time_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. 
This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" } variable "response_time_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "response_time_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } @@ -56,12 +56,12 @@ variable "response_time_include_tags" { # } variable "response_time_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "response_time_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." } @@ -73,37 +73,37 @@ variable "memory_usage_appserv_eval_delay" { } variable "memory_usage_threshold_critical" { - default = 52430000 + default = 52430000 description = "Alerting threshold in Mib" } variable "memory_usage_threshold_warning" { - default = 33550000 + default = 33550000 description = "Warning threshold in MiB" } variable "memory_usage_last_time_window_code" { - default = "5m" + default = "5m" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "memory_usage_require_full_window" { - default = false + default = false description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "memory_usage_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" } variable "memory_usage_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "memory_usage_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } @@ -113,12 +113,12 @@ variable "memory_usage_include_tags" { # } variable "memory_usage_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "memory_usage_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
} @@ -134,37 +134,37 @@ variable "http_404_errors_count_rate_appserv_eval_delay" { } variable "http_404_errors_count_rate_threshold_critical" { - default = 30 + default = 30 description = "Alerting threshold (number of requests)" } variable "http_404_errors_count_rate_threshold_warning" { - default = 10 + default = 10 description = "Warning threshold (number of requests)" } variable "http_404_errors_count_rate_last_time_window_code" { - default = "5m" + default = "5m" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "http_404_errors_count_rate_require_full_window" { - default = true + default = true description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "http_404_errors_count_rate_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" } variable "http_404_errors_count_rate_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "http_404_errors_count_rate_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." 
} @@ -174,12 +174,12 @@ variable "http_404_errors_count_rate_include_tags" { # } variable "http_404_errors_count_rate_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "http_404_errors_count_rate_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." } @@ -195,37 +195,37 @@ variable "http_2xx_status_rate_appserv_eval_delay" { } variable "http_2xx_status_rate_threshold_critical" { - default = 0.9 + default = 0.9 description = "Alerting threshold (percentage)" } variable "http_2xx_status_rate_threshold_warning" { - default = 0.95 + default = 0.95 description = "Warning threshold (percentage)" } variable "http_2xx_status_rate_last_time_window_code" { - default = "5m" + default = "5m" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "http_2xx_status_rate_require_full_window" { - default = true + default = true description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "http_2xx_status_rate_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API" } variable "http_2xx_status_rate_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "http_2xx_status_rate_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } @@ -235,11 +235,11 @@ variable "http_2xx_status_rate_include_tags" { # } variable "http_2xx_status_rate_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "http_2xx_status_rate_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 892b2c4..3e5f94a 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -14,8 +14,8 @@ resource "datadog_monitor" "appservices_response_time" { critical = "${var.response_time_threshold_critical}" } - notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.response_time_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.response_time_renotify_interval}" require_full_window = "${var.response_time_require_full_window}" timeout_h = "${var.response_time_timeout_h}" @@ -40,8 +40,8 @@ resource "datadog_monitor" "appservices_memory_usage_count" { critical = "${var.memory_usage_threshold_critical}" } - notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.memory_usage_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.memory_usage_renotify_interval}" require_full_window = "${var.memory_usage_require_full_window}" timeout_h = "${var.memory_usage_timeout_h}" @@ -66,8 +66,8 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { critical = "${var.http_404_errors_count_rate_threshold_critical}" } - notify_no_data = false # Will NOT notify when no data is received - renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" require_full_window = true timeout_h = "${var.http_404_errors_count_rate_timeout_h}" @@ -83,7 +83,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" 
escalation_message = "${var.http_2xx_status_rate_escalation_message}" - query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" + query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}" thresholds { @@ -91,8 +91,8 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { critical = "${var.http_2xx_status_rate_threshold_critical}" } - notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" require_full_window = true timeout_h = "${var.http_2xx_status_rate_timeout_h}" From 4c9bc13de0ae6365d94a3a3d311a8f3339b5bd09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:22:18 +0100 Subject: [PATCH 58/93] MON-74: Use filter tags option --- cloud/azure/app-services/inputs.tf | 3 ++- .../app-services/monitors-app_services.tf | 19 ++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index dc26017..8af09cb 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -1,7 +1,8 @@ variable "environment" {} variable "use_filter_tags" { - default = "*" + description = "Filter the data with service tags if true" + default = "true" } variable "critical_escalation_group" { diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 3e5f94a..48b8184 100644 --- 
a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -1,3 +1,11 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,env:%s", var.environment) : "*"}" + } +} + # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" @@ -5,7 +13,7 @@ resource "datadog_monitor" "appservices_response_time" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.response_time_escalation_message}" - query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.response_time_threshold_critical}" + query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} >= ${var.response_time_threshold_critical}" evaluation_delay = "${var.response_time_appserv_eval_delay}" @@ -31,7 +39,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.memory_usage_escalation_message}" - query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{*} >= ${var.memory_usage_threshold_critical}" + query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} >= ${var.memory_usage_threshold_critical}" evaluation_delay = "${var.memory_usage_appserv_eval_delay}" @@ -57,7 +65,7 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { 
message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.http_404_errors_count_rate_escalation_message}" - query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{*}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" + query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" @@ -83,7 +91,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.http_2xx_status_rate_escalation_message}" - query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" + query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() / avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}" thresholds { @@ -91,7 +99,8 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { critical = "${var.http_2xx_status_rate_threshold_critical}" } - notify_no_data = true # Will notify when no data is received + # Will notify when no data is received + notify_no_data = true renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" require_full_window = true From ac96ee6586a2800c13dde2b25ac456c5d695d15b Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:24:08 +0100 Subject: [PATCH 59/93] MON-74: Uses generic message parameter --- cloud/azure/app-services/inputs.tf | 12 ++++++++---- cloud/azure/app-services/monitors-app_services.tf | 8 ++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 8af09cb..4ad908b 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -1,19 +1,23 @@ -variable "environment" {} +variable "environment" { + description = "Architecture environment" + type = "string" +} variable "use_filter_tags" { description = "Filter the data with service tags if true" default = "true" } -variable "critical_escalation_group" { - default = "HO_Dummy" +variable "message" { + description = "Message sent when a monitor is triggered" } ################################### ### RESPONSE TIME VARIABLES ### ################################### variable "response_time_appserv_eval_delay" { - default = 600 + description = "Delay in seconds for the metric evaluation" + default = 600 } variable "response_time_threshold_critical" { diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 48b8184..9447cb4 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -10,7 +10,7 @@ data "template_file" "filter" { resource "datadog_monitor" "appservices_response_time" { name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" type = "query alert" - message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.response_time_escalation_message}" query = 
"avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} >= ${var.response_time_threshold_critical}" @@ -36,7 +36,7 @@ resource "datadog_monitor" "appservices_response_time" { resource "datadog_monitor" "appservices_memory_usage_count" { name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" type = "query alert" - message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.memory_usage_escalation_message}" query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} >= ${var.memory_usage_threshold_critical}" @@ -62,7 +62,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { resource "datadog_monitor" "appservices_http_404_errors_count" { name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" type = "query alert" - message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.http_404_errors_count_rate_escalation_message}" query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" @@ -88,7 +88,7 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { resource "datadog_monitor" "appservices_http_2xx_status_rate" { name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" type = "query alert" - message = 
"{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.http_2xx_status_rate_escalation_message}" query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() / avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" From 31e036a8055c1404cb6b74808a701652fef42c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:27:09 +0100 Subject: [PATCH 60/93] MON-74: Readme --- cloud/azure/app-services/README.md | 83 ++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 cloud/azure/app-services/README.md diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md new file mode 100644 index 0000000..443c819 --- /dev/null +++ b/cloud/azure/app-services/README.md @@ -0,0 +1,83 @@ +Azure AppServices (Web, API, Functions) DataDog monitors +======================================================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-app-services" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + client_name = "${var.client_name}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Response time +* Memory usage count +* HTTP 404 errors +* HTTP 50x errors +* HTTP 20x rate + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| environment | Architecture environment | string | - | yes | +| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | +| 
http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_limit | ################################ ## HTTP 202 status pages ### ################################ | string | `30` | no | +| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | +| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | +| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. 
| string | `false` | no | +| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | +| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_404_errors_count_rate_limit | ################################ ## HTTP 404 status pages ### ################################ | string | `30` | no | +| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | +| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_appserv_eval_delay | ################################## ## MEMORY USAGE VARIABLES ### ################################## | string | `600` | no | +| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| memory_usage_tags | A list of tags to associate with your monitor. 
This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | +| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | +| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | +| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| response_time_tags | A list of tags to associate with your monitor. 
This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | +| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + + +Related documentation +--------------------- + +DataDog documentation: https://docs.datadoghq.com/integrations/azure_app_services From 98f5b6f331f381b9c7300f12036ac34320d0a718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:31:55 +0100 Subject: [PATCH 61/93] MON-74: Readme update --- cloud/azure/app-services/README.md | 165 ++++++++++++++--------------- cloud/azure/app-services/inputs.tf | 4 + 2 files changed, 86 insertions(+), 83 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index 443c819..d8a02c7 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -1,83 +1,82 @@ -Azure AppServices (Web, API, Functions) DataDog monitors -======================================================== - -How to use this module ----------------------- - -``` -module "datadog-monitors-azure-app-services" { - source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" - - message = "${module.datadog-message-alerting.alerting-message}" - - environment = "${var.environment}" - client_name = "${var.client_name}" -} -``` - -Purpose -------- -Creates a DataDog monitors with the following checks : - -* Response time -* Memory usage count -* HTTP 404 errors -* 
HTTP 50x errors -* HTTP 20x rate - -Inputs ------- - -| Name | Description | Type | Default | Required | -|------|-------------|:----:|:-----:|:-----:| -| environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | -| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| http_2xx_status_rate_limit | ################################ ## HTTP 202 status pages ### ################################ | string | `30` | no | -| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | -| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | -| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| http_404_errors_count_rate_limit | ################################ ## HTTP 404 status pages ### ################################ | string | `30` | no | -| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. 
| string | `true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | -| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_appserv_eval_delay | ################################## ## MEMORY USAGE VARIABLES ### ################################## | string | `600` | no | -| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. 
False otherwise. | string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | -| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| message | Message sent when a monitor is triggered | string | - | yes | -| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | -| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. 
| string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | -| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | - - -Related documentation ---------------------- - -DataDog documentation: https://docs.datadoghq.com/integrations/azure_app_services +Azure AppServices (Web, API, Functions) DataDog monitors +======================================================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-app-services" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + client_name = "${var.client_name}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Response time +* Memory usage count +* HTTP 404 errors +* HTTP 50x errors +* HTTP 20x rate + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| environment | Architecture environment | string | - | yes | +| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | +| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. 
| string | `Escalation message @pagerduty` | no | +| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_limit | | string | `30` | no | +| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | +| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | +| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | +| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. 
| string | `Escalation message @pagerduty` | no | +| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_404_errors_count_rate_limit | | string | `30` | no | +| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | +| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_appserv_eval_delay | | string | `600` | no | +| memory_usage_escalation_message | A message to include with a re-notification. 
Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | +| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | +| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| response_time_escalation_message | A message to include with a re-notification. 
Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | +| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | +| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. 
| string | `false` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_app_services](https://docs.datadoghq.com/integrations/azure_app_services) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 4ad908b..4f2a693 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -15,6 +15,7 @@ variable "message" { ################################### ### RESPONSE TIME VARIABLES ### ################################### + variable "response_time_appserv_eval_delay" { description = "Delay in seconds for the metric evaluation" default = 600 @@ -73,6 +74,7 @@ variable "response_time_escalation_message" { ################################### ### MEMORY USAGE VARIABLES ### ################################### + variable "memory_usage_appserv_eval_delay" { default = 600 } @@ -130,6 +132,7 @@ variable "memory_usage_escalation_message" { ################################# ### HTTP 404 status pages ### ################################# + variable "http_404_errors_count_rate_limit" { default = 30 } @@ -191,6 +194,7 @@ variable "http_404_errors_count_rate_escalation_message" { ################################# ### HTTP 202 status pages ### ################################# + variable "http_2xx_status_rate_limit" { default = 30 } From dc06fb9519175c55c6d12b60ceea20a71ac4af0e Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 15:28:41 +0100 Subject: [PATCH 62/93] MON-74 Add EOF on querys --- .../app-services/monitors-app_services.tf | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 9447cb4..c42ad6c 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ 
b/cloud/azure/app-services/monitors-app_services.tf @@ -13,7 +13,11 @@ resource "datadog_monitor" "appservices_response_time" { message = "${var.message}" escalation_message = "${var.response_time_escalation_message}" - query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} >= ${var.response_time_threshold_critical}" + query = <= ${var.response_time_threshold_critical} + EOF evaluation_delay = "${var.response_time_appserv_eval_delay}" @@ -39,7 +43,11 @@ resource "datadog_monitor" "appservices_memory_usage_count" { message = "${var.message}" escalation_message = "${var.memory_usage_escalation_message}" - query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} >= ${var.memory_usage_threshold_critical}" + query = <= ${var.memory_usage_threshold_critical} + EOF evaluation_delay = "${var.memory_usage_appserv_eval_delay}" @@ -65,7 +73,11 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { message = "${var.message}" escalation_message = "${var.http_404_errors_count_rate_escalation_message}" - query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" + query = < ${var.http_404_errors_count_rate_threshold_critical} + EOF evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" @@ -91,7 +103,13 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "${var.message}" escalation_message = "${var.http_2xx_status_rate_escalation_message}" - query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() / avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() < 
${var.http_2xx_status_rate_threshold_critical}" + query = < Date: Thu, 2 Nov 2017 16:54:18 +0100 Subject: [PATCH 63/93] MON-74 Fix changes to fit as the other modules --- cloud/azure/app-services/README.md | 80 +++++++++-------- cloud/azure/app-services/inputs.tf | 86 +++---------------- .../app-services/monitors-app_services.tf | 60 ++++++------- 3 files changed, 86 insertions(+), 140 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index d8a02c7..90f5882 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -28,52 +28,64 @@ Creates a DataDog monitors with the following checks : Inputs ------ -| Name | Description | Type | Default | Required | -|------|-------------|:----:|:-----:|:-----:| +| Name | Description | Type | Default | Required | DESKTOP-0PBDRFR: ~ +|------|-------------|:----:|:-----:|:-----:| → +| client_name | Client Name | string | - | yes | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | -| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write last_# +m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data bef +ore it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be s +kipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` +| no | +| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter m +onitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying + via the API | string | `` | no | | http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | | http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | -| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically +resolve from a triggered state. Defaults to false. | string | `false` | no | +| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write +last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. 
| string | `0` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of da +ta before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations wil +l be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | ` +true` | no | +| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and fi +lter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when qu +erying via the API | string | `` | no | | http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | | http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_appserv_eval_delay | | string | `600` | no | -| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. 
| string | `Escalation message @pagerduty` | no | -| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automati +cally resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, + 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's + evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. +Default: True for 'on average', 'at all times' and 'in total' aggregation. 
False otherwise. | string | `false` | no | +| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors +in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the + API | string | `` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve +from a triggered state. Defaults to false. | string | `false` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | -| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5 +, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | +| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it' +s evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. + Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors + in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via th +e API | string | `` | no | | response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | | response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve + from a triggered state. Defaults to false. 
| string | `false` | no | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 4f2a693..5f0f2b0 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -3,6 +3,11 @@ variable "environment" { type = "string" } +variable "client_name" { + description = "Client Name" + type = "string" +} + variable "use_filter_tags" { description = "Filter the data with service tags if true" default = "true" @@ -12,15 +17,15 @@ variable "message" { description = "Message sent when a monitor is triggered" } -################################### -### RESPONSE TIME VARIABLES ### -################################### - -variable "response_time_appserv_eval_delay" { +variable "delay" { description = "Delay in seconds for the metric evaluation" default = 600 } +################################### +### RESPONSE TIME VARIABLES ### +################################### + variable "response_time_threshold_critical" { default = 0.8 description = "Alerting threshold in seconds" @@ -51,34 +56,15 @@ variable "response_time_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "response_time_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "response_time_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "response_time_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." 
-} - -variable "response_time_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." -} - ################################### ### MEMORY USAGE VARIABLES ### ################################### -variable "memory_usage_appserv_eval_delay" { - default = 600 -} - variable "memory_usage_threshold_critical" { default = 52430000 description = "Alerting threshold in Mib" @@ -109,26 +95,11 @@ variable "memory_usage_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "memory_usage_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "memory_usage_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "memory_usage_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "memory_usage_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} - ################################# ### HTTP 404 status pages ### ################################# @@ -137,10 +108,6 @@ variable "http_404_errors_count_rate_limit" { default = 30 } -variable "http_404_errors_count_rate_appserv_eval_delay" { - default = 600 -} - variable "http_404_errors_count_rate_threshold_critical" { default = 30 description = "Alerting threshold (number of requests)" @@ -171,26 +138,11 @@ variable "http_404_errors_count_rate_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "http_404_errors_count_rate_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "http_404_errors_count_rate_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "http_404_errors_count_rate_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "http_404_errors_count_rate_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} - ################################# ### HTTP 202 status pages ### ################################# @@ -199,10 +151,6 @@ variable "http_2xx_status_rate_limit" { default = 30 } -variable "http_2xx_status_rate_appserv_eval_delay" { - default = 600 -} - variable "http_2xx_status_rate_threshold_critical" { default = 0.9 description = "Alerting threshold (percentage)" @@ -233,22 +181,8 @@ variable "http_2xx_status_rate_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "http_2xx_status_rate_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "http_2xx_status_rate_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "http_2xx_status_rate_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "http_2xx_status_rate_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index c42ad6c..437b7fb 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -8,10 +8,9 @@ data "template_file" "filter" { # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.response_time_escalation_message}" + name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" + type = "query alert" + message = "${var.message}" query = <= ${var.response_time_threshold_critical} EOF - evaluation_delay = "${var.response_time_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.response_time_threshold_warning}" @@ -27,21 +27,20 @@ resource "datadog_monitor" "appservices_response_time" { } notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.response_time_renotify_interval}" + renotify_interval = 0 require_full_window = "${var.response_time_require_full_window}" timeout_h = "${var.response_time_timeout_h}" - include_tags = "${var.response_time_include_tags}" + include_tags = true tags = "${var.response_time_tags}" } # Monitoring App Services memory usage resource "datadog_monitor" "appservices_memory_usage_count" { - name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.memory_usage_escalation_message}" + name = "[${var.environment}] App Services memory usage {{value}} bytes is above 
${ceil(var.memory_usage_threshold_critical/1000000)}MiB" + type = "query alert" + message = "${var.message}" query = <= ${var.memory_usage_threshold_critical} EOF - evaluation_delay = "${var.memory_usage_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.memory_usage_threshold_warning}" @@ -57,21 +57,20 @@ resource "datadog_monitor" "appservices_memory_usage_count" { } notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.memory_usage_renotify_interval}" + renotify_interval = 0 require_full_window = "${var.memory_usage_require_full_window}" timeout_h = "${var.memory_usage_timeout_h}" - include_tags = "${var.memory_usage_include_tags}" + include_tags = true tags = "${var.memory_usage_tags}" } # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.http_404_errors_count_rate_escalation_message}" + name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" + type = "query alert" + message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} EOF - evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.http_404_errors_count_rate_threshold_warning}" critical = "${var.http_404_errors_count_rate_threshold_critical}" } - notify_no_data = false # Will NOT notify when no data is received - renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 require_full_window = true timeout_h = 
"${var.http_404_errors_count_rate_timeout_h}" - include_tags = "${var.http_404_errors_count_rate_include_tags}" + include_tags = true tags = "${var.http_404_errors_count_rate_tags}" } # Monitoring App Services HTTP 2xx status pages rate resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.http_2xx_status_rate_escalation_message}" + name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" + type = "query alert" + message = "${var.message}" - query = < Date: Thu, 23 Nov 2017 17:52:01 +0100 Subject: [PATCH 64/93] MON-74 Normalize monitors --- cloud/azure/app-services/README.md | 54 +-------- cloud/azure/app-services/inputs.tf | 113 +----------------- .../app-services/monitors-app_services.tf | 63 +++++----- 3 files changed, 40 insertions(+), 190 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index 90f5882..e56fac2 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -8,10 +8,8 @@ How to use this module module "datadog-monitors-azure-app-services" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" - + message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" - client_name = "${var.client_name}" } ``` @@ -28,65 +26,23 @@ Creates a DataDog monitors with the following checks : Inputs ------ -| Name | Description | Type | Default | Required | DESKTOP-0PBDRFR: ~ -|------|-------------|:----:|:-----:|:-----:| → -| client_name | Client Name | string | - | yes | +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| | 
delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_# -m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data bef -ore it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be s -kipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` -| no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter m -onitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying - via the API | string | `` | no | | http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | | http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically -resolve from a triggered state. Defaults to false. | string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write -last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of da -ta before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations wil -l be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | ` -true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and fi -lter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when qu -erying via the API | string | `` | no | | http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | | http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automati -cally resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, - 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's - evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. -Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors -in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the - API | string | `` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve -from a triggered state. Defaults to false. | string | `false` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5 -, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it' -s evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. - Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors - in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via th -e API | string | `` | no | | response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | | response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve - from a triggered state. Defaults to false. 
| string | `false` | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation --------------------- diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 5f0f2b0..c4bc451 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -3,14 +3,14 @@ variable "environment" { type = "string" } -variable "client_name" { - description = "Client Name" - type = "string" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" } variable "message" { @@ -36,31 +36,6 @@ variable "response_time_threshold_warning" { description = "Warning threshold in seconds" } -variable "response_time_last_time_window_code" { - default = "1h" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "response_time_require_full_window" { - default = false - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "response_time_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "response_time_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "response_time_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - ################################### ### MEMORY USAGE VARIABLES ### ################################### @@ -75,31 +50,6 @@ variable "memory_usage_threshold_warning" { description = "Warning threshold in MiB" } -variable "memory_usage_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "memory_usage_require_full_window" { - default = false - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "memory_usage_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "memory_usage_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "memory_usage_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." 
-# } - ################################# ### HTTP 404 status pages ### ################################# @@ -118,31 +68,6 @@ variable "http_404_errors_count_rate_threshold_warning" { description = "Warning threshold (number of requests)" } -variable "http_404_errors_count_rate_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "http_404_errors_count_rate_require_full_window" { - default = true - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "http_404_errors_count_rate_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "http_404_errors_count_rate_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "http_404_errors_count_rate_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - ################################# ### HTTP 202 status pages ### ################################# @@ -160,29 +85,3 @@ variable "http_2xx_status_rate_threshold_warning" { default = 0.95 description = "Warning threshold (percentage)" } - -variable "http_2xx_status_rate_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "http_2xx_status_rate_require_full_window" { - default = true - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "http_2xx_status_rate_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "http_2xx_status_rate_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "http_2xx_status_rate_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 437b7fb..1cff1af 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -2,18 +2,18 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,env:%s", var.environment) : "*"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" - type = "query alert" + name = "[${var.environment}] App Services response time > ${var.response_time_threshold_critical}s on {{name}}" + type = "metric alert" message = "${var.message}" query = <= ${var.response_time_threshold_critical} EOF @@ -26,24 +26,23 @@ resource "datadog_monitor" "appservices_response_time" { critical = "${var.response_time_threshold_critical}" } - notify_no_data = true # Will notify when no data is received + notify_no_data = true # Will notify when no data is received renotify_interval = 0 - require_full_window = "${var.response_time_require_full_window}" + require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h = "${var.response_time_timeout_h}" - include_tags = true - - tags = "${var.response_time_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services memory usage resource "datadog_monitor" "appservices_memory_usage_count" { - name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" - type = "query alert" + name = "[${var.environment}] App Services memory usage > ${ceil(var.memory_usage_threshold_critical/1000000)}MiB on {{name}}" + type = "metric alert" message = "${var.message}" query = <= ${var.memory_usage_threshold_critical} EOF @@ -58,22 +57,21 @@ resource "datadog_monitor" "appservices_memory_usage_count" { notify_no_data = true # Will notify when no data is received renotify_interval = 0 - require_full_window = "${var.memory_usage_require_full_window}" + require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h 
= "${var.memory_usage_timeout_h}" - include_tags = true - - tags = "${var.memory_usage_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" - type = "query alert" + name = "[${var.environment}] App Services HTTP errors > ${var.http_404_errors_count_rate_limit} limit on {{name}}" + type = "metric alert" message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} EOF @@ -89,21 +87,20 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { notify_no_data = false # Will NOT notify when no data is received renotify_interval = 0 require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h = "${var.http_404_errors_count_rate_timeout_h}" - include_tags = true - - tags = "${var.http_404_errors_count_rate_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services HTTP 2xx status pages rate resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" - type = "query alert" + name = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}" + type = "metric alert" message = "${var.message}" query = < Date: Thu, 23 Nov 2017 17:34:30 +0100 Subject: [PATCH 65/93] MON-75 Normalize monitors --- cloud/azure/sql-database/README.md | 20 ++++++----- cloud/azure/sql-database/inputs.tf | 26 ++++---------- .../monitors-sql-database-basics.tf | 34 ++++++++++++------- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/cloud/azure/sql-database/README.md b/cloud/azure/sql-database/README.md index f135036..7d815e3 100644 --- 
a/cloud/azure/sql-database/README.md +++ b/cloud/azure/sql-database/README.md @@ -1,5 +1,5 @@ Azure SQL Database DataDog monitors -============================ +=================================== How to use this module ---------------------- @@ -8,10 +8,8 @@ How to use this module module "datadog-monitors-azure-storage" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/sql-database?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" - + message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" - client_name = "${var.client_name}" } ``` @@ -38,8 +36,14 @@ Inputs | dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | | dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | | environment | Architecture Environment | string | - | yes | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | -| provider | Cloud provider which the monitor and its based metric depend on | string | `azure` | no | -| service | Service monitored by this set of monitors | string | `sql-database` | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_sql_database/](https://docs.datadoghq.com/integrations/azure_sql_database/) + +Azure SQL Database metrics documentation: [https://docs.microsoft.com/en-us/azure/sql-database/saas-dbpertenant-log-analytics](https://docs.microsoft.com/en-us/azure/sql-database/saas-dbpertenant-log-analytics) + diff --git 
a/cloud/azure/sql-database/inputs.tf b/cloud/azure/sql-database/inputs.tf index d3abe46..9ddab06 100644 --- a/cloud/azure/sql-database/inputs.tf +++ b/cloud/azure/sql-database/inputs.tf @@ -4,23 +4,6 @@ variable "environment" { type = "string" } -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - default = "sql-database" -} - # Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" @@ -31,11 +14,16 @@ variable "message" { description = "Message sent when an alert is triggered" } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + # Azure SQL Database specific variable "cpu_threshold_warning" { diff --git a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf index 5448b8a..1e75813 100644 --- a/cloud/azure/sql-database/monitors-sql-database-basics.tf +++ b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -2,13 +2,13 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_sqldb:enabled,env:%s",var.environment) : "*"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_sqldatabase:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } resource "datadog_monitor" "sql-database_cpu_90_15min" { - name = "[${var.environment}] SQL Database CPU high > 90% for 15 min on {{name}}" - message = "${message}" + name = "[${var.environment}] SQL Database CPU high > ${var.cpu_threshold_critical}% on {{name}}" + message = "${var.message}" query = < ${var.cpu_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { critical = "${var.cpu_threshold_critical}" @@ -32,13 +32,15 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:sqldatabase", "team:azure", "provider:azure"] } resource "datadog_monitor" "sql-database_free_space_low" { - name = "[${var.environment}] SQL Database free space < 10 % on {{name}}" - message = "${message}" + name = "[${var.environment}] SQL Database free space < ${var.diskspace_threshold_critical}% on {{name}}" + message = "${var.message}" - type = "query alert" + type = "metric alert" query = < Date: Fri, 24 Nov 2017 16:53:25 +0100 Subject: [PATCH 66/93] MON-90 Azure API Management monitors --- cloud/azure/apimanagement/README.md | 43 +++++ cloud/azure/apimanagement/inputs.tf | 46 +++++ .../monitors-azure-apimanagement.tf | 160 ++++++++++++++++++ 3 files changed, 249 insertions(+) create mode 100644 cloud/azure/apimanagement/README.md create mode 100644 cloud/azure/apimanagement/inputs.tf create mode 100644 cloud/azure/apimanagement/monitors-azure-apimanagement.tf diff --git a/cloud/azure/apimanagement/README.md b/cloud/azure/apimanagement/README.md new file mode 100644 index 0000000..e59e81a --- /dev/null +++ b/cloud/azure/apimanagement/README.md @@ -0,0 +1,43 @@ +Azure API Management Datadog monitors +===================================== + +How to use this module +---------------------- +``` +module 
"datadog-monitors-azure-apimanagement" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/apimanagement?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates Datadog monitors with the following checks : + +* Service status +* Failed requests ratio +* Other requests ratio +* Unauthorized requests ratio +* Successful requests ratio + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | +| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | + +Related documentation +--------------------- + +Azure API Management metrics documentation: [https://docs.microsoft.com/en-us/azure/api-management/api-management-howto-use-azure-monitor](https://docs.microsoft.com/en-us/azure/api-management/api-management-howto-use-azure-monitor) diff --git a/cloud/azure/apimanagement/inputs.tf b/cloud/azure/apimanagement/inputs.tf new file mode 100644 index 0000000..002593e --- /dev/null +++ b/cloud/azure/apimanagement/inputs.tf @@ -0,0 +1,46 @@ +# Global Terraform +variable 
"environment" { + description = "Architecture environment" + type = "string" +} + +# Global DataDog +variable "message" { + description = "Message sent when a Redis monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Azure API Management specific +variable "failed_requests_threshold_critical" { + description = "Maximum acceptable percent of failed requests" + default = 5 +} + +variable "other_requests_threshold_critical" { + description = "Maximum acceptable percent of other requests" + default = 5 +} + +variable "unauthorized_requests_threshold_critical" { + description = "Maximum acceptable percent of unauthorized requests" + default = 5 +} + +variable "successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests" + default = 90 +} diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf new file mode 100644 index 0000000..f7a55b1 --- /dev/null +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -0,0 +1,160 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
+ format("dd_monitoring:enabled,dd_azure_apimanagement:enabled,env:%s", var.environment) : + "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "apimgt_status" { + name = "[${var.environment}] API Management status is not ok on {{name}}" + message = "${var.message}" + + query = < ${var.failed_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.failed_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_other_requests" { + name = "[${var.environment}] API Management {{name}} too much other requests" + message = "${var.message}" + + query = < ${var.other_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.other_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_unauthorized_requests" { + name = "[${var.environment}] API Management {{name}} too much unauthorized requests" + message = "${var.message}" + + query = < ${var.unauthorized_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.unauthorized_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 
"${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_successful_requests" { + name = "[${var.environment}] API Management {{name}} successful requests rate too low" + message = "${var.message}" + + query = < Date: Sun, 26 Nov 2017 20:17:14 +0100 Subject: [PATCH 67/93] MON-80 convert all as_count queries to sum --- cloud/azure/iothubs/monitors-iothubs.tf | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 6e1f926..9719e59 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -11,7 +11,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { message = "${var.message}" query = < Date: Sun, 26 Nov 2017 20:26:18 +0100 Subject: [PATCH 68/93] MON-74 convert all as_count queries to sum --- cloud/azure/app-services/monitors-app_services.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 1cff1af..2c2f80e 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -100,7 +100,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "${var.message}" query = < Date: Sun, 26 Nov 2017 20:32:35 +0100 Subject: [PATCH 69/93] MON-78 convert all as_count queries to sum --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index f72af1f..3ad187f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ 
b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -63,7 +63,7 @@ resource "datadog_monitor" "failed_function_requests" { message = "${var.message}" query = < ${var.failed_function_requests_threshold_critical} From 6b9c03947ad23b7e991c8b453582f56e93304db9 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Sun, 26 Nov 2017 20:37:22 +0100 Subject: [PATCH 70/93] MON-90 change no data to false because division --- cloud/azure/apimanagement/monitors-azure-apimanagement.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index f7a55b1..82c3df7 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -145,7 +145,7 @@ resource "datadog_monitor" "apimgt_successful_requests" { } type = "metric alert" - notify_no_data = true + notify_no_data = false notify_audit = false timeout_h = 0 include_tags = true From d3bbb3ced5c4309f5b71622c79ff754d6bdd487a Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Fri, 24 Nov 2017 16:53:25 +0100 Subject: [PATCH 71/93] MON-90 Azure API Management monitors --- cloud/azure/apimanagement/README.md | 43 +++++ cloud/azure/apimanagement/inputs.tf | 46 +++++ .../monitors-azure-apimanagement.tf | 160 ++++++++++++++++++ 3 files changed, 249 insertions(+) create mode 100644 cloud/azure/apimanagement/README.md create mode 100644 cloud/azure/apimanagement/inputs.tf create mode 100644 cloud/azure/apimanagement/monitors-azure-apimanagement.tf diff --git a/cloud/azure/apimanagement/README.md b/cloud/azure/apimanagement/README.md new file mode 100644 index 0000000..e59e81a --- /dev/null +++ b/cloud/azure/apimanagement/README.md @@ -0,0 +1,43 @@ +Azure API Management Datadog monitors +===================================== + +How to use this module +---------------------- +``` +module "datadog-monitors-azure-apimanagement" { + 
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/apimanagement?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates Datadog monitors with the following checks : + +* Service status +* Failed requests ratio +* Other requests ratio +* Unauthorized requests ratio +* Successful requests ratio + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | +| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | + +Related documentation +--------------------- + +Azure API Management metrics documentation: [https://docs.microsoft.com/en-us/azure/api-management/api-management-howto-use-azure-monitor](https://docs.microsoft.com/en-us/azure/api-management/api-management-howto-use-azure-monitor) diff --git a/cloud/azure/apimanagement/inputs.tf b/cloud/azure/apimanagement/inputs.tf new file mode 100644 index 0000000..002593e --- /dev/null +++ b/cloud/azure/apimanagement/inputs.tf @@ -0,0 +1,46 @@ +# Global Terraform +variable "environment" { + description = "Architecture 
environment" + type = "string" +} + +# Global DataDog +variable "message" { + description = "Message sent when a Redis monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Azure API Management specific +variable "failed_requests_threshold_critical" { + description = "Maximum acceptable percent of failed requests" + default = 5 +} + +variable "other_requests_threshold_critical" { + description = "Maximum acceptable percent of other requests" + default = 5 +} + +variable "unauthorized_requests_threshold_critical" { + description = "Maximum acceptable percent of unauthorized requests" + default = 5 +} + +variable "successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests" + default = 90 +} diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf new file mode 100644 index 0000000..f7a55b1 --- /dev/null +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -0,0 +1,160 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
+ format("dd_monitoring:enabled,dd_azure_apimanagement:enabled,env:%s", var.environment) : + "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "apimgt_status" { + name = "[${var.environment}] API Management status is not ok on {{name}}" + message = "${var.message}" + + query = < ${var.failed_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.failed_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_other_requests" { + name = "[${var.environment}] API Management {{name}} too much other requests" + message = "${var.message}" + + query = < ${var.other_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.other_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_unauthorized_requests" { + name = "[${var.environment}] API Management {{name}} too much unauthorized requests" + message = "${var.message}" + + query = < ${var.unauthorized_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.unauthorized_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 
"${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_successful_requests" { + name = "[${var.environment}] API Management {{name}} successful requests rate too low" + message = "${var.message}" + + query = < Date: Mon, 27 Nov 2017 23:00:12 +0100 Subject: [PATCH 72/93] MON-74 add group by to all queries --- cloud/azure/app-services/monitors-app_services.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 2c2f80e..6bf3fd6 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -14,7 +14,7 @@ resource "datadog_monitor" "appservices_response_time" { query = <= ${var.response_time_threshold_critical} EOF @@ -43,7 +43,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { query = <= ${var.memory_usage_threshold_critical} EOF @@ -72,7 +72,7 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { query = < ${var.http_404_errors_count_rate_threshold_critical} EOF @@ -102,7 +102,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { query = < Date: Mon, 27 Nov 2017 23:02:24 +0100 Subject: [PATCH 73/93] MON-75 add region to all group by --- cloud/azure/sql-database/monitors-sql-database-basics.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf index 1e75813..337b28f 100644 --- a/cloud/azure/sql-database/monitors-sql-database-basics.tf +++ b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -12,7 +12,7 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" { query = < ${var.cpu_threshold_critical} EOF @@ -44,7 
+44,7 @@ resource "datadog_monitor" "sql-database_free_space_low" { query = < ${var.diskspace_threshold_critical} EOF @@ -75,7 +75,7 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" { query = < ${var.dtu_threshold_critical} EOF @@ -106,7 +106,7 @@ resource "datadog_monitor" "sql-database_deadlocks_count" { query = < ${var.deadlock_threshold_critical} EOF From 855e52a36fb1288dae41a356a032a56955dded62 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:03:14 +0100 Subject: [PATCH 74/93] MON-76 add region to all group by --- cloud/azure/redis/monitors-azure-redis.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 57b3a6c..8e68558 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -11,7 +11,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.evictedkeys_limit_threshold_critical} EOF @@ -67,7 +67,7 @@ resource "datadog_monitor" "percent_processor_time" { query = < ${var.percent_processor_time_threshold_critical} EOF @@ -98,7 +98,7 @@ resource "datadog_monitor" "server_load" { query = < ${var.server_load_rate_threshold_critical} EOF From 3934e869a1016bbe2aa1637e8bebaea359794d9b Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:19:33 +0100 Subject: [PATCH 75/93] MON-77 improve queries adding as_count --- cloud/azure/eventhub/monitors-eventhub.tf | 34 +++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index ff52507..4627106 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -11,7 +11,7 @@ resource "datadog_monitor" "eventhub_status" { message = "${var.message}" query = < ${var.failed_requests_rate_thresold_critical} + sum(last_5m): ( + 
avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() / + ( + avg:azure.eventhub_namespaces.successful_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) * 100 > ${var.failed_requests_rate_thresold_critical} EOF type = "metric alert" @@ -67,16 +67,16 @@ resource "datadog_monitor" "eventhub_errors" { message = "${var.message}" query = < ${var.errors_rate_thresold_critical} + sum(last_5m): ( + avg:azure.eventhub_namespaces.internal_server_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.server_busy_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) / ( + avg:azure.eventhub_namespaces.successful_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.internal_server_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.server_busy_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) * 100 > ${var.errors_rate_thresold_critical} EOF type = "metric alert" From 00e1ada46ed9af49a0d6c7360e1e4f5e6406fc24 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:29:06 +0100 Subject: [PATCH 76/93] MON-74 fix percent query --- cloud/azure/app-services/monitors-app_services.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 6bf3fd6..aedc748 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -101,8 +101,8 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { query = < Date: Mon, 27 Nov 2017 23:31:10 +0100 Subject: [PATCH 77/93] MON-78 reorder groupy from less to must specific --- .../stream-analytics/monitors-stream-analytics.tf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 3ad187f..fe4e983 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -11,7 +11,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.su_utilization_threshold_critical} EOF type = "metric alert" @@ -64,8 +64,8 @@ resource "datadog_monitor" "failed_function_requests" { query = < ${var.failed_function_requests_threshold_critical} EOF type = "metric alert" @@ -94,7 +94,7 @@ resource "datadog_monitor" "conversion_errors" { query = < ${var.conversion_errors_threshold_critical} EOF type = "metric alert" @@ -123,7 +123,7 @@ resource "datadog_monitor" "runtime_errors" { query = < ${var.runtime_errors_threshold_critical} EOF type = "metric alert" From b30a0e2689716abcfb0de787cb0a8080f929c229 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:32:25 +0100 Subject: [PATCH 78/93] MON-80 add region to group by --- cloud/azure/iothubs/monitors-iothubs.tf | 52 ++++++++++++------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 9719e59..9388f1c 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ 
-12,9 +12,9 @@ resource "datadog_monitor" "too_many_jobs_failed" { query = < ${var.failed_jobs_rate_threshold_critical} EOF @@ -110,7 +110,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.failed_c2d_methods_rate_threshold_critical} EOF @@ -192,9 +192,9 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { query = < ${var.failed_c2d_twin_read_rate_threshold_critical} EOF @@ -225,9 +225,9 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { query = < ${var.failed_c2d_twin_update_rate_threshold_critical} EOF @@ -258,9 +258,9 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { query = < ${var.failed_d2c_twin_read_rate_threshold_critical} EOF @@ -291,9 +291,9 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { query = < ${var.failed_d2c_twin_update_rate_threshold_critical} EOF @@ -324,7 +324,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { query = < ${var.dropped_d2c_telemetry_egress_threshold_critical} EOF @@ -355,7 +355,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { query = < ${var.orphaned_d2c_telemetry_egress_threshold_critical} EOF @@ -386,7 +386,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { query = < ${var.invalid_d2c_telemetry_egress_threshold_critical} EOF @@ -417,7 +417,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { query = < ${var.fallback_d2c_telemetry_egress_threshold_critical} EOF @@ -448,8 +448,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { query = < 0 EOF From 835942e6e1942620826c5120eb3263a23b7038cd Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:33:15 +0100 Subject: [PATCH 79/93] MON-78 add region to group by --- .../stream-analytics/monitors-stream-analytics.tf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf 
b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index fe4e983..3b1324a 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -11,7 +11,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.su_utilization_threshold_critical} EOF type = "metric alert" @@ -64,8 +64,8 @@ resource "datadog_monitor" "failed_function_requests" { query = < ${var.failed_function_requests_threshold_critical} EOF type = "metric alert" @@ -94,7 +94,7 @@ resource "datadog_monitor" "conversion_errors" { query = < ${var.conversion_errors_threshold_critical} EOF type = "metric alert" @@ -123,7 +123,7 @@ resource "datadog_monitor" "runtime_errors" { query = < ${var.runtime_errors_threshold_critical} EOF type = "metric alert" From 4d42b8832513824f8971d8d809807917ae79442d Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:39:54 +0100 Subject: [PATCH 80/93] MON-90 fix queries syntax with as_count --- .../monitors-azure-apimanagement.tf | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index 82c3df7..c427d21 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -13,7 +13,7 @@ resource "datadog_monitor" "apimgt_status" { message = "${var.message}" query = < ${var.failed_requests_threshold_critical} EOF @@ -72,9 +71,8 @@ resource "datadog_monitor" "apimgt_other_requests" { query = < ${var.other_requests_threshold_critical} EOF @@ -103,9 +101,8 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" { query = < ${var.unauthorized_requests_threshold_critical} EOF @@ -134,9 +131,8 @@ resource "datadog_monitor" "apimgt_successful_requests" { query = < Date: Thu, 7 Dec 2017 10:40:04 +0100 Subject: [PATCH 81/93] MON-74 
percent for requests --- cloud/azure/app-services/README.md | 15 ++++------ cloud/azure/app-services/inputs.tf | 30 +++++++------------ .../app-services/monitors-app_services.tf | 23 +++++++------- 3 files changed, 29 insertions(+), 39 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index e56fac2..ab49366 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -19,9 +19,8 @@ Creates a DataDog monitors with the following checks : * Response time * Memory usage count -* HTTP 404 errors -* HTTP 50x errors -* HTTP 20x rate +* HTTP 404 requests +* HTTP 2xx requests Inputs ------ @@ -32,12 +31,10 @@ Inputs | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | -| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | -| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | +| http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | +| http_404_requests_threshold_critical | Maximum critical acceptable percent of 404 errors | string | `40` | no | +| http_404_requests_threshold_warning | Maximum warning acceptable percent of 404 errors | string | `30` | no | | memory_usage_threshold_critical | 
Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | message | Message sent when a monitor is triggered | string | - | yes | diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index c4bc451..541a0e7 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -54,34 +54,26 @@ variable "memory_usage_threshold_warning" { ### HTTP 404 status pages ### ################################# -variable "http_404_errors_count_rate_limit" { - default = 30 +variable "http_404_requests_threshold_critical" { + default = 40 + description = "Maximum critical acceptable percent of 404 errors" } -variable "http_404_errors_count_rate_threshold_critical" { +variable "http_404_requests_threshold_warning" { default = 30 - description = "Alerting threshold (number of requests)" -} - -variable "http_404_errors_count_rate_threshold_warning" { - default = 10 - description = "Warning threshold (number of requests)" + description = "Maximum critical acceptable percent of 404 errors" } ################################# ### HTTP 202 status pages ### ################################# -variable "http_2xx_status_rate_limit" { - default = 30 +variable "http_2xx_requests_threshold_critical" { + default = 90 + description = "Minimum critical acceptable percent of 2xx requests" } -variable "http_2xx_status_rate_threshold_critical" { - default = 0.9 - description = "Alerting threshold (percentage)" -} - -variable "http_2xx_status_rate_threshold_warning" { - default = 0.95 - description = "Warning threshold (percentage)" +variable "http_2xx_requests_threshold_warning" { + default = 95 + description = "Minimum warning acceptable percent of 2xx requests" } diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index aedc748..0abc8fd 100644 --- 
a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -15,7 +15,7 @@ resource "datadog_monitor" "appservices_response_time" { query = <= ${var.response_time_threshold_critical} + ) > ${var.response_time_threshold_critical} EOF evaluation_delay = "${var.delay}" @@ -44,7 +44,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { query = <= ${var.memory_usage_threshold_critical} + ) > ${var.memory_usage_threshold_critical} EOF evaluation_delay = "${var.delay}" @@ -71,17 +71,18 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} + sum(last_5m): ( + avg:azure.app_services.http404{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() / + avg:azure.app_services.requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) * 100 > ${var.http_404_requests_threshold_critical} EOF evaluation_delay = "${var.delay}" new_host_delay = "${var.delay}" thresholds { - warning = "${var.http_404_errors_count_rate_threshold_warning}" - critical = "${var.http_404_errors_count_rate_threshold_critical}" + warning = "${var.http_404_requests_threshold_warning}" + critical = "${var.http_404_requests_threshold_critical}" } notify_no_data = false # Will NOT notify when no data is received @@ -102,16 +103,16 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { query = < Date: Thu, 7 Dec 2017 10:43:23 +0100 Subject: [PATCH 82/93] MON-77 missing parenthesis --- cloud/azure/eventhub/monitors-eventhub.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 4627106..bfb5775 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -35,7 +35,7 @@ resource "datadog_monitor" "eventhub_failed_requests" { 
query = < Date: Thu, 7 Dec 2017 10:59:22 +0100 Subject: [PATCH 83/93] MON-90 fix failed to unauthorized requests --- cloud/azure/apimanagement/monitors-azure-apimanagement.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index c427d21..a1a0457 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -101,7 +101,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" { query = < ${var.unauthorized_requests_threshold_critical} EOF From 886ae437f4ceffe7b8fbfd738724bc1638c2f0df Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 11 Dec 2017 11:41:32 +0100 Subject: [PATCH 84/93] MON-74 Fix non existent variable --- cloud/azure/app-services/monitors-app_services.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 0abc8fd..669e9db 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -66,7 +66,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services HTTP errors > ${var.http_404_errors_count_rate_limit} limit on {{name}}" + name = "[${var.environment}] App Services HTTP errors > {{value}}% limit on {{name}}" type = "metric alert" message = "${var.message}" From 7de2bf4aca61736a28d143260e4d91799bca2ec6 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:13:09 +0100 Subject: [PATCH 85/93] MON-74 decrease thresholds for 404 errors --- cloud/azure/app-services/inputs.tf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 541a0e7..96c2892 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -55,17 +55,17 @@ variable "memory_usage_threshold_warning" { ################################# variable "http_404_requests_threshold_critical" { - default = 40 - description = "Maximum critical acceptable percent of 404 errors" -} - -variable "http_404_requests_threshold_warning" { default = 30 description = "Maximum critical acceptable percent of 404 errors" } +variable "http_404_requests_threshold_warning" { + default = 15 + description = "Maximum warning acceptable percent of 404 errors" +} + ################################# -### HTTP 202 status pages ### +### HTTP 2xx status pages ### ################################# variable "http_2xx_requests_threshold_critical" { From 6cb41b8fbb08f424f59ab13187028809d2b2a984 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:14:30 +0100 Subject: [PATCH 86/93] MON-74 fix response time monitor name --- cloud/azure/app-services/monitors-app_services.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 669e9db..d473fd5 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -8,7 +8,7 @@ data "template_file" "filter" { # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time > ${var.response_time_threshold_critical}s on {{name}}" + name = "[${var.environment}] App Services response time of {{value}}s is to high on {{name}}" type = "metric alert" message = "${var.message}" From 3a56b974c106da967214dc387020623a5e427da5 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:28:11 +0100 Subject: [PATCH 87/93] MON-74 Add 5xx 
errors monitor --- cloud/azure/app-services/README.md | 3 ++ cloud/azure/app-services/inputs.tf | 14 +++++++ .../app-services/monitors-app_services.tf | 38 +++++++++++++++++-- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index ab49366..b439492 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -19,6 +19,7 @@ Creates a DataDog monitors with the following checks : * Response time * Memory usage count +* HTTP 5xx requests * HTTP 404 requests * HTTP 2xx requests @@ -35,6 +36,8 @@ Inputs | http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | | http_404_requests_threshold_critical | Maximum critical acceptable percent of 404 errors | string | `40` | no | | http_404_requests_threshold_warning | Maximum warning acceptable percent of 404 errors | string | `30` | no | +| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `40` | no | +| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `30` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | message | Message sent when a monitor is triggered | string | - | yes | diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 96c2892..bc50156 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -50,6 +50,20 @@ variable "memory_usage_threshold_warning" { description = "Warning threshold in MiB" } +################################# +### HTTP 5xx status pages ### +################################# + +variable "http_5xx_requests_threshold_critical" { + default = 20 + description = "Maximum critical acceptable percent of 5xx errors" +} + +variable 
"http_5xx_requests_threshold_warning" { + default = 10 + description = "Maximum warning acceptable percent of 5xx errors" +} + ################################# ### HTTP 404 status pages ### ################################# diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index d473fd5..3f8b49b 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -64,9 +64,39 @@ resource "datadog_monitor" "appservices_memory_usage_count" { tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } -# Monitoring App Services 404 errors rate +# Monitoring App Services 5xx errors percent +resource "datadog_monitor" "appservices_http_5xx_errors_count" { + name = "[${var.environment}] App Services HTTP 5xx errors is {{value}}% above the limit on {{name}}" + type = "metric alert" + message = "${var.message}" + + query = < ${var.http_5xx_requests_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + warning = "${var.http_5xx_requests_threshold_warning}" + critical = "${var.http_5xx_requests_threshold_critical}" + } + + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 + require_full_window = true + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] +} + +# Monitoring App Services 404 errors percent resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services HTTP errors > {{value}}% limit on {{name}}" + name = "[${var.environment}] App Services HTTP 404 errors is {{value}}% above the limit on {{name}}" type = "metric alert" message = "${var.message}" @@ -94,9 +124,9 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { tags = ["env:${var.environment}", "resource:appservices", 
"team:azure", "provider:azure"] } -# Monitoring App Services HTTP 2xx status pages rate +# Monitoring App Services HTTP 2xx status pages percent resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}" + name = "[${var.environment}] App Services HTTP 2xx responses is {{value}}% below the limit on {{name}}" type = "metric alert" message = "${var.message}" From e3e3469cfbac1f36c0e8abfe5a8447145447bfbd Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:29:25 +0100 Subject: [PATCH 88/93] MON-74 Change 404 errors to 4xx --- cloud/azure/app-services/README.md | 6 +++--- cloud/azure/app-services/inputs.tf | 10 +++++----- cloud/azure/app-services/monitors-app_services.tf | 14 +++++++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index b439492..fac9581 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -20,7 +20,7 @@ Creates a DataDog monitors with the following checks : * Response time * Memory usage count * HTTP 5xx requests -* HTTP 404 requests +* HTTP 4xx requests * HTTP 2xx requests Inputs @@ -34,8 +34,8 @@ Inputs | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | | http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | -| http_404_requests_threshold_critical | Maximum critical acceptable percent of 404 errors | string | `40` | no | -| http_404_requests_threshold_warning | Maximum warning acceptable percent of 404 errors | string | `30` | no | +| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `40` | no | +| 
http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `30` | no | | http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `40` | no | | http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `30` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index bc50156..3085251 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -65,17 +65,17 @@ variable "http_5xx_requests_threshold_warning" { } ################################# -### HTTP 404 status pages ### +### HTTP 4xx status pages ### ################################# -variable "http_404_requests_threshold_critical" { +variable "http_4xx_requests_threshold_critical" { default = 30 - description = "Maximum critical acceptable percent of 404 errors" + description = "Maximum critical acceptable percent of 4xx errors" } -variable "http_404_requests_threshold_warning" { +variable "http_4xx_requests_threshold_warning" { default = 15 - description = "Maximum warning acceptable percent of 404 errors" + description = "Maximum warning acceptable percent of 4xx errors" } ################################# diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 3f8b49b..02cf2d9 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -94,25 +94,25 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" { tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } -# Monitoring App Services 404 errors percent -resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services HTTP 404 errors is {{value}}% above the limit on 
{{name}}" +# Monitoring App Services 4xx errors percent +resource "datadog_monitor" "appservices_http_4xx_errors_count" { + name = "[${var.environment}] App Services HTTP 4xx errors is {{value}}% above the limit on {{name}}" type = "metric alert" message = "${var.message}" query = < ${var.http_404_requests_threshold_critical} + ) * 100 > ${var.http_4xx_requests_threshold_critical} EOF evaluation_delay = "${var.delay}" new_host_delay = "${var.delay}" thresholds { - warning = "${var.http_404_requests_threshold_warning}" - critical = "${var.http_404_requests_threshold_critical}" + warning = "${var.http_4xx_requests_threshold_warning}" + critical = "${var.http_4xx_requests_threshold_critical}" } notify_no_data = false # Will NOT notify when no data is received From b2d807fa46495572e8981ae749fa6ca7ad854826 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 15:51:55 +0100 Subject: [PATCH 89/93] MON-74 update readme with new thresholds --- cloud/azure/app-services/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index fac9581..dc9e526 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -34,10 +34,10 @@ Inputs | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | | http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | -| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `40` | no | -| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `30` | no | -| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `40` | no | -| http_5xx_requests_threshold_warning | Maximum 
warning acceptable percent of 5xx errors | string | `30` | no | +| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | +| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | +| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | +| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | message | Message sent when a monitor is triggered | string | - | yes | From 66747bda71b12563f8ffcc026a0dbc1bcaed4844 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 15:59:56 +0100 Subject: [PATCH 90/93] MON-90 change avg to sum for all as_count queries --- cloud/azure/apimanagement/monitors-azure-apimanagement.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index a1a0457..2a23126 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -40,7 +40,7 @@ resource "datadog_monitor" "apimgt_failed_requests" { message = "${var.message}" query = < ${var.failed_requests_threshold_critical} @@ -70,7 +70,7 @@ resource "datadog_monitor" "apimgt_other_requests" { message = "${var.message}" query = < ${var.other_requests_threshold_critical} @@ -100,7 +100,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" { message = "${var.message}" query = < ${var.unauthorized_requests_threshold_critical} @@ -130,7 +130,7 @@ resource "datadog_monitor" "apimgt_successful_requests" { message = "${var.message}" query = < Date: Mon, 27 Nov 2017 
17:13:45 +0100 Subject: [PATCH 91/93] MON-73 Azure managed services monitors base feature --- cloud/azure/README.md | 118 +++++++ cloud/azure/inputs.tf | 395 ++++++++++++++++++++++++ cloud/azure/iothubs/README.md | 3 +- cloud/azure/iothubs/inputs.tf | 11 +- cloud/azure/iothubs/monitors-iothubs.tf | 72 ++--- cloud/azure/monitors.tf | 166 ++++++++++ 6 files changed, 715 insertions(+), 50 deletions(-) create mode 100644 cloud/azure/README.md create mode 100644 cloud/azure/inputs.tf create mode 100644 cloud/azure/monitors.tf diff --git a/cloud/azure/README.md b/cloud/azure/README.md new file mode 100644 index 0000000..5dcff20 --- /dev/null +++ b/cloud/azure/README.md @@ -0,0 +1,118 @@ +Azure monitors +============== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a set of Azure DataDog monitors for the following components : + +* Azure App Services monitors +* Azure SQL monitors +* Azure Redis monitors +* Azure Event Hub monitors +* Azure Stream Analytics monitors +* Azure Storage monitors +* Azure IOT Hub monitors +* Azure API Management monitors + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | +| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | +| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | +| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | +| 
appservices_http_2xx_status_rate_limit | | string | `30` | no | +| appservices_http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | +| appservices_http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | +| appservices_http_404_errors_count_rate_limit | | string | `30` | no | +| appservices_http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | +| appservices_http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | +| appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | +| appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | +| appservices_response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | +| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | +| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | +| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| iothub_dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | 
string | `1000` | no | +| iothub_dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | +| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_queryjobs_rate_threshold_critical | 
QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | +| iothub_fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| iothub_invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | +| iothub_invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | +| iothub_orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | +| iothub_orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| non_taggable_filter_tags | Tags used for filtering for components without tag support | string | `*` | no | +| redis_evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | +| redis_evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | +| redis_percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | +| redis_percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| redis_server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | +| redis_server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | +| sqldatabase_cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | +| 
sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `` | no | +| sqldatabase_deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | +| sqldatabase_diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | +| sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | +| sqldatabase_dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | +| sqldatabase_dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | +| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `15` | no | +| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no | +| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` | no | +| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `1000` | no | +| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no | +| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no | +| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no | +| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no | +| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no | +| streamanalytics_conversion_errors_threshold_critical | 
Conversion errors limit (critical threshold) | string | `10` | no | +| streamanalytics_conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | +| streamanalytics_failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| streamanalytics_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | +| streamanalytics_runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no | +| streamanalytics_runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no | +| streamanalytics_su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no | +| streamanalytics_su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) + +Azure metrics documentation: [https://docs.microsoft.com/en-us/azure/monitoring-and-diagnostics/monitoring-overview-metrics](https://docs.microsoft.com/en-us/azure/monitoring-and-diagnostics/monitoring-overview-metrics) diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf new file mode 100644 index 0000000..5526988 --- /dev/null +++ b/cloud/azure/inputs.tf @@ -0,0 +1,395 @@ +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "message" { + description = "Message sent when a monitor is triggered" + type = "string" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = 
"Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "non_taggable_filter_tags" { + description = "Tags used for filtering for components without tag support" + default = "*" +} + +# Azure API Management specific variables +variable "apimanagement_failed_requests_threshold_critical" { + description = "Maximum acceptable percent of failed requests" + default = 5 +} + +variable "apimanagement_other_requests_threshold_critical" { + description = "Maximum acceptable percent of other requests" + default = 5 +} + +variable "apimanagement_unauthorized_requests_threshold_critical" { + description = "Maximum acceptable percent of unauthorized requests" + default = 5 +} + +variable "apimanagement_successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests" + default = 90 +} + +# Azure App Services specific variables +variable "appservices_response_time_threshold_critical" { + default = 0.8 + description = "Alerting threshold in seconds" +} + +variable "appservices_response_time_threshold_warning" { + default = 0.4 + description = "Warning threshold in seconds" +} + +variable "appservices_memory_usage_threshold_critical" { + default = 52430000 + description = "Alerting threshold in Mib" +} + +variable "appservices_memory_usage_threshold_warning" { + default = 33550000 + description = "Warning threshold in MiB" +} + +variable "appservices_http_404_errors_count_rate_limit" { + default = 30 +} + +variable "appservices_http_404_errors_count_rate_threshold_critical" { + default = 30 + description = "Alerting threshold (number of requests)" +} + +variable "appservices_http_404_errors_count_rate_threshold_warning" { + default = 10 + description = "Warning threshold (number of requests)" +} + +variable "appservices_http_2xx_status_rate_limit" { + default = 30 +} + +variable "appservices_http_2xx_status_rate_threshold_critical" { + default = 0.9 + description = "Alerting threshold 
(percentage)" +} + +variable "appservices_http_2xx_status_rate_threshold_warning" { + default = 0.95 + description = "Warning threshold (percentage)" +} + +# Azure Event Hub specific variables +variable "eventhub_failed_requests_rate_thresold_critical" { + description = "Failed requests ratio (percentage) to trigger the critical alert" + default = 3 +} + +variable "eventhub_failed_requests_rate_thresold_warning" { + description = "Failed requests ratio (percentage) to trigger a warning alert" + default = 1 +} + +variable "eventhub_errors_rate_thresold_critical" { + description = "Errors ratio (percentage) to trigger the critical alert" + default = 3 +} + +variable "eventhub_errors_rate_thresold_warning" { + description = "Errors ratio (percentage) to trigger a warning alert" + default = 1 +} + +# IOT Hub specific variables +variable "iothub_failed_jobs_rate_threshold_warning" { + description = "Jobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_jobs_rate_threshold_critical" { + description = "Jobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_listjobs_rate_threshold_warning" { + description = "ListJobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_listjobs_rate_threshold_critical" { + description = "ListJobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_queryjobs_rate_threshold_warning" { + description = "QueryJobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_queryjobs_rate_threshold_critical" { + description = "QueryJobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_c2d_methods_rate_threshold_warning" { + description = "C2D Methods Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_c2d_methods_rate_threshold_critical" { + description = "C2D Methods Failed rate limit (critical threshold)" + default = 10 +} + 
+variable "iothub_failed_c2d_twin_read_rate_threshold_warning" { + description = "C2D Twin Read Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_c2d_twin_read_rate_threshold_critical" { + description = "C2D Twin Read Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_c2d_twin_update_rate_threshold_warning" { + description = "C2D Twin Update Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_c2d_twin_update_rate_threshold_critical" { + description = "C2D Twin Update Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_d2c_twin_read_rate_threshold_warning" { + description = "D2C Twin Read Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_d2c_twin_read_rate_threshold_critical" { + description = "D2C Twin Read Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_d2c_twin_update_rate_threshold_warning" { + description = "D2C Twin Update Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_d2c_twin_update_rate_threshold_critical" { + description = "D2C Twin Update Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_dropped_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Dropped limit (warning threshold)" + default = 500 +} + +variable "iothub_dropped_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Dropped limit (critical threshold)" + default = 1000 +} + +variable "iothub_orphaned_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Orphaned limit (warning threshold)" + default = 500 +} + +variable "iothub_orphaned_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Orphaned limit (critical threshold)" + default = 1000 +} + +variable "iothub_invalid_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Invalid 
limit (warning threshold)" + default = 500 +} + +variable "iothub_invalid_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Invalid limit (critical threshold)" + default = 1000 +} + +variable "iothub_fallback_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Fallback limit (warning threshold)" + default = 500 +} + +variable "iothub_fallback_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Fallback limit (critical threshold)" + default = 1000 +} + +# Azure Redis specific variables +variable "redis_evictedkeys_limit_threshold_warning" { + description = "Evicted keys limit (warning threshold)" + default = 0 +} + +variable "redis_evictedkeys_limit_threshold_critical" { + description = "Evicted keys limit (critical threshold)" + default = 100 +} + +variable "redis_percent_processor_time_threshold_critical" { + description = "Processor time percent (critical threshold)" + default = 80 +} + +variable "redis_percent_processor_time_threshold_warning" { + description = "Processor time percent (warning threshold)" + default = 60 +} + +variable "redis_server_load_rate_threshold_critical" { + description = "Server CPU load rate (critical threshold)" + default = 90 +} + +variable "redis_server_load_rate_threshold_warning" { + description = "Server CPU load rate (warning threshold)" + default = 70 +} + +# Azure SQL Database specific variables +variable "sqldatabase_cpu_threshold_warning" { + description = "CPU usage in percent (warning threshold)" + default = "" +} + +variable "sqldatabase_cpu_threshold_critical" { + description = "CPU usage in percent (critical threshold)" + default = "90" +} + +variable "sqldatabase_diskspace_threshold_warning" { + description = "Disk space used in percent (warning threshold)" + default = "80" +} + +variable "sqldatabase_diskspace_threshold_critical" { + description = "Disk space used in percent (critical threshold)" + default = "90" +} + +variable 
"sqldatabase_dtu_threshold_warning" { + description = "Amount of DTU used (warning threshold)" + default = "85" +} + +variable "sqldatabase_dtu_threshold_critical" { + description = "Amount of DTU used (critical threshold)" + default = "90" +} + +variable "sqldatabase_deadlock_threshold_critical" { + description = "Amount of Deadlocks (critical threshold)" + default = "1" +} + +# Azure Storage specific variables +variable "storage_availability_threshold_critical" { + description = "Minimum acceptable percent of availability for a storage" + default = 90 +} + +variable "storage_successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests for a storage" + default = 90 +} + +variable "storage_latency_threshold_critical" { + description = "Maximum acceptable end to end latency (ms) for a storage" + default = 1000 +} + +variable "storage_timeout_error_requests_threshold_critical" { + description = "Maximum acceptable percent of timeout error requests for a storage" + default = 5 +} + +variable "storage_network_error_requests_threshold_critical" { + description = "Maximum acceptable percent of network error requests for a storage" + default = 5 +} + +variable "storage_throttling_error_requests_threshold_critical" { + description = "Maximum acceptable percent of throttling error requests for a storage" + default = 10 +} + +variable "storage_server_other_error_requests_threshold_critical" { + description = "Maximum acceptable percent of server other error requests for a storage" + default = 10 +} + +variable "storage_client_other_error_requests_threshold_critical" { + description = "Maximum acceptable percent of client other error requests for a storage" + default = 15 +} + +variable "storage_authorization_error_requests_threshold_critical" { + description = "Maximum acceptable percent of authorization error requests for a storage" + default = 15 +} + +# Azure Stream Analytics specific variables +variable 
"streamanalytics_su_utilization_threshold_warning" { + description = "Streaming Unit utilization rate limit (warning threshold)" + default = 60 +} + +variable "streamanalytics_su_utilization_threshold_critical" { + description = "Streaming Unit utilization rate limit (critical threshold)" + default = 80 +} + +variable "streamanalytics_function_requests_threshold_warning" { + description = "Failed Function Request rate limit (warning threshold)" + default = 0 +} + +variable "streamanalytics_failed_function_requests_threshold_critical" { + description = "Failed Function Request rate limit (critical threshold)" + default = 10 +} + +variable "streamanalytics_conversion_errors_threshold_warning" { + description = "Conversion errors limit (warning threshold)" + default = 0 +} + +variable "streamanalytics_conversion_errors_threshold_critical" { + description = "Conversion errors limit (critical threshold)" + default = 10 +} + +variable "streamanalytics_runtime_errors_threshold_warning" { + description = "Runtime errors limit (warning threshold)" + default = 0 +} + +variable "streamanalytics_runtime_errors_threshold_critical" { + description = "Runtime errors limit (critical threshold)" + default = 10 +} diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 5187715..e594a65 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -60,8 +60,7 @@ Inputs | failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | | fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | -| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | -| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| 
filter_tags | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | | invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | | message | Message sent when an alert is triggered | string | - | yes | diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 1b1348f..68c9965 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -14,14 +14,9 @@ variable "message" { description = "Message sent when an alert is triggered" } -variable "filter_tags_use_defaults" { - description = "Use default filter tags convention" - default = "true" -} - -variable "filter_tags_custom" { - description = "Tags used for custom filtering when filter_tags_use_defaults is false" - default = "*" +variable "filter_tags" { + description = "Tags used for filtering" + default = "*" } # Azure IOT hubs specific diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 9388f1c..5040c58 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,20 +1,12 @@ -data "template_file" "filter" { - template = "$${filter}" - - vars { - filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_iothub:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" - } -} - resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.message}" query = < ${var.failed_jobs_rate_threshold_critical} EOF @@ -45,9 +37,9 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { query = < ${var.failed_listjobs_rate_threshold_critical} EOF @@ -78,9 +70,9 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { query = < ${var.failed_queryjobs_rate_threshold_critical} EOF @@ -110,7 +102,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.failed_c2d_methods_rate_threshold_critical} EOF @@ -192,9 +184,9 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { query = < ${var.failed_c2d_twin_read_rate_threshold_critical} EOF @@ -225,9 +217,9 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { query = < ${var.failed_c2d_twin_update_rate_threshold_critical} EOF @@ -258,9 +250,9 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { query = < ${var.failed_d2c_twin_read_rate_threshold_critical} EOF @@ -291,9 +283,9 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { query = < ${var.failed_d2c_twin_update_rate_threshold_critical} EOF @@ -324,7 +316,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { query = < ${var.dropped_d2c_telemetry_egress_threshold_critical} EOF @@ -355,7 +347,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { query = < ${var.orphaned_d2c_telemetry_egress_threshold_critical} EOF @@ -386,7 +378,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { query = < ${var.invalid_d2c_telemetry_egress_threshold_critical} EOF @@ -417,7 +409,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { query = < ${var.fallback_d2c_telemetry_egress_threshold_critical} EOF @@ -448,8 +440,8 
@@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { query = < 0 EOF diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf new file mode 100644 index 0000000..356d6c9 --- /dev/null +++ b/cloud/azure/monitors.tf @@ -0,0 +1,166 @@ +module "apimanagement" { + source = "./apimanagement" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + failed_requests_threshold_critical = "${var.apimanagement_failed_requests_threshold_critical}" + other_requests_threshold_critical = "${var.apimanagement_other_requests_threshold_critical}" + successful_requests_threshold_critical = "${var.apimanagement_successful_requests_threshold_critical}" + unauthorized_requests_threshold_critical = "${var.apimanagement_unauthorized_requests_threshold_critical}" +} + +module "appservices" { + source = "./app-services" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + http_2xx_status_rate_limit = "${var.appservices_http_2xx_status_rate_limit}" + http_2xx_status_rate_threshold_critical = "${var.appservices_http_2xx_status_rate_threshold_critical}" + http_2xx_status_rate_threshold_warning = "${var.appservices_http_2xx_status_rate_threshold_warning}" + http_404_errors_count_rate_limit = "${var.appservices_http_404_errors_count_rate_limit}" + http_404_errors_count_rate_threshold_critical = "${var.appservices_http_404_errors_count_rate_threshold_critical}" + http_404_errors_count_rate_threshold_warning = "${var.appservices_http_404_errors_count_rate_threshold_warning}" + memory_usage_threshold_critical = "${var.appservices_memory_usage_threshold_critical}" + memory_usage_threshold_warning = "${var.appservices_memory_usage_threshold_warning}" + 
response_time_threshold_critical = "${var.appservices_response_time_threshold_critical}" + response_time_threshold_warning = "${var.appservices_response_time_threshold_warning}" +} + +module "eventhub" { + source = "./eventhub" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + errors_rate_thresold_critical = "${var.eventhub_errors_rate_thresold_critical}" + errors_rate_thresold_warning = "${var.eventhub_errors_rate_thresold_warning}" + failed_requests_rate_thresold_critical = "${var.eventhub_failed_requests_rate_thresold_critical}" + failed_requests_rate_thresold_warning = "${var.eventhub_failed_requests_rate_thresold_warning}" +} + +module "iothub" { + source = "./iothubs" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags = "${var.non_taggable_filter_tags}" + + dropped_d2c_telemetry_egress_threshold_critical = "${var.iothub_dropped_d2c_telemetry_egress_threshold_critical}" + dropped_d2c_telemetry_egress_threshold_warning = "${var.iothub_dropped_d2c_telemetry_egress_threshold_warning}" + failed_c2d_methods_rate_threshold_critical = "${var.iothub_failed_c2d_methods_rate_threshold_critical}" + failed_c2d_methods_rate_threshold_warning = "${var.iothub_failed_c2d_methods_rate_threshold_warning}" + failed_c2d_twin_read_rate_threshold_critical = "${var.iothub_failed_c2d_twin_read_rate_threshold_critical}" + failed_c2d_twin_read_rate_threshold_warning = "${var.iothub_failed_c2d_twin_read_rate_threshold_warning}" + failed_c2d_twin_update_rate_threshold_critical = "${var.iothub_failed_c2d_twin_update_rate_threshold_critical}" + failed_c2d_twin_update_rate_threshold_warning = "${var.iothub_failed_c2d_twin_update_rate_threshold_warning}" + failed_d2c_twin_read_rate_threshold_critical = "${var.iothub_failed_d2c_twin_read_rate_threshold_critical}" + 
failed_d2c_twin_read_rate_threshold_warning = "${var.iothub_failed_d2c_twin_read_rate_threshold_warning}" + failed_d2c_twin_update_rate_threshold_critical = "${var.iothub_failed_d2c_twin_update_rate_threshold_critical}" + failed_d2c_twin_update_rate_threshold_warning = "${var.iothub_failed_d2c_twin_update_rate_threshold_warning}" + failed_jobs_rate_threshold_critical = "${var.iothub_failed_jobs_rate_threshold_critical}" + failed_jobs_rate_threshold_warning = "${var.iothub_failed_jobs_rate_threshold_warning}" + failed_listjobs_rate_threshold_critical = "${var.iothub_failed_listjobs_rate_threshold_critical}" + failed_listjobs_rate_threshold_warning = "${var.iothub_failed_listjobs_rate_threshold_warning}" + failed_queryjobs_rate_threshold_critical = "${var.iothub_failed_queryjobs_rate_threshold_critical}" + failed_queryjobs_rate_threshold_warning = "${var.iothub_failed_queryjobs_rate_threshold_warning}" + fallback_d2c_telemetry_egress_threshold_critical = "${var.iothub_fallback_d2c_telemetry_egress_threshold_critical}" + fallback_d2c_telemetry_egress_threshold_warning = "${var.iothub_fallback_d2c_telemetry_egress_threshold_warning}" + invalid_d2c_telemetry_egress_threshold_critical = "${var.iothub_invalid_d2c_telemetry_egress_threshold_critical}" + invalid_d2c_telemetry_egress_threshold_warning = "${var.iothub_invalid_d2c_telemetry_egress_threshold_warning}" + orphaned_d2c_telemetry_egress_threshold_critical = "${var.iothub_orphaned_d2c_telemetry_egress_threshold_critical}" + orphaned_d2c_telemetry_egress_threshold_warning = "${var.iothub_orphaned_d2c_telemetry_egress_threshold_warning}" +} + +module "redis" { + source = "./redis" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + evictedkeys_limit_threshold_critical = "${var.redis_evictedkeys_limit_threshold_critical}" + evictedkeys_limit_threshold_warning = 
"${var.redis_evictedkeys_limit_threshold_warning}" + percent_processor_time_threshold_critical = "${var.redis_percent_processor_time_threshold_critical}" + percent_processor_time_threshold_warning = "${var.redis_percent_processor_time_threshold_warning}" + server_load_rate_threshold_critical = "${var.redis_server_load_rate_threshold_critical}" + server_load_rate_threshold_warning = "${var.redis_server_load_rate_threshold_warning}" +} + +module "sqldatabase" { + source = "./sql-database" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + cpu_threshold_critical = "${var.sqldatabase_cpu_threshold_critical}" + cpu_threshold_warning = "${var.sqldatabase_cpu_threshold_warning}" + deadlock_threshold_critical = "${var.sqldatabase_deadlock_threshold_critical}" + diskspace_threshold_critical = "${var.sqldatabase_diskspace_threshold_critical}" + diskspace_threshold_warning = "${var.sqldatabase_diskspace_threshold_warning}" + dtu_threshold_critical = "${var.sqldatabase_dtu_threshold_critical}" + dtu_threshold_warning = "${var.sqldatabase_dtu_threshold_warning}" +} + +module "storage" { + source = "./storage" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + authorization_error_requests_threshold_critical = "${var.storage_authorization_error_requests_threshold_critical}" + availability_threshold_critical = "${var.storage_availability_threshold_critical}" + client_other_error_requests_threshold_critical = "${var.storage_client_other_error_requests_threshold_critical}" + latency_threshold_critical = "${var.storage_latency_threshold_critical}" + network_error_requests_threshold_critical = "${var.storage_network_error_requests_threshold_critical}" + 
server_other_error_requests_threshold_critical = "${var.storage_server_other_error_requests_threshold_critical}" + successful_requests_threshold_critical = "${var.storage_successful_requests_threshold_critical}" + throttling_error_requests_threshold_critical = "${var.storage_throttling_error_requests_threshold_critical}" + timeout_error_requests_threshold_critical = "${var.storage_timeout_error_requests_threshold_critical}" +} + +module "streamanalytics" { + source = "./stream-analytics" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + conversion_errors_threshold_critical = "${var.streamanalytics_conversion_errors_threshold_critical}" + conversion_errors_threshold_warning = "${var.streamanalytics_conversion_errors_threshold_warning}" + failed_function_requests_threshold_critical = "${var.streamanalytics_failed_function_requests_threshold_critical}" + function_requests_threshold_warning = "${var.streamanalytics_function_requests_threshold_warning}" + runtime_errors_threshold_critical = "${var.streamanalytics_runtime_errors_threshold_critical}" + runtime_errors_threshold_warning = "${var.streamanalytics_runtime_errors_threshold_warning}" + su_utilization_threshold_critical = "${var.streamanalytics_su_utilization_threshold_critical}" + su_utilization_threshold_warning = "${var.streamanalytics_su_utilization_threshold_warning}" +} From 2680f12280644c369c322246c96dbf8933325247 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Fri, 15 Dec 2017 17:04:33 +0100 Subject: [PATCH 92/93] MON-73 Update app-services monitors input mapping --- cloud/azure/README.md | 12 ++++++------ cloud/azure/inputs.tf | 38 ++++++++++++++++++++------------------ cloud/azure/monitors.tf | 20 ++++++++++---------- 3 files changed, 36 insertions(+), 34 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index 
5dcff20..124fd89 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -35,12 +35,12 @@ Inputs | apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | | apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | | apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | -| appservices_http_2xx_status_rate_limit | | string | `30` | no | -| appservices_http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | -| appservices_http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| appservices_http_404_errors_count_rate_limit | | string | `30` | no | -| appservices_http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | -| appservices_http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| appservices_http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | +| appservices_http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | +| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | +| appservices_http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | +| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | +| appservices_http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | | appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | 
appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index 5526988..cf4510a 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -70,32 +70,34 @@ variable "appservices_memory_usage_threshold_warning" { description = "Warning threshold in MiB" } -variable "appservices_http_404_errors_count_rate_limit" { - default = 30 -} - -variable "appservices_http_404_errors_count_rate_threshold_critical" { +variable "appservices_http_4xx_requests_threshold_critical" { default = 30 - description = "Alerting threshold (number of requests)" + description = "Maximum critical acceptable percent of 4xx errors" } -variable "appservices_http_404_errors_count_rate_threshold_warning" { +variable "appservices_http_4xx_requests_threshold_warning" { + default = 15 + description = "Maximum warning acceptable percent of 4xx errors" +} + +variable "appservices_http_5xx_requests_threshold_critical" { + default = 20 + description = "Maximum critical acceptable percent of 5xx errors" +} + +variable "appservices_http_5xx_requests_threshold_warning" { default = 10 - description = "Warning threshold (number of requests)" + description = "Maximum warning acceptable percent of 5xx errors" } -variable "appservices_http_2xx_status_rate_limit" { - default = 30 +variable "appservices_http_2xx_requests_threshold_critical" { + default = 90 + description = "Minimum critical acceptable percent of 2xx requests" } -variable "appservices_http_2xx_status_rate_threshold_critical" { - default = 0.9 - description = "Alerting threshold (percentage)" -} - -variable "appservices_http_2xx_status_rate_threshold_warning" { - default = 0.95 - description = "Warning threshold (percentage)" +variable "appservices_http_2xx_requests_threshold_warning" { + default = 95 + description = "Minimum warning acceptable 
percent of 2xx requests" } # Azure Event Hub specific variables diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index 356d6c9..fc9aeee 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -24,16 +24,16 @@ module "appservices" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" - http_2xx_status_rate_limit = "${var.appservices_http_2xx_status_rate_limit}" - http_2xx_status_rate_threshold_critical = "${var.appservices_http_2xx_status_rate_threshold_critical}" - http_2xx_status_rate_threshold_warning = "${var.appservices_http_2xx_status_rate_threshold_warning}" - http_404_errors_count_rate_limit = "${var.appservices_http_404_errors_count_rate_limit}" - http_404_errors_count_rate_threshold_critical = "${var.appservices_http_404_errors_count_rate_threshold_critical}" - http_404_errors_count_rate_threshold_warning = "${var.appservices_http_404_errors_count_rate_threshold_warning}" - memory_usage_threshold_critical = "${var.appservices_memory_usage_threshold_critical}" - memory_usage_threshold_warning = "${var.appservices_memory_usage_threshold_warning}" - response_time_threshold_critical = "${var.appservices_response_time_threshold_critical}" - response_time_threshold_warning = "${var.appservices_response_time_threshold_warning}" + http_2xx_requests_threshold_critical = "${var.appservices_http_2xx_requests_threshold_critical}" + http_2xx_requests_threshold_warning = "${var.appservices_http_2xx_requests_threshold_warning}" + http_5xx_requests_threshold_critical = "${var.appservices_http_5xx_requests_threshold_critical}" + http_5xx_requests_threshold_warning = "${var.appservices_http_5xx_requests_threshold_warning}" + http_4xx_requests_threshold_critical = "${var.appservices_http_4xx_requests_threshold_critical}" + http_4xx_requests_threshold_warning = "${var.appservices_http_4xx_requests_threshold_warning}" + memory_usage_threshold_critical = 
"${var.appservices_memory_usage_threshold_critical}" + memory_usage_threshold_warning = "${var.appservices_memory_usage_threshold_warning}" + response_time_threshold_critical = "${var.appservices_response_time_threshold_critical}" + response_time_threshold_warning = "${var.appservices_response_time_threshold_warning}" } module "eventhub" { From a3f7795ceb0120bb1e75e911773c686ec24968bc Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Fri, 15 Dec 2017 17:27:39 +0100 Subject: [PATCH 93/93] MON-73 Fix some default values and README files --- cloud/azure/README.md | 2 +- cloud/azure/inputs.tf | 2 +- cloud/azure/sql-database/README.md | 2 +- cloud/azure/sql-database/inputs.tf | 2 +- cloud/azure/storage/README.md | 22 +++++++++++----------- cloud/azure/stream-analytics/README.md | 18 ++++++++---------- 6 files changed, 23 insertions(+), 25 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index 124fd89..5d0cac8 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -86,7 +86,7 @@ Inputs | redis_server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | | redis_server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | | sqldatabase_cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | -| sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `` | no | +| sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | | sqldatabase_deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | | sqldatabase_diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | | sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index cf4510a..775fc3e 100644 --- 
a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -276,7 +276,7 @@ variable "redis_server_load_rate_threshold_warning" { # Azure SQL Database specific variables variable "sqldatabase_cpu_threshold_warning" { description = "CPU usage in percent (warning threshold)" - default = "" + default = "80" } variable "sqldatabase_cpu_threshold_critical" { diff --git a/cloud/azure/sql-database/README.md b/cloud/azure/sql-database/README.md index 7d815e3..8f42bde 100644 --- a/cloud/azure/sql-database/README.md +++ b/cloud/azure/sql-database/README.md @@ -28,7 +28,7 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| | cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | -| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `` | no | +| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | | deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | diff --git a/cloud/azure/sql-database/inputs.tf b/cloud/azure/sql-database/inputs.tf index 9ddab06..aa81cfb 100644 --- a/cloud/azure/sql-database/inputs.tf +++ b/cloud/azure/sql-database/inputs.tf @@ -28,7 +28,7 @@ variable "filter_tags_custom" { variable "cpu_threshold_warning" { description = "CPU usage in percent (warning threshold)" - default = "" + default = "80" } variable "cpu_threshold_critical" { diff --git a/cloud/azure/storage/README.md b/cloud/azure/storage/README.md index 0849152..7702683 100644 --- a/cloud/azure/storage/README.md +++ b/cloud/azure/storage/README.md @@ -32,20 +32,20 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| authorization_error_requests_threshold_critical | Maximum 
acceptable percent of authorization error requests for a storage | string | `15` | no | +| availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no | +| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| message | Message sent when a monitor is triggered | string | - | yes | -| filter_tags_use_defaults | Use default tagging convention | string | `true` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | -| availability_threshold_critical | Minimum threshold of availability | string | `90` | no | -| successful_requests_threshold_critical | Minimum threshold of successful requests | string | `90` | no | -| latency_threshold_critical | Maximum threshold of latency in ms | string | `1000` | no | -| timeout_error_requests_threshold_critical | Maximum threshold of timeout error requests in percent | string | `35` | no | -| network_error_requests_threshold_critical | Maximum threshold of network error requests in percent | string | `35` | no | -| throttling_error_requests_threshold_critical | Maximum threshold of throttling error requests in percent | string | `50` | no | -| server_other_error_requests_threshold_critical | Maximum threshold of server other error requests in percent | string | `50` | no | -| client_other_error_requests_threshold_critical | Maximum threshold of client other error requests in percent | string | `75` | no | -| authorization_error_requests_threshold_critical | Maximum threshold of authorization error requests in percent | string | `75` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| latency_threshold_critical | Maximum acceptable end to 
end latency (ms) for a storage | string | `1000` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no | +| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no | +| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no | +| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no | Related documentation --------------------- diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index dca299b..53422c8 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -23,17 +23,15 @@ Inputs | conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | -| message | Message sent when a monitor is triggered | string | - | 
yes | -| provider | What is the monitored provider | string | azure | no | -| runtime_errors_threshold_critical | | string | `10` | no | -| runtime_errors_threshold_warning | | string | `0` | no | -| su_utilization_threshold_critical | | string | `80` | no | -| su_utilization_threshold_warning | Monitor specific | string | `60` | no | -| service | What is the monitored service | string | storage | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no | +| runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no | +| su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no | +| su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no | Related documentation ---------------------