MON-80 Rename escalation group variables used in alert messages

This commit is contained in:
Alexandre Gaillet 2017-10-30 15:21:40 +01:00
parent daabb7244a
commit 7f0a0e91cf
2 changed files with 9 additions and 7 deletions

View File

@@ -1,10 +1,12 @@
variable "hno_escalation_group" {} variable "critical_escalation_group" {}
variable "ho_escalation_group" {} variable "warning_escalation_group" {}
variable "environment" {} variable "environment" {}
variable "subscription_id" {} variable "stack" {}
variable "client_name" {}
## IOT hubs ## IOT hubs
variable "delay" { variable "delay" {

View File

@@ -1,6 +1,6 @@
resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_jobs_failed" {
name = "[${var.environment}] Too many jobs failed on {{name}} " name = "[${var.environment}] Too many jobs failed on {{name}} "
message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}"
type = "query alert" type = "query alert"
@@ -24,7 +24,7 @@ resource "datadog_monitor" "too_many_jobs_failed" {
resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" {
name = "[${var.environment}] Too many list_jobs failure on {{name}} " name = "[${var.environment}] Too many list_jobs failure on {{name}} "
message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}"
type = "query alert" type = "query alert"
@@ -48,7 +48,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" {
name = "[${var.environment}] Too many query_jobs failed on {{name}} " name = "[${var.environment}] Too many query_jobs failed on {{name}} "
message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}"
type = "query alert" type = "query alert"
@@ -72,7 +72,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
resource "datadog_monitor" "status" { resource "datadog_monitor" "status" {
name = "[${var.environment}] Status is not ok on {{name}} " name = "[${var.environment}] Status is not ok on {{name}} "
message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1"
type = "query alert" type = "query alert"