MON-80 Add monitors and update variables

parent 7f0a0e91cf
commit 4c474be541
@@ -1,38 +1,54 @@
-variable "critical_escalation_group" {}
-
-variable "warning_escalation_group" {}
-
 variable "environment" {}

 variable "stack" {}

 variable "client_name" {}

-## IOT hubs
 variable "delay" {
   default = 600
 }

-variable "warning_jobs_failed" {
-  default = 5
+## IOT hubs
+variable "jobs_failed_threshold_warning" {
+  default = 0
 }

-variable "critical_jobs_failed" {
+variable "jobs_failed_threshold_critical" {
   default = 10
 }

-variable "warning_listjobs_failed" {
-  default = 5
+variable "jobs_failed_message" {}
+
+variable "listjobs_failed_threshold_warning" {
+  default = 0
 }

-variable "critical_listjobs_failed" {
+variable "listjobs_failed_threshold_critical" {
   default = 10
 }

-variable "warning_queryjobs_failed" {
-  default = 5
+variable "listjobs_failed_message" {}
+
+variable "queryjobs_failed_threshold_warning" {
+  default = 0
 }

-variable "critical_queryjobs_failed" {
+variable "queryjobs_failed_threshold_critical" {
+  default = 10
+}
+
+variable "queryjobs_failed_message" {}
+
+variable "status_message" {}
+
+variable "total_devices_message" {}
+
+variable "c2d_methods_failed_message" {}
+
+variable "c2d_methods_failed_threshold_warning" {
+  default = 0
+}
+
+variable "c2d_methods_failed_threshold_critical" {
   default = 10
 }
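With this change the module stops hardcoding the escalation template: the old critical_escalation_group/warning_escalation_group inputs disappear, each monitor takes its notification text from a dedicated *_message variable, and the threshold variables are renamed to *_threshold_warning/*_threshold_critical with the warning default dropping from 5 to 0. A minimal sketch of how a caller might wire the renamed inputs up — the module path, handle name, and @-mentions below are assumptions for illustration, not part of this commit:

module "iothub_monitors" {
  # Hypothetical source path; point it at wherever this module lives.
  source = "./modules/iothub-monitors"

  environment = "prod"
  stack       = "iot"
  client_name = "acme"

  # The *_message inputs replace the old inline {{#is_alert}} escalation
  # template, so the caller now decides who is notified on alert and recovery.
  jobs_failed_message        = "{{#is_alert}}\n@iot-oncall\n{{/is_alert}}\n{{#is_recovery}}\n@iot-oncall\n{{/is_recovery}}"
  listjobs_failed_message    = "{{#is_alert}}\n@iot-oncall\n{{/is_alert}}\n{{#is_recovery}}\n@iot-oncall\n{{/is_recovery}}"
  queryjobs_failed_message   = "{{#is_alert}}\n@iot-oncall\n{{/is_alert}}\n{{#is_recovery}}\n@iot-oncall\n{{/is_recovery}}"
  status_message             = "@iot-oncall"
  total_devices_message      = "@iot-oncall"
  c2d_methods_failed_message = "@iot-oncall"

  # Thresholds are failure percentages; module defaults are warning = 0,
  # critical = 10. Override per environment as needed.
  jobs_failed_threshold_warning  = 5
  jobs_failed_threshold_critical = 20
}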
@@ -1,13 +1,13 @@
 resource "datadog_monitor" "too_many_jobs_failed" {
   name = "[${var.environment}] Too many jobs failed on {{name}} "
-  message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
+  message = "${var.jobs_failed_message}"

-  query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}"
+  query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}"
   type = "query alert"

   thresholds {
-    warning = "${var.warning_jobs_failed}"
-    critical = "${var.critical_jobs_failed}"
+    warning = "${var.jobs_failed_threshold_warning}"
+    critical = "${var.jobs_failed_threshold_critical}"
   }

   notify_no_data = false
@@ -24,14 +24,14 @@ resource "datadog_monitor" "too_many_jobs_failed" {

 resource "datadog_monitor" "too_many_list_jobs_failed" {
   name = "[${var.environment}] Too many list_jobs failure on {{name}} "
-  message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
+  message = "${var.listjobs_failed_message}"

-  query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}"
+  query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}"
   type = "query alert"

   thresholds {
-    warning = "${var.warning_listjobs_failed}"
-    critical = "${var.critical_listjobs_failed}"
+    warning = "${var.listjobs_failed_threshold_warning}"
+    critical = "${var.listjobs_failed_threshold_critical}"
   }

   notify_no_data = false
@@ -48,14 +48,14 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {

 resource "datadog_monitor" "too_many_query_jobs_failed" {
   name = "[${var.environment}] Too many query_jobs failed on {{name}} "
-  message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
+  message = "${var.queryjobs_failed_message}"

-  query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}"
+  query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}"
   type = "query alert"

   thresholds {
-    warning = "${var.warning_queryjobs_failed}"
-    critical = "${var.critical_queryjobs_failed}"
+    warning = "${var.queryjobs_failed_threshold_warning}"
+    critical = "${var.queryjobs_failed_threshold_critical}"
   }

   notify_no_data = false
@@ -72,11 +72,54 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {

 resource "datadog_monitor" "status" {
   name = "[${var.environment}] Status is not ok on {{name}} "
-  message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
+  message = "${var.status_message}"

   query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1"
   type = "query alert"

+  notify_no_data = true
+  evaluation_delay = "${var.delay}"
+  renotify_interval = 60
+  notify_audit = false
+  timeout_h = 0
+  include_tags = true
+  locked = false
+  require_full_window = true
+  new_host_delay = "${var.delay}"
+  no_data_timeframe = 20
+}
+
+resource "datadog_monitor" "total_devices" {
+  name = "[${var.environment}] Total devices is wrong on {{name}} "
+  message = "${var.total_devices_message}"
+
+  query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{*} by {name,resource_group} == 0"
+  type = "query alert"
+
+  notify_no_data = true
+  evaluation_delay = "${var.delay}"
+  renotify_interval = 60
+  notify_audit = false
+  timeout_h = 0
+  include_tags = true
+  locked = false
+  require_full_window = true
+  new_host_delay = "${var.delay}"
+  no_data_timeframe = 20
+}
+
+resource "datadog_monitor" "too_many_c2d_methods_failed" {
+  name = "[${var.environment}] Too many c2d methods failure on {{name}} "
+  message = "${var.c2d_methods_failed_message}"
+
+  query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}"
+  type = "query alert"
+
+  thresholds {
+    warning = "${var.c2d_methods_failed_threshold_warning}"
+    critical = "${var.c2d_methods_failed_threshold_critical}"
+  }
+
   notify_no_data = false
   evaluation_delay = "${var.delay}"
   renotify_interval = 60
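All four failure monitors share the same shape: over the last five minutes they compute failed / (failed + succeeded) * 100 and compare it against *_threshold_critical inside the query, with the thresholds block mirroring both bounds. A worked example with assumed counts: 3 failed and 27 successful c2d method calls give 3 / (3 + 27) * 100 = 10, which reaches the default critical threshold of 10; and since the warning default is now 0, any nonzero failure rate already raises a warning. The status and total_devices monitors instead alert on the raw gauge (status < 1, total_devices == 0) and set notify_no_data = true with a 20-minute no-data window, so a hub that stops reporting metrics pages as well.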