MON-78 Add datadog monitor for stream analytics
This commit is contained in:
parent
ac6e0d69e1
commit
e4e929ec1d
44
cloud/azure/stream-analytics/inputs.tf
Normal file
44
cloud/azure/stream-analytics/inputs.tf
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
# Notification handle interpolated into the alert and recovery sections of
# every monitor message in this module. Required: no default is provided.
variable "hno_escalation_group" {
}
|
||||||
|
# Second notification handle. Required: no default is provided.
# NOTE(review): none of the monitors visible in this module reference this
# variable -- confirm whether it is meant for warning notifications.
variable "ho_escalation_group" {
}
|
||||||
|
|
||||||
|
# Environment label interpolated at the start of every monitor name.
# Required: no default is provided.
variable "environment" {
}
|
||||||
|
|
||||||
|
# Value forwarded to the `notify_no_data` argument of every monitor.
# Defaults to the string "false".
variable "notify_no_data" {
  default = "false"
}
|
||||||
|
|
||||||
|
# Value forwarded to both `evaluation_delay` and `new_host_delay` on every
# monitor. NOTE(review): presumably expressed in seconds -- confirm against
# the Datadog provider documentation.
variable "delay" {
  default = "600"
}
|
||||||
|
|
||||||
|
# Warning threshold of the SU utilization monitor.
variable "su_utilization_warning" {
  default = 60
}
|
||||||
|
|
||||||
|
# Critical threshold of the SU utilization monitor. The same value is
# interpolated into that monitor's name and query.
variable "su_utilization_critical" {
  default = 80
}
|
||||||
|
|
||||||
|
# Warning threshold of the failed function requests monitor.
variable "failed_function_requests_warning" {
  default = 0
}
|
||||||
|
|
||||||
|
# Critical threshold of the failed function requests monitor. The same value
# is interpolated into that monitor's name and query.
variable "failed_function_requests_critical" {
  default = 10
}
|
||||||
|
|
||||||
|
# Warning threshold of the conversion errors monitor.
variable "conversion_errors_warning" {
  default = 0
}
|
||||||
|
|
||||||
|
# Critical threshold of the conversion errors monitor. The same value is
# interpolated into that monitor's name and query.
variable "conversion_errors_critical" {
  default = 10
}
|
||||||
|
|
||||||
|
# Warning threshold of the runtime errors monitor.
variable "runtime_errors_warning" {
  default = 0
}
|
||||||
|
|
||||||
|
# Critical threshold of the runtime errors monitor. The same value is
# interpolated into that monitor's name and query.
#
# The default used to be 0, identical to the default of
# `runtime_errors_warning`. With a `>` comparison the warning threshold has to
# sit strictly below the critical one, so the default pair was degenerate
# (the warning state could never be reached and the monitor definition is
# expected to be rejected by Datadog). 10 matches the other `*_critical`
# error thresholds declared in this file.
variable "runtime_errors_critical" {
  default = 10
}
|
||||||
92
cloud/azure/stream-analytics/monitors-stream-analytics.tf
Normal file
92
cloud/azure/stream-analytics/monitors-stream-analytics.tf
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
# Datadog monitor on the `resource_utilization` metric of Azure Stream
# Analytics streaming jobs, evaluated per {name, resource_group} group.
# Goes critical when the 5-minute average exceeds
# `var.su_utilization_critical`; `var.su_utilization_warning` is the warning
# threshold.
resource "datadog_monitor" "SU_utilization" {
  # The environment bracket is closed right after the environment label, and
  # the title identifies the job through the `name` group of the query.
  # The previous title used {{host.identifier}}, but the query has no host
  # dimension (it groups by name and resource_group).
  name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name.name}}"

  # Only alert and recovery transitions carry a notification handle.
  # NOTE(review): no handle is attached to the warning state -- confirm
  # whether var.ho_escalation_group was meant to be used for it.
  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"

  query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}"
  type  = "query alert"

  notify_no_data      = "${var.notify_no_data}"
  evaluation_delay    = "${var.delay}"
  renotify_interval   = 60
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = "${var.delay}"
  no_data_timeframe   = 20

  # The critical value must stay equal to the one interpolated in `query`.
  thresholds {
    warning  = "${var.su_utilization_warning}"
    critical = "${var.su_utilization_critical}"
  }
}
|
||||||
|
|
||||||
|
# Datadog monitor on the `aml_callout_failed_requests` metric of Azure Stream
# Analytics streaming jobs, evaluated per {name, resource_group} group.
# Goes critical when the 5-minute average exceeds
# `var.failed_function_requests_critical`;
# `var.failed_function_requests_warning` is the warning threshold.
resource "datadog_monitor" "failed_function_requests" {
  # The environment bracket is closed right after the environment label, and
  # the title identifies the job through the `name` group of the query.
  # The previous title used {{host.identifier}}, but the query has no host
  # dimension (it groups by name and resource_group).
  name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on {{name.name}}"

  # Only alert and recovery transitions carry a notification handle.
  # NOTE(review): no handle is attached to the warning state -- confirm
  # whether var.ho_escalation_group was meant to be used for it.
  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"

  query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}"
  type  = "query alert"

  notify_no_data      = "${var.notify_no_data}"
  evaluation_delay    = "${var.delay}"
  renotify_interval   = 60
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = "${var.delay}"
  no_data_timeframe   = 20

  # The critical value must stay equal to the one interpolated in `query`.
  thresholds {
    warning  = "${var.failed_function_requests_warning}"
    critical = "${var.failed_function_requests_critical}"
  }
}
|
||||||
|
|
||||||
|
# Datadog monitor on the `conversion_errors` metric of Azure Stream Analytics
# streaming jobs, evaluated per {name, resource_group} group.
# Goes critical when the 5-minute average exceeds
# `var.conversion_errors_critical`; `var.conversion_errors_warning` is the
# warning threshold.
resource "datadog_monitor" "conversion_errors" {
  # The environment bracket is closed right after the environment label, and
  # the title identifies the job through the `name` group of the query.
  # The previous title used {{host.identifier}}, but the query has no host
  # dimension (it groups by name and resource_group).
  name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name.name}}"

  # Only alert and recovery transitions carry a notification handle.
  # NOTE(review): no handle is attached to the warning state -- confirm
  # whether var.ho_escalation_group was meant to be used for it.
  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"

  query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}"
  type  = "query alert"

  notify_no_data      = "${var.notify_no_data}"
  evaluation_delay    = "${var.delay}"
  renotify_interval   = 60
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = "${var.delay}"
  no_data_timeframe   = 20

  # The critical value must stay equal to the one interpolated in `query`.
  thresholds {
    warning  = "${var.conversion_errors_warning}"
    critical = "${var.conversion_errors_critical}"
  }
}
|
||||||
|
|
||||||
|
# Datadog monitor on the `errors` metric of Azure Stream Analytics streaming
# jobs, evaluated per {name, resource_group} group.
# Goes critical when the 5-minute average exceeds
# `var.runtime_errors_critical`; `var.runtime_errors_warning` is the warning
# threshold.
resource "datadog_monitor" "runtime_errors" {
  # The environment bracket is closed right after the environment label, and
  # the title identifies the job through the `name` group of the query.
  # The previous title used {{host.identifier}}, but the query has no host
  # dimension (it groups by name and resource_group).
  name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name.name}}"

  # Only alert and recovery transitions carry a notification handle.
  # NOTE(review): no handle is attached to the warning state -- confirm
  # whether var.ho_escalation_group was meant to be used for it.
  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"

  query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}"
  type  = "query alert"

  notify_no_data      = "${var.notify_no_data}"
  evaluation_delay    = "${var.delay}"
  renotify_interval   = 60
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = "${var.delay}"
  no_data_timeframe   = 20

  # The critical value must stay equal to the one interpolated in `query`.
  thresholds {
    warning  = "${var.runtime_errors_warning}"
    critical = "${var.runtime_errors_critical}"
  }
}
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user