MON-78 Normalize monitors & add status monitor
This commit is contained in:
parent
31f033c35d
commit
8afae8b5f4
@ -4,27 +4,6 @@ variable "environment" {
|
||||
type = "string"
|
||||
}
|
||||
|
||||
variable "message" {
|
||||
description = "Message sent when a monitor is triggered"
|
||||
}
|
||||
|
||||
variable "subscription_id" {
|
||||
description = "Azure account id used as filter for monitors"
|
||||
type = "string"
|
||||
}
|
||||
|
||||
variable "provider" {
|
||||
description = "Cloud provider which the monitor and its based metric depend on"
|
||||
type = "string"
|
||||
default = "azure"
|
||||
}
|
||||
|
||||
variable "service" {
|
||||
description = "Service monitored by this set of monitors"
|
||||
type = "string"
|
||||
default = "storage"
|
||||
}
|
||||
|
||||
# Global DataDog
|
||||
variable "message" {
|
||||
description = "Message sent when a Redis monitor is triggered"
|
||||
@ -35,11 +14,16 @@ variable "delay" {
|
||||
default = 600
|
||||
}
|
||||
|
||||
variable "use_filter_tags" {
|
||||
description = "Filter the data with service tags if true"
|
||||
variable "filter_tags_use_defaults" {
|
||||
description = "Use default filter tags convention"
|
||||
default = "true"
|
||||
}
|
||||
|
||||
variable "filter_tags_custom" {
|
||||
description = "Tags used for custom filtering when filter_tags_use_defaults is false"
|
||||
default = "*"
|
||||
}
|
||||
|
||||
# Azure Stream Analytics specific
|
||||
variable "su_utilization_threshold_warning" {
|
||||
description = "Streaming Unit utilization rate limit (warning threshold)"
|
||||
@ -56,7 +40,7 @@ variable "function_requests_threshold_warning" {
|
||||
default = 0
|
||||
}
|
||||
|
||||
variable "function_requests_threshold_critical" {
|
||||
variable "failed_function_requests_threshold_critical" {
|
||||
description = "Failed Function Request rate limit (critical threshold)"
|
||||
default = 10
|
||||
}
|
||||
|
||||
@ -2,12 +2,35 @@ data "template_file" "filter" {
|
||||
template = "$${filter}"
|
||||
|
||||
vars {
|
||||
filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}"
|
||||
filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_streamanalytics:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}"
|
||||
}
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "status" {
|
||||
name = "[${var.environment}] Stream Analytics Status is not ok on {{name}}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
avg(last_5m):avg:azure.streamanalytics_streamingjobs.status{${data.template_file.filter.rendered}} by {name,resource_group} < 1
|
||||
EOF
|
||||
type = "metric alert"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "su_utilization" {
|
||||
name = "[${var.environment}] Stram Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}"
|
||||
name = "[${var.environment}] Stream Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -15,11 +38,11 @@ resource "datadog_monitor" "su_utilization" {
|
||||
avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group}
|
||||
) > ${var.su_utilization_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
type = "metric alert"
|
||||
|
||||
notify_no_data = "${var.notify_no_data}"
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 60
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
@ -32,22 +55,22 @@ resource "datadog_monitor" "su_utilization" {
|
||||
critical = "${var.su_utilization_threshold_critical}"
|
||||
}
|
||||
|
||||
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
|
||||
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "failed_function_requests" {
|
||||
name = "[${var.environment}] Stream Analytics more than ${var.function_requests_threshold_critical} failed function requests on {{name}}"
|
||||
name = "[${var.environment}] Stream Analytics more than ${var.failed_function_requests_threshold_critical} failed function requests on {{name}}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
avg(last_5m): (
|
||||
avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() /
|
||||
avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count()
|
||||
) * 100 > ${var.function_requests_threshold_critical}
|
||||
) * 100 > ${var.failed_function_requests_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
type = "metric alert"
|
||||
|
||||
notify_no_data = "${var.notify_no_data}"
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 60
|
||||
notify_audit = false
|
||||
@ -59,27 +82,26 @@ resource "datadog_monitor" "failed_function_requests" {
|
||||
no_data_timeframe = 20
|
||||
thresholds {
|
||||
warning = "${var.function_requests_threshold_warning}"
|
||||
critical = "${var.function_requests_threshold_critical}"
|
||||
critical = "${var.failed_function_requests_threshold_critical}"
|
||||
}
|
||||
|
||||
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
|
||||
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "conversion_errors" {
|
||||
name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}"
|
||||
# Hard Coded Message while we don't know how to configure warning and critical thresholds
|
||||
message = "@FR-CloudPublic-run@fr.clara.net"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
avg(last_5m): (
|
||||
avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group}
|
||||
) > ${var.conversion_errors_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
type = "metric alert"
|
||||
|
||||
notify_no_data = "${var.notify_no_data}"
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 60
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
@ -92,24 +114,23 @@ resource "datadog_monitor" "conversion_errors" {
|
||||
critical = "${var.conversion_errors_threshold_critical}"
|
||||
}
|
||||
|
||||
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
|
||||
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "runtime_errors" {
|
||||
name = "[${var.environment}] Stream Analytics more than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}"
|
||||
# Hard Coded Message while we don't know how to configure warning and critical thresholds
|
||||
message = "@FR-CloudPublic-run@fr.clara.net"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
avg(last_5m): (
|
||||
avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group}
|
||||
) > ${var.runtime_errors_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
type = "metric alert"
|
||||
|
||||
notify_no_data = "${var.notify_no_data}"
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 60
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
@ -122,5 +143,5 @@ resource "datadog_monitor" "runtime_errors" {
|
||||
critical = "${var.runtime_errors_threshold_critical}"
|
||||
}
|
||||
|
||||
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
|
||||
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user