MON-78 Normalize monitors & add status monitor

This commit is contained in:
Laurent Piroelle 2017-11-23 15:51:23 +01:00
parent 31f033c35d
commit 8afae8b5f4
2 changed files with 53 additions and 48 deletions

View File

@ -4,27 +4,6 @@ variable "environment" {
type = "string"
}
# Notification text attached to the monitors of this module.
variable "message" {
  description = "Message sent when a monitor is triggered"
}
# Azure account identifier used to filter the monitored resources.
variable "subscription_id" {
  type        = "string"
  description = "Azure account id used as filter for monitors"
}
# Cloud provider label; defaults to "azure".
variable "provider" {
  type        = "string"
  default     = "azure"
  description = "Cloud provider which the monitor and its based metric depend on"
}
# Service label for this set of monitors; defaults to "storage".
variable "service" {
  type        = "string"
  default     = "storage"
  description = "Service monitored by this set of monitors"
}
# Global DataDog
variable "message" {
description = "Message sent when a Redis monitor is triggered"
@ -35,11 +14,16 @@ variable "delay" {
default = 600
}
variable "use_filter_tags" {
description = "Filter the data with service tags if true"
variable "filter_tags_use_defaults" {
description = "Use default filter tags convention"
default = "true"
}
# Custom tag filter applied when filter_tags_use_defaults is false.
# The default "*" is passed through to the monitor queries as-is.
variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}
# Azure Stream Analytics specific
variable "su_utilization_threshold_warning" {
description = "Streaming Unit utilization rate limit (warning threshold)"
@ -56,7 +40,7 @@ variable "function_requests_threshold_warning" {
default = 0
}
variable "function_requests_threshold_critical" {
variable "failed_function_requests_threshold_critical" {
description = "Failed Function Request rate limit (critical threshold)"
default = 10
}

View File

@ -2,12 +2,35 @@ data "template_file" "filter" {
template = "$${filter}"
vars {
filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}"
filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_streamanalytics:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}"
}
}
# Status monitor for Stream Analytics jobs.
# Triggers when the 5-minute average of the
# azure.streamanalytics_streamingjobs.status metric is below 1,
# evaluated per {name,resource_group} group and scoped by the rendered
# tag filter.
resource "datadog_monitor" "status" {
  type    = "metric alert"
  name    = "[${var.environment}] Stream Analytics Status is not ok on {{name}}"
  message = "${var.message}"

  # Heredoc content is kept unindented so the query string is unchanged.
  query = <<EOF
avg(last_5m):avg:azure.streamanalytics_streamingjobs.status{${data.template_file.filter.rendered}} by {name,resource_group} < 1
EOF

  # Missing data is reported for this monitor.
  notify_no_data    = true
  no_data_timeframe = 20

  # Evaluation timing: both delays share the module-wide var.delay value.
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour.
  renotify_interval = 0
  timeout_h         = 0
  notify_audit      = false
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "su_utilization" {
name = "[${var.environment}] Stram Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}"
name = "[${var.environment}] Stream Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}"
message = "${var.message}"
query = <<EOF
@ -15,11 +38,11 @@ resource "datadog_monitor" "su_utilization" {
avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.su_utilization_threshold_critical}
EOF
type = "query alert"
type = "metric alert"
notify_no_data = "${var.notify_no_data}"
notify_no_data = false
evaluation_delay = "${var.delay}"
renotify_interval = 60
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
@ -32,22 +55,22 @@ resource "datadog_monitor" "su_utilization" {
critical = "${var.su_utilization_threshold_critical}"
}
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "failed_function_requests" {
name = "[${var.environment}] Stream Analytics more than ${var.function_requests_threshold_critical} failed function requests on {{name}}"
name = "[${var.environment}] Stream Analytics more than ${var.failed_function_requests_threshold_critical} failed function requests on {{name}}"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() /
avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count()
) * 100 > ${var.function_requests_threshold_critical}
) * 100 > ${var.failed_function_requests_threshold_critical}
EOF
type = "query alert"
type = "metric alert"
notify_no_data = "${var.notify_no_data}"
notify_no_data = false
evaluation_delay = "${var.delay}"
renotify_interval = 60
notify_audit = false
@ -59,27 +82,26 @@ resource "datadog_monitor" "failed_function_requests" {
no_data_timeframe = 20
thresholds {
warning = "${var.function_requests_threshold_warning}"
critical = "${var.function_requests_threshold_critical}"
critical = "${var.failed_function_requests_threshold_critical}"
}
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "conversion_errors" {
name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}"
# Hard Coded Message while we don't know how to configure warning and critical thresholds
message = "@FR-CloudPublic-run@fr.clara.net"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.conversion_errors_threshold_critical}
EOF
type = "query alert"
type = "metric alert"
notify_no_data = "${var.notify_no_data}"
notify_no_data = false
evaluation_delay = "${var.delay}"
renotify_interval = 60
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
@ -92,24 +114,23 @@ resource "datadog_monitor" "conversion_errors" {
critical = "${var.conversion_errors_threshold_critical}"
}
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "runtime_errors" {
name = "[${var.environment}] Stream Analytics more than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}"
# Hard Coded Message while we don't know how to configure warning and critical thresholds
message = "@FR-CloudPublic-run@fr.clara.net"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.runtime_errors_threshold_critical}
EOF
type = "query alert"
type = "metric alert"
notify_no_data = "${var.notify_no_data}"
notify_no_data = false
evaluation_delay = "${var.delay}"
renotify_interval = 60
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
@ -122,5 +143,5 @@ resource "datadog_monitor" "runtime_errors" {
critical = "${var.runtime_errors_threshold_critical}"
}
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
}