MON-78 Normalize monitors & add status monitor

This commit is contained in:
Laurent Piroelle 2017-11-23 15:51:23 +01:00
parent 31f033c35d
commit 8afae8b5f4
2 changed files with 53 additions and 48 deletions

View File

@ -4,27 +4,6 @@ variable "environment" {
type = "string" type = "string"
} }
# Text delivered with a monitor's notification when that monitor triggers.
# NOTE(review): another `variable "message"` declaration is visible a few
# lines further down in this view — confirm only one declaration is kept.
variable "message" {
description = "Message sent when a monitor is triggered"
}
# Azure account identifier supplied by the caller (no default, so it is
# required); described as the value monitors are filtered on.
variable "subscription_id" {
  type        = "string"
  description = "Azure account id used as filter for monitors"
}
# Cloud provider label for this set of monitors; falls back to "azure"
# when the caller does not set it.
variable "provider" {
  type        = "string"
  default     = "azure"
  description = "Cloud provider which the monitor and its based metric depend on"
}
# Name of the service this set of monitors covers; falls back to "storage"
# when the caller does not set it.
variable "service" {
  type        = "string"
  default     = "storage"
  description = "Service monitored by this set of monitors"
}
# Global DataDog # Global DataDog
variable "message" { variable "message" {
description = "Message sent when a Redis monitor is triggered" description = "Message sent when a Redis monitor is triggered"
@ -35,11 +14,16 @@ variable "delay" {
default = 600 default = 600
} }
variable "use_filter_tags" { variable "filter_tags_use_defaults" {
description = "Filter the data with service tags if true" description = "Use default filter tags convention"
default = "true" default = "true"
} }
# Custom tag filter injected into the monitor queries when
# filter_tags_use_defaults is not "true".
# The "*" default is presumably Datadog's match-everything scope — confirm.
variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}
# Azure Stream Analytics specific # Azure Stream Analytics specific
variable "su_utilization_threshold_warning" { variable "su_utilization_threshold_warning" {
description = "Streaming Unit utilization rate limit (warning threshold)" description = "Streaming Unit utilization rate limit (warning threshold)"
@ -56,7 +40,7 @@ variable "function_requests_threshold_warning" {
default = 0 default = 0
} }
variable "function_requests_threshold_critical" { variable "failed_function_requests_threshold_critical" {
description = "Failed Function Request rate limit (critical threshold)" description = "Failed Function Request rate limit (critical threshold)"
default = 10 default = 10
} }

View File

@ -2,12 +2,35 @@ data "template_file" "filter" {
template = "$${filter}" template = "$${filter}"
vars { vars {
filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_streamanalytics:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}"
} }
} }
# Status monitor for Stream Analytics streaming jobs.
# Triggers when the 5-minute average of the `status` metric, grouped by
# name and resource_group, drops below 1 for the filtered scope.
resource "datadog_monitor" "status" {
  # Identity and notification text.
  name    = "[${var.environment}] Stream Analytics Status is not ok on {{name}}"
  type    = "metric alert"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]

  # Heredoc body and terminator stay at column 0 so the rendered query
  # carries no extra leading whitespace.
  query = <<EOF
avg(last_5m):avg:azure.streamanalytics_streamingjobs.status{${data.template_file.filter.rendered}} by {name,resource_group} < 1
EOF

  # Missing data is itself reported for this monitor.
  notify_no_data    = true
  no_data_timeframe = 20

  # Evaluation timing: both delays come from the shared `delay` variable.
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour.
  renotify_interval = 0
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
resource "datadog_monitor" "su_utilization" { resource "datadog_monitor" "su_utilization" {
name = "[${var.environment}] Stram Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" name = "[${var.environment}] Stream Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}"
message = "${var.message}" message = "${var.message}"
query = <<EOF query = <<EOF
@ -15,11 +38,11 @@ resource "datadog_monitor" "su_utilization" {
avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group} avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.su_utilization_threshold_critical} ) > ${var.su_utilization_threshold_critical}
EOF EOF
type = "query alert" type = "metric alert"
notify_no_data = "${var.notify_no_data}" notify_no_data = false
evaluation_delay = "${var.delay}" evaluation_delay = "${var.delay}"
renotify_interval = 60 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
@ -32,22 +55,22 @@ resource "datadog_monitor" "su_utilization" {
critical = "${var.su_utilization_threshold_critical}" critical = "${var.su_utilization_threshold_critical}"
} }
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
} }
resource "datadog_monitor" "failed_function_requests" { resource "datadog_monitor" "failed_function_requests" {
name = "[${var.environment}] Stream Analytics more than ${var.function_requests_threshold_critical} failed function requests on {{name}}" name = "[${var.environment}] Stream Analytics more than ${var.failed_function_requests_threshold_critical} failed function requests on {{name}}"
message = "${var.message}" message = "${var.message}"
query = <<EOF query = <<EOF
avg(last_5m): ( avg(last_5m): (
avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() /
avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count()
) * 100 > ${var.function_requests_threshold_critical} ) * 100 > ${var.failed_function_requests_threshold_critical}
EOF EOF
type = "query alert" type = "metric alert"
notify_no_data = "${var.notify_no_data}" notify_no_data = false
evaluation_delay = "${var.delay}" evaluation_delay = "${var.delay}"
renotify_interval = 60 renotify_interval = 60
notify_audit = false notify_audit = false
@ -59,27 +82,26 @@ resource "datadog_monitor" "failed_function_requests" {
no_data_timeframe = 20 no_data_timeframe = 20
thresholds { thresholds {
warning = "${var.function_requests_threshold_warning}" warning = "${var.function_requests_threshold_warning}"
critical = "${var.function_requests_threshold_critical}" critical = "${var.failed_function_requests_threshold_critical}"
} }
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
} }
resource "datadog_monitor" "conversion_errors" { resource "datadog_monitor" "conversion_errors" {
name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}"
# Hard Coded Message while we don't know how to configure warning and critical thresholds message = "${var.message}"
message = "@FR-CloudPublic-run@fr.clara.net"
query = <<EOF query = <<EOF
avg(last_5m): ( avg(last_5m): (
avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group} avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.conversion_errors_threshold_critical} ) > ${var.conversion_errors_threshold_critical}
EOF EOF
type = "query alert" type = "metric alert"
notify_no_data = "${var.notify_no_data}" notify_no_data = false
evaluation_delay = "${var.delay}" evaluation_delay = "${var.delay}"
renotify_interval = 60 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
@ -92,24 +114,23 @@ resource "datadog_monitor" "conversion_errors" {
critical = "${var.conversion_errors_threshold_critical}" critical = "${var.conversion_errors_threshold_critical}"
} }
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
} }
resource "datadog_monitor" "runtime_errors" { resource "datadog_monitor" "runtime_errors" {
name = "[${var.environment}] Stream Analytics more than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" name = "[${var.environment}] Stream Analytics more than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}"
# Hard Coded Message while we don't know how to configure warning and critical thresholds message = "${var.message}"
message = "@FR-CloudPublic-run@fr.clara.net"
query = <<EOF query = <<EOF
avg(last_5m): ( avg(last_5m): (
avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group} avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.runtime_errors_threshold_critical} ) > ${var.runtime_errors_threshold_critical}
EOF EOF
type = "query alert" type = "metric alert"
notify_no_data = "${var.notify_no_data}" notify_no_data = false
evaluation_delay = "${var.delay}" evaluation_delay = "${var.delay}"
renotify_interval = 60 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
@ -122,5 +143,5 @@ resource "datadog_monitor" "runtime_errors" {
critical = "${var.runtime_errors_threshold_critical}" critical = "${var.runtime_errors_threshold_critical}"
} }
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
} }