MON-77 Normalize monitors

This commit is contained in:
Laurent Piroelle 2017-11-23 16:50:04 +01:00
parent 5df915df51
commit 6c10a32ff3
3 changed files with 18 additions and 32 deletions

View File

@ -33,11 +33,9 @@ Inputs
| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no |
| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no |
| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| provider | Cloud provider being monitored | string | `azure` | no |
| use_filter_tags | Filter the data with service tags if true | string | `true` | no |
| subscription_id | Azure account id used as filter for monitors | string | - | yes |
| service | Service being monitored | string | `storage` | no |
Outputs
-------

View File

@ -4,23 +4,6 @@ variable "environment" {
type = "string"
}
# Azure subscription identifier used to scope the monitors.
# No default is declared, so the caller must supply a value.
variable "subscription_id" {
description = "Azure account id used as filter for monitors"
type = "string"
}
# Name of the cloud provider the monitors relate to; defaults to "azure".
# NOTE(review): appears to be used only for monitor tagging — confirm against the resources.
variable "provider" {
description = "Cloud provider which the monitor and its based metric depend on"
type = "string"
default = "azure"
}
# Name of the service covered by this set of monitors; defaults to "storage".
# NOTE(review): appears to be used only for monitor tagging — confirm against the resources.
variable "service" {
description = "Service monitored by this set of monitors"
type = "string"
default = "storage"
}
# Global DataDog
variable "message" {
description = "Message sent when an alert is triggered"
@ -31,11 +14,16 @@ variable "delay" {
default = 600
}
variable "use_filter_tags" {
description = "Filter the data with service tags if true"
# Switch selecting the default tag-based filter convention for monitor queries.
# The default is the string "true" (not a boolean literal); no explicit type is declared.
# NOTE(review): consumers presumably compare against the string "true" — confirm where it is read.
variable "filter_tags_use_defaults" {
description = "Use default filter tags convention"
default = "true"
}
# Custom filter expression applied when filter_tags_use_defaults is false.
# Defaults to "*"; presumably this matches every source in a Datadog query scope — TODO confirm.
variable "filter_tags_custom" {
description = "Tags used for custom filtering when filter_tags_use_defaults is false"
default = "*"
}
variable "failed_requests_rate_thresold_critical" {
description = "Failed requests ratio (percentage) to trigger the critical alert"
default = 3

View File

@ -2,12 +2,12 @@ data "template_file" "filter" {
template = "$${filter}"
vars {
filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}"
filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}"
}
}
resource "datadog_monitor" "eventhub_status" {
name = "[${var.environment}] Event Hub status"
name = "[${var.environment}] Event Hub status is not ok on {{name}}"
message = "${var.message}"
query = <<EOF
@ -26,11 +26,11 @@ resource "datadog_monitor" "eventhub_status" {
new_host_delay = "${var.delay}"
no_data_timeframe = 20
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "eventhub_failed_requests" {
name = "[${var.environment}] Event Hub failed requests"
name = "[${var.environment}] Event Hub too much failed requests on {{name}}"
message = "${var.message}"
query = <<EOF
@ -41,7 +41,7 @@ resource "datadog_monitor" "eventhub_failed_requests" {
avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {name,resource_group,region}
) > ${var.failed_requests_rate_thresold_critical}
EOF
type = "query alert"
type = "metric alert"
thresholds {
critical = "${var.failed_requests_rate_thresold_critical}"
@ -59,11 +59,11 @@ resource "datadog_monitor" "eventhub_failed_requests" {
new_host_delay = "${var.delay}"
no_data_timeframe = 20
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "eventhub_errors" {
name = "[${var.environment}] Event Hub errors"
name = "[${var.environment}] Event Hub too much errors on {{name}}"
message = "${var.message}"
query = <<EOF
@ -78,7 +78,7 @@ resource "datadog_monitor" "eventhub_errors" {
avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {name,resource_group,region}
) > ${var.errors_rate_thresold_critical}
EOF
type = "query alert"
type = "metric alert"
thresholds {
critical = "${var.errors_rate_thresold_critical}"
@ -96,5 +96,5 @@ resource "datadog_monitor" "eventhub_errors" {
new_host_delay = "${var.delay}"
no_data_timeframe = 20
tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"]
tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"]
}