MON-385 Monitors for Azure Event Grid

This commit is contained in:
Laurent Piroelle 2019-02-25 11:28:23 +01:00 committed by Quentin Manfroi
parent f39d705a36
commit 6fca9676c7
6 changed files with 359 additions and 0 deletions

View File

@ -101,6 +101,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
- [azure-search](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/azure-search/) - [azure-search](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/azure-search/)
- [cosmosdb](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/cosmosdb/) - [cosmosdb](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/cosmosdb/)
- [datalakestore](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/datalakestore/) - [datalakestore](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/datalakestore/)
- [eventgrid](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventgrid/)
- [eventhub](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventhub/) - [eventhub](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventhub/)
- [iothubs](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/iothubs/) - [iothubs](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/iothubs/)
- [keyvault](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/keyvault/) - [keyvault](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/keyvault/)

View File

@ -0,0 +1,70 @@
# CLOUD AZURE EVENTGRID DataDog monitors
## How to use this module
```
module "datadog-monitors-cloud-azure-eventgrid" {
source = "git::ssh://git@git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors.git//cloud/azure/eventgrid?ref={revision}"
environment = "${var.environment}"
message = "${module.datadog-message-alerting.alerting-message}"
}
```
## Purpose
Creates DataDog monitors with the following checks:
- Event Grid no successful message
- Event Grid too many failed messages
- Event Grid too many unmatched events
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| environment | Architecture environment | string | n/a | yes |
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| failed\_messages\_rate\_enabled | Flag to enable Event Grid failed messages monitor | string | `"true"` | no |
| failed\_messages\_rate\_extra\_tags | Extra tags for Event Grid failed messages monitor | list | `[]` | no |
| failed\_messages\_rate\_message | Custom message for Event Grid failed messages monitor | string | `""` | no |
| failed\_messages\_rate\_silenced | Groups to mute for Event Grid failed messages monitor | map | `{}` | no |
| failed\_messages\_rate\_thresold\_critical | Failed messages ratio (percentage) to trigger the critical alert | string | `"90"` | no |
| failed\_messages\_rate\_thresold\_warning | Failed messages ratio (percentage) to trigger a warning alert | string | `"50"` | no |
| failed\_messages\_rate\_time\_aggregator | Monitor aggregator for Event Grid failed messages [available values: min, max or avg] | string | `"min"` | no |
| failed\_messages\_rate\_timeframe | Monitor timeframe for Event Grid failed messages [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| message | Message sent when an alert is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| no\_successful\_message\_rate\_enabled | Flag to enable Event Grid no successful message monitor | string | `"true"` | no |
| no\_successful\_message\_rate\_extra\_tags | Extra tags for Event Grid no successful message monitor | list | `[]` | no |
| no\_successful\_message\_rate\_message | Custom message for Event Grid no successful message monitor | string | `""` | no |
| no\_successful\_message\_rate\_silenced | Groups to mute for Event Grid no successful message monitor | map | `{}` | no |
| no\_successful\_message\_rate\_time\_aggregator | Monitor aggregator for Event Grid no successful message [available values: min, max or avg] | string | `"min"` | no |
| no\_successful\_message\_rate\_timeframe | Monitor timeframe for Event Grid no successful message [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| unmatched\_events\_rate\_enabled | Flag to enable Event Grid unmatched events monitor | string | `"true"` | no |
| unmatched\_events\_rate\_extra\_tags | Extra tags for Event Grid unmatched events monitor | list | `[]` | no |
| unmatched\_events\_rate\_message | Custom message for Event Grid unmatched events monitor | string | `""` | no |
| unmatched\_events\_rate\_silenced | Groups to mute for Event Grid unmatched events monitor | map | `{}` | no |
| unmatched\_events\_rate\_thresold\_critical | Unmatched events ratio (percentage) to trigger the critical alert | string | `"90"` | no |
| unmatched\_events\_rate\_thresold\_warning | Unmatched events ratio (percentage) to trigger a warning alert | string | `"50"` | no |
| unmatched\_events\_rate\_time\_aggregator | Monitor aggregator for Event Grid unmatched events [available values: min, max or avg] | string | `"min"` | no |
| unmatched\_events\_rate\_timeframe | Monitor timeframe for Event Grid unmatched events [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
## Outputs
| Name | Description |
|------|-------------|
| eventgrid\_failed\_messages\_id | id for monitor eventgrid_failed_messages |
| eventgrid\_no\_successful\_message\_id | id for monitor eventgrid_no_successful_message |
| eventgrid\_unmatched\_events\_id | id for monitor eventgrid_unmatched_events |
## Related documentation
Datadog Azure documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/)
Azure "Monitor event delivery" documentation: [https://docs.microsoft.com/en-us/azure/event-grid/monitor-event-delivery](https://docs.microsoft.com/en-us/azure/event-grid/monitor-event-delivery)
Azure Monitor metrics: [https://docs.microsoft.com/en-us/azure/azure-monitor/platform/metrics-supported#microsofteventgridtopics](https://docs.microsoft.com/en-us/azure/azure-monitor/platform/metrics-supported#microsofteventgridtopics)

View File

@ -0,0 +1,164 @@
# ---------------------------------------------------------------------------
# Module-wide Terraform inputs
# ---------------------------------------------------------------------------

# Required: no default is provided.
variable "environment" {
  type        = "string"
  description = "Architecture environment"
}

# ---------------------------------------------------------------------------
# Module-wide DataDog inputs
# ---------------------------------------------------------------------------

# Required: no default is provided.
variable "message" {
  description = "Message sent when an alert is triggered"
}

variable "evaluation_delay" {
  default     = 900
  description = "Delay in seconds for the metric evaluation"
}

variable "new_host_delay" {
  default     = 300
  description = "Delay in seconds before monitor new resource"
}

# The three filter_tags_* inputs below work together: the custom values are
# only meant to apply when filter_tags_use_defaults is false (see descriptions).
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}

variable "filter_tags_custom_excluded" {
  default     = ""
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
}
# Azure Event Grid specific variables

# Map of groups to mute on the "no successful message" monitor.
# Defaults to an empty map, i.e. nothing is muted.
variable "no_successful_message_rate_silenced" {
  description = "Groups to mute for Event Grid no successful message monitor"
  type        = "map"
  default     = {}
}
# --- Inputs for the "no successful message" monitor -------------------------

# String flag ("true" by default) that switches the monitor on or off.
variable "no_successful_message_rate_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable Event Grid no successful message monitor"
}

# Additional tags for this monitor; none by default.
variable "no_successful_message_rate_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for Event Grid no successful message monitor"
}

# Monitor-specific notification text; empty by default.
variable "no_successful_message_rate_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Event Grid no successful message monitor"
}

# Aggregation function placed at the start of the monitor query.
variable "no_successful_message_rate_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Event Grid no successful message [available values: min, max or avg]"
}

# Evaluation window placed in the monitor query.
variable "no_successful_message_rate_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Event Grid no successful message [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# --- Inputs for the "too many failed messages" monitor ----------------------

# Map of groups to mute; empty by default.
variable "failed_messages_rate_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Event Grid failed messages monitor"
}

# String flag ("true" by default) that switches the monitor on or off.
variable "failed_messages_rate_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable Event Grid failed messages monitor"
}

# Additional tags for this monitor; none by default.
variable "failed_messages_rate_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for Event Grid failed messages monitor"
}

# Monitor-specific notification text; empty by default.
variable "failed_messages_rate_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Event Grid failed messages monitor"
}

# Aggregation function placed at the start of the monitor query.
variable "failed_messages_rate_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Event Grid failed messages [available values: min, max or avg]"
}

# Evaluation window placed in the monitor query.
variable "failed_messages_rate_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Event Grid failed messages [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# Alerting levels, expressed as percentages (critical 90, warning 50 by default).
# NOTE: the "thresold" spelling is part of the public variable names.
variable "failed_messages_rate_thresold_critical" {
  default     = 90
  description = "Failed messages ratio (percentage) to trigger the critical alert"
}

variable "failed_messages_rate_thresold_warning" {
  default     = 50
  description = "Failed messages ratio (percentage) to trigger a warning alert"
}
# --- Inputs for the "too many unmatched events" monitor ---------------------

# Map of groups to mute; empty by default.
variable "unmatched_events_rate_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Event Grid unmatched events monitor"
}

# String flag ("true" by default) that switches the monitor on or off.
variable "unmatched_events_rate_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable Event Grid unmatched events monitor"
}

# Additional tags for this monitor; none by default.
variable "unmatched_events_rate_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for Event Grid unmatched events monitor"
}

# Monitor-specific notification text; empty by default.
variable "unmatched_events_rate_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Event Grid unmatched events monitor"
}

# Aggregation function placed at the start of the monitor query.
variable "unmatched_events_rate_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Event Grid unmatched events [available values: min, max or avg]"
}

# Evaluation window placed in the monitor query.
variable "unmatched_events_rate_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Event Grid unmatched events [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# Alerting levels, expressed as percentages (critical 90, warning 50 by default).
# NOTE: the "thresold" spelling is part of the public variable names.
variable "unmatched_events_rate_thresold_critical" {
  default     = 90
  description = "Unmatched events ratio (percentage) to trigger the critical alert"
}

variable "unmatched_events_rate_thresold_warning" {
  default     = 50
  description = "Unmatched events ratio (percentage) to trigger a warning alert"
}

View File

@ -0,0 +1,9 @@
# Instantiates the shared filter-tags helper for the "eventgrid" resource,
# forwarding the environment and the three filter_tags_* inputs as-is.
module "filter-tags" {
  source = "../../../common/filter-tags"

  resource    = "eventgrid"
  environment = "${var.environment}"

  filter_tags_use_defaults    = "${var.filter_tags_use_defaults}"
  filter_tags_custom          = "${var.filter_tags_custom}"
  filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}"
}

View File

@ -0,0 +1,101 @@
# Monitor on azure.eventgrid_topics.publish_success_count, grouped by
# resource_group, region and name. One instance is created when
# var.no_successful_message_rate_enabled is true, none otherwise.
resource "datadog_monitor" "eventgrid_no_successful_message" {
  count = "${var.no_successful_message_rate_enabled ? 1 : 0}"

  type = "metric alert"
  name = "[${var.environment}] Event Grid no successful message {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # The monitor-specific message is listed first, the module-wide one second.
  message = "${coalesce(var.no_successful_message_rate_message, var.message)}"

  # The `< 0` comparison is only a placeholder condition: what this monitor is
  # meant to report is missing data (see notify_no_data below).
  query = <<EOF
${var.no_successful_message_rate_time_aggregator}(${var.no_successful_message_rate_timeframe}):
avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name} < 0
EOF

  # Notification behaviour
  notify_no_data    = true
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
  silenced          = "${var.no_successful_message_rate_silenced}"

  # Evaluation behaviour
  require_full_window = false
  evaluation_delay    = "${var.evaluation_delay}"
  new_host_delay      = "${var.new_host_delay}"

  # Fixed tags followed by the caller-supplied extra tags.
  tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:eventgrid", "team:claranet", "created-by:terraform", "${var.no_successful_message_rate_extra_tags}"]
}
# Monitor on the share of failed publishes per Event Grid topic (grouped by
# resource_group, region and name). One instance is created when
# var.failed_messages_rate_enabled is true, none otherwise.
resource "datadog_monitor" "eventgrid_failed_messages" {
  count = "${var.failed_messages_rate_enabled ? 1 : 0}"

  type = "metric alert"
  name = "[${var.environment}] Event Grid too many failed messages {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # The monitor-specific message is listed first, the module-wide one second.
  message = "${coalesce(var.failed_messages_rate_message, var.message)}"

  # Ratio, times 100:
  #   publish_fail_count / (publish_success_count + publish_fail_count + unmatched_event_count)
  # wrapped in default(..., 0) and compared against the critical threshold.
  query = <<EOF
${var.failed_messages_rate_time_aggregator}(${var.failed_messages_rate_timeframe}): (default(
avg:azure.eventgrid_topics.publish_fail_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
(avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
avg:azure.eventgrid_topics.publish_fail_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
) * 100, 0)
) > ${var.failed_messages_rate_thresold_critical}
EOF

  # The critical value is the same variable used at the end of the query.
  thresholds {
    warning  = "${var.failed_messages_rate_thresold_warning}"
    critical = "${var.failed_messages_rate_thresold_critical}"
  }

  # Notification behaviour
  notify_no_data    = false
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
  silenced          = "${var.failed_messages_rate_silenced}"

  # Evaluation behaviour
  require_full_window = false
  evaluation_delay    = "${var.evaluation_delay}"
  new_host_delay      = "${var.new_host_delay}"

  # Fixed tags followed by the caller-supplied extra tags.
  tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:eventgrid", "team:claranet", "created-by:terraform", "${var.failed_messages_rate_extra_tags}"]
}
# Monitor on the share of unmatched events per Event Grid topic (grouped by
# resource_group, region and name). One instance is created when
# var.unmatched_events_rate_enabled is true, none otherwise.
resource "datadog_monitor" "eventgrid_unmatched_events" {
  count = "${var.unmatched_events_rate_enabled ? 1 : 0}"

  type = "metric alert"
  name = "[${var.environment}] Event Grid too many unmatched events {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # The monitor-specific message is listed first, the module-wide one second.
  message = "${coalesce(var.unmatched_events_rate_message, var.message)}"

  # Ratio, times 100:
  #   unmatched_event_count / (publish_success_count + publish_fail_count + unmatched_event_count)
  # wrapped in default(..., 0) and compared against the critical threshold.
  query = <<EOF
${var.unmatched_events_rate_time_aggregator}(${var.unmatched_events_rate_timeframe}): (default(
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
(avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
avg:azure.eventgrid_topics.publish_fail_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
) * 100, 0)
) > ${var.unmatched_events_rate_thresold_critical}
EOF

  # The critical value is the same variable used at the end of the query.
  thresholds {
    warning  = "${var.unmatched_events_rate_thresold_warning}"
    critical = "${var.unmatched_events_rate_thresold_critical}"
  }

  # Notification behaviour
  notify_no_data    = false
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
  silenced          = "${var.unmatched_events_rate_silenced}"

  # Evaluation behaviour
  require_full_window = false
  evaluation_delay    = "${var.evaluation_delay}"
  new_host_delay      = "${var.new_host_delay}"

  # Fixed tags followed by the caller-supplied extra tags.
  tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:eventgrid", "team:claranet", "created-by:terraform", "${var.unmatched_events_rate_extra_tags}"]
}

View File

@ -0,0 +1,14 @@
# Splat over the counted resource: yields the ids of all created instances.
output "eventgrid_no_successful_message_id" {
  value       = "${datadog_monitor.eventgrid_no_successful_message.*.id}"
  description = "id for monitor eventgrid_no_successful_message"
}
# Splat over the counted resource: yields the ids of all created instances.
output "eventgrid_failed_messages_id" {
  value       = "${datadog_monitor.eventgrid_failed_messages.*.id}"
  description = "id for monitor eventgrid_failed_messages"
}
# Splat over the counted resource: yields the ids of all created instances.
output "eventgrid_unmatched_events_id" {
  value       = "${datadog_monitor.eventgrid_unmatched_events.*.id}"
  description = "id for monitor eventgrid_unmatched_events"
}