From 6fca9676c72ef6afd47a3a0fe0655232c07ed0bf Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 25 Feb 2019 11:28:23 +0100 Subject: [PATCH] MON-385 Monitors for Azure Event Grid --- README.md | 1 + cloud/azure/eventgrid/README.md | 70 +++++++++ cloud/azure/eventgrid/inputs.tf | 164 ++++++++++++++++++++ cloud/azure/eventgrid/modules.tf | 9 ++ cloud/azure/eventgrid/monitors-eventgrid.tf | 101 ++++++++++++ cloud/azure/eventgrid/outputs.tf | 14 ++ 6 files changed, 359 insertions(+) create mode 100644 cloud/azure/eventgrid/README.md create mode 100644 cloud/azure/eventgrid/inputs.tf create mode 100644 cloud/azure/eventgrid/modules.tf create mode 100644 cloud/azure/eventgrid/monitors-eventgrid.tf create mode 100644 cloud/azure/eventgrid/outputs.tf diff --git a/README.md b/README.md index 68d4d20..8f4a8f2 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [azure-search](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/azure-search/) - [cosmosdb](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/cosmosdb/) - [datalakestore](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/datalakestore/) + - [eventgrid](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventgrid/) - [eventhub](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventhub/) - [iothubs](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/iothubs/) - [keyvault](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/keyvault/) diff --git a/cloud/azure/eventgrid/README.md b/cloud/azure/eventgrid/README.md new file mode 100644 index 
0000000..2a387b2 --- /dev/null +++ b/cloud/azure/eventgrid/README.md @@ -0,0 +1,70 @@ +# CLOUD AZURE EVENTGRID DataDog monitors + +## How to use this module + +``` +module "datadog-monitors-cloud-azure-eventgrid" { + source = "git::ssh://git@git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors.git//cloud/azure/eventgrid?ref={revision}" + + environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" +} + +``` + +## Purpose + +Creates DataDog monitors with the following checks: + +- Event Grid no successful message +- Event Grid too many failed messages +- Event Grid too many unmatched events +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| environment | Architecture environment | string | n/a | yes | +| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no | +| failed\_messages\_rate\_enabled | Flag to enable Event Grid failed messages monitor | string | `"true"` | no | +| failed\_messages\_rate\_extra\_tags | Extra tags for Event Grid failed messages monitor | list | `[]` | no | +| failed\_messages\_rate\_message | Custom message for Event Grid failed messages monitor | string | `""` | no | +| failed\_messages\_rate\_silenced | Groups to mute for Event Grid failed messages monitor | map | `{}` | no | +| failed\_messages\_rate\_thresold\_critical | Failed messages ratio (percentage) to trigger the critical alert | string | `"90"` | no | +| failed\_messages\_rate\_thresold\_warning | Failed messages ratio (percentage) to trigger a warning alert | string | `"50"` | no | +| failed\_messages\_rate\_time\_aggregator | Monitor aggregator for Event Grid failed messages [available values: min, max or avg] | string | `"min"` | no | +| failed\_messages\_rate\_timeframe | Monitor timeframe for Event Grid failed messages [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | 
`"last_5m"` | no | +| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no | +| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | +| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | +| message | Message sent when an alert is triggered | string | n/a | yes | +| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | +| no\_successful\_message\_rate\_enabled | Flag to enable Event Grid no successful message monitor | string | `"true"` | no | +| no\_successful\_message\_rate\_extra\_tags | Extra tags for Event Grid no successful message monitor | list | `[]` | no | +| no\_successful\_message\_rate\_message | Custom message for Event Grid no successful message monitor | string | `""` | no | +| no\_successful\_message\_rate\_silenced | Groups to mute for Event Grid no successful message monitor | map | `{}` | no | +| no\_successful\_message\_rate\_time\_aggregator | Monitor aggregator for Event Grid no successful message [available values: min, max or avg] | string | `"min"` | no | +| no\_successful\_message\_rate\_timeframe | Monitor timeframe for Event Grid no successful message [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| unmatched\_events\_rate\_enabled | Flag to enable Event Grid unmatched events monitor | string | `"true"` | no | +| unmatched\_events\_rate\_extra\_tags | Extra tags for Event Grid unmatched events monitor | list | `[]` | no | +| unmatched\_events\_rate\_message | Custom message for Event Grid unmatched events monitor | string | `""` | no | +| unmatched\_events\_rate\_silenced | Groups to mute for Event Grid unmatched events monitor | map | `{}` | no | +| unmatched\_events\_rate\_thresold\_critical | Unmatched events ratio (percentage) to trigger the critical alert 
| string | `"90"` | no | +| unmatched\_events\_rate\_thresold\_warning | Unmatched events ratio (percentage) to trigger a warning alert | string | `"50"` | no | +| unmatched\_events\_rate\_time\_aggregator | Monitor aggregator for Event Grid unmatched events [available values: min, max or avg] | string | `"min"` | no | +| unmatched\_events\_rate\_timeframe | Monitor timeframe for Event Grid unmatched events [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| eventgrid\_failed\_messages\_id | id for monitor eventgrid_failed_messages | +| eventgrid\_no\_successful\_message\_id | id for monitor eventgrid_no_successful_message | +| eventgrid\_unmatched\_events\_id | id for monitor eventgrid_unmatched_events | + +## Related documentation + +Datadog Azure documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) + +Azure "Monitor event delivery" documentation: [https://docs.microsoft.com/en-us/azure/event-grid/monitor-event-delivery](https://docs.microsoft.com/en-us/azure/event-grid/monitor-event-delivery) + +Azure Monitor metrics: [https://docs.microsoft.com/en-us/azure/azure-monitor/platform/metrics-supported#microsofteventgridtopics](https://docs.microsoft.com/en-us/azure/azure-monitor/platform/metrics-supported#microsofteventgridtopics) diff --git a/cloud/azure/eventgrid/inputs.tf b/cloud/azure/eventgrid/inputs.tf new file mode 100644 index 0000000..0776037 --- /dev/null +++ b/cloud/azure/eventgrid/inputs.tf @@ -0,0 +1,164 @@ +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} + +# Global DataDog +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +variable "new_host_delay" { + 
description = "Delay in seconds before monitor new resource" + default = 300 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "filter_tags_custom_excluded" { + description = "Tags excluded for custom filtering when filter_tags_use_defaults is false" + default = "" +} + +# Azure Event Grid specific variables +variable "no_successful_message_rate_silenced" { + description = "Groups to mute for Event Grid no successful message monitor" + type = "map" + default = {} +} + +variable "no_successful_message_rate_enabled" { + description = "Flag to enable Event Grid no successful message monitor" + type = "string" + default = "true" +} + +variable "no_successful_message_rate_extra_tags" { + description = "Extra tags for Event Grid no successful message monitor" + type = "list" + default = [] +} + +variable "no_successful_message_rate_message" { + description = "Custom message for Event Grid no successful message monitor" + type = "string" + default = "" +} + +variable "no_successful_message_rate_time_aggregator" { + description = "Monitor aggregator for Event Grid no successful message [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "no_successful_message_rate_timeframe" { + description = "Monitor timeframe for Event Grid no successful message [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "failed_messages_rate_silenced" { + description = "Groups to mute for Event Grid failed messages monitor" + type = "map" + default = {} +} + +variable "failed_messages_rate_enabled" { + description = "Flag to enable Event Grid failed messages monitor" + type = "string" + default = "true" +} + +variable "failed_messages_rate_extra_tags" { + 
description = "Extra tags for Event Grid failed messages monitor" + type = "list" + default = [] +} + +variable "failed_messages_rate_message" { + description = "Custom message for Event Grid failed messages monitor" + type = "string" + default = "" +} + +variable "failed_messages_rate_time_aggregator" { + description = "Monitor aggregator for Event Grid failed messages [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "failed_messages_rate_timeframe" { + description = "Monitor timeframe for Event Grid failed messages [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "failed_messages_rate_thresold_critical" { + description = "Failed messages ratio (percentage) to trigger the critical alert" + default = 90 +} + +variable "failed_messages_rate_thresold_warning" { + description = "Failed messages ratio (percentage) to trigger a warning alert" + default = 50 +} + +variable "unmatched_events_rate_silenced" { + description = "Groups to mute for Event Grid unmatched events monitor" + type = "map" + default = {} +} + +variable "unmatched_events_rate_enabled" { + description = "Flag to enable Event Grid unmatched events monitor" + type = "string" + default = "true" +} + +variable "unmatched_events_rate_extra_tags" { + description = "Extra tags for Event Grid unmatched events monitor" + type = "list" + default = [] +} + +variable "unmatched_events_rate_message" { + description = "Custom message for Event Grid unmatched events monitor" + type = "string" + default = "" +} + +variable "unmatched_events_rate_time_aggregator" { + description = "Monitor aggregator for Event Grid unmatched events [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "unmatched_events_rate_timeframe" { + description = "Monitor timeframe for Event Grid unmatched events [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 
2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "unmatched_events_rate_thresold_critical" { + description = "Unmatched events ratio (percentage) to trigger the critical alert" + default = 90 +} + +variable "unmatched_events_rate_thresold_warning" { + description = "Unmatched events ratio (percentage) to trigger a warning alert" + default = 50 +} diff --git a/cloud/azure/eventgrid/modules.tf b/cloud/azure/eventgrid/modules.tf new file mode 100644 index 0000000..609d2fb --- /dev/null +++ b/cloud/azure/eventgrid/modules.tf @@ -0,0 +1,9 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "eventgrid" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" +} diff --git a/cloud/azure/eventgrid/monitors-eventgrid.tf b/cloud/azure/eventgrid/monitors-eventgrid.tf new file mode 100644 index 0000000..6e96599 --- /dev/null +++ b/cloud/azure/eventgrid/monitors-eventgrid.tf @@ -0,0 +1,101 @@ +resource "datadog_monitor" "eventgrid_no_successful_message" { + count = "${var.no_successful_message_rate_enabled ? 
1 : 0}" + name = "[${var.environment}] Event Grid no successful message {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.no_successful_message_rate_message, var.message)}" + + # Query is a bit weird, but we only want to check the no-data + query = < ${var.failed_messages_rate_thresold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.failed_messages_rate_thresold_critical}" + warning = "${var.failed_messages_rate_thresold_warning}" + } + + silenced = "${var.failed_messages_rate_silenced}" + + notify_no_data = false + evaluation_delay = "${var.evaluation_delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + new_host_delay = "${var.new_host_delay}" + + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:eventgrid", "team:claranet", "created-by:terraform", "${var.failed_messages_rate_extra_tags}"] +} + +resource "datadog_monitor" "eventgrid_unmatched_events" { + count = "${var.unmatched_events_rate_enabled ? 
1 : 0}" + name = "[${var.environment}] Event Grid too many unmatched events {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.unmatched_events_rate_message, var.message)}" + + query = < ${var.unmatched_events_rate_thresold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.unmatched_events_rate_thresold_critical}" + warning = "${var.unmatched_events_rate_thresold_warning}" + } + + silenced = "${var.unmatched_events_rate_silenced}" + + notify_no_data = false + evaluation_delay = "${var.evaluation_delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + new_host_delay = "${var.new_host_delay}" + + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:eventgrid", "team:claranet", "created-by:terraform", "${var.unmatched_events_rate_extra_tags}"] +} diff --git a/cloud/azure/eventgrid/outputs.tf b/cloud/azure/eventgrid/outputs.tf new file mode 100644 index 0000000..e209934 --- /dev/null +++ b/cloud/azure/eventgrid/outputs.tf @@ -0,0 +1,14 @@ +output "eventgrid_no_successful_message_id" { + description = "id for monitor eventgrid_no_successful_message" + value = "${datadog_monitor.eventgrid_no_successful_message.*.id}" +} + +output "eventgrid_failed_messages_id" { + description = "id for monitor eventgrid_failed_messages" + value = "${datadog_monitor.eventgrid_failed_messages.*.id}" +} + +output "eventgrid_unmatched_events_id" { + description = "id for monitor eventgrid_unmatched_events" + value = "${datadog_monitor.eventgrid_unmatched_events.*.id}" +}