MON-385 Monitors for Azure Event Grid
This commit is contained in:
parent
f39d705a36
commit
6fca9676c7
@ -101,6 +101,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
|
||||
- [azure-search](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/azure-search/)
|
||||
- [cosmosdb](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/cosmosdb/)
|
||||
- [datalakestore](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/datalakestore/)
|
||||
- [eventgrid](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventgrid/)
|
||||
- [eventhub](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventhub/)
|
||||
- [iothubs](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/iothubs/)
|
||||
- [keyvault](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/keyvault/)
|
||||
|
||||
70
cloud/azure/eventgrid/README.md
Normal file
70
cloud/azure/eventgrid/README.md
Normal file
@ -0,0 +1,70 @@
|
||||
# CLOUD AZURE EVENTGRID DataDog monitors
|
||||
|
||||
## How to use this module
|
||||
|
||||
```
|
||||
module "datadog-monitors-cloud-azure-eventgrid" {
|
||||
source = "git::ssh://git@git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors.git//cloud/azure/eventgrid?ref={revision}"
|
||||
|
||||
environment = "${var.environment}"
|
||||
message = "${module.datadog-message-alerting.alerting-message}"
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Purpose
|
||||
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
- Event Grid no successful message
|
||||
- Event Grid too many failed messages
|
||||
- Event Grid too many unmatched events
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| environment | Architecture environment | string | n/a | yes |
|
||||
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
|
||||
| failed\_messages\_rate\_enabled | Flag to enable Event Grid failed messages monitor | string | `"true"` | no |
|
||||
| failed\_messages\_rate\_extra\_tags | Extra tags for Event Grid failed messages monitor | list | `[]` | no |
|
||||
| failed\_messages\_rate\_message | Custom message for Event Grid failed messages monitor | string | `""` | no |
|
||||
| failed\_messages\_rate\_silenced | Groups to mute for Event Grid failed messages monitor | map | `{}` | no |
|
||||
| failed\_messages\_rate\_thresold\_critical | Failed messages ratio (percentage) to trigger the critical alert | string | `"90"` | no |
|
||||
| failed\_messages\_rate\_thresold\_warning | Failed messages ratio (percentage) to trigger a warning alert | string | `"50"` | no |
|
||||
| failed\_messages\_rate\_time\_aggregator | Monitor aggregator for Event Grid failed messages [available values: min, max or avg] | string | `"min"` | no |
|
||||
| failed\_messages\_rate\_timeframe | Monitor timeframe for Event Grid failed messages [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
|
||||
| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
|
||||
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
|
||||
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
|
||||
| message | Message sent when an alert is triggered | string | n/a | yes |
|
||||
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
|
||||
| no\_successful\_message\_rate\_enabled | Flag to enable Event Grid no successful message monitor | string | `"true"` | no |
|
||||
| no\_successful\_message\_rate\_extra\_tags | Extra tags for Event Grid no successful message monitor | list | `[]` | no |
|
||||
| no\_successful\_message\_rate\_message | Custom message for Event Grid no successful message monitor | string | `""` | no |
|
||||
| no\_successful\_message\_rate\_silenced | Groups to mute for Event Grid no successful message monitor | map | `{}` | no |
|
||||
| no\_successful\_message\_rate\_time\_aggregator | Monitor aggregator for Event Grid no successful message [available values: min, max or avg] | string | `"min"` | no |
|
||||
| no\_successful\_message\_rate\_timeframe | Monitor timeframe for Event Grid no successful message [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
|
||||
| unmatched\_events\_rate\_enabled | Flag to enable Event Grid unmatched events monitor | string | `"true"` | no |
|
||||
| unmatched\_events\_rate\_extra\_tags | Extra tags for Event Grid unmatched events monitor | list | `[]` | no |
|
||||
| unmatched\_events\_rate\_message | Custom message for Event Grid unmatched events monitor | string | `""` | no |
|
||||
| unmatched\_events\_rate\_silenced | Groups to mute for Event Grid unmatched events monitor | map | `{}` | no |
|
||||
| unmatched\_events\_rate\_thresold\_critical | Unmatched events ratio (percentage) to trigger the critical alert | string | `"90"` | no |
|
||||
| unmatched\_events\_rate\_thresold\_warning | Unmatched events ratio (percentage) to trigger a warning alert | string | `"50"` | no |
|
||||
| unmatched\_events\_rate\_time\_aggregator | Monitor aggregator for Event Grid unmatched events [available values: min, max or avg] | string | `"min"` | no |
|
||||
| unmatched\_events\_rate\_timeframe | Monitor timeframe for Event Grid unmatched events [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
|------|-------------|
|
||||
| eventgrid\_failed\_messages\_id | id for monitor eventgrid_failed_messages |
|
||||
| eventgrid\_no\_successful\_message\_id | id for monitor eventgrid_no_successful_message |
|
||||
| eventgrid\_unmatched\_events\_id | id for monitor eventgrid_unmatched_events |
|
||||
|
||||
## Related documentation
|
||||
|
||||
Datadog Azure documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/)
|
||||
|
||||
Azure "Monitor event delivery" documentation: [https://docs.microsoft.com/en-us/azure/event-grid/monitor-event-delivery](https://docs.microsoft.com/en-us/azure/event-grid/monitor-event-delivery)
|
||||
|
||||
Azure Monitor metrics: [https://docs.microsoft.com/en-us/azure/azure-monitor/platform/metrics-supported#microsofteventgridtopics](https://docs.microsoft.com/en-us/azure/azure-monitor/platform/metrics-supported#microsofteventgridtopics)
|
||||
164
cloud/azure/eventgrid/inputs.tf
Normal file
164
cloud/azure/eventgrid/inputs.tf
Normal file
@ -0,0 +1,164 @@
|
||||
# ---------------------------------------------------------------------------
# Global Terraform
# ---------------------------------------------------------------------------

# Environment name; it is interpolated into the monitor names/tags and
# forwarded to the filter-tags module.
variable "environment" {
  type        = "string"
  description = "Architecture environment"
}

# ---------------------------------------------------------------------------
# Global DataDog
# ---------------------------------------------------------------------------

# Default notification body; each monitor falls back to it when its own
# custom message variable is empty.
variable "message" {
  description = "Message sent when an alert is triggered"
}

# Forwarded unchanged to `evaluation_delay` on every monitor of this module.
variable "evaluation_delay" {
  default     = 900
  description = "Delay in seconds for the metric evaluation"
}

# Forwarded unchanged to `new_host_delay` on every monitor of this module.
variable "new_host_delay" {
  default     = 300
  description = "Delay in seconds before monitor new resource"
}

# The three filter_tags_* variables are handed as-is to the filter-tags module.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}

variable "filter_tags_custom_excluded" {
  default     = ""
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
}
|
||||
|
||||
# Azure Event Grid specific variables
|
||||
# Map handed to the `silenced` argument of the "no successful message" monitor.
# The description previously contained a garbled character sequence; it now
# reads like the descriptions of the other *_silenced variables.
variable "no_successful_message_rate_silenced" {
  description = "Groups to mute for Event Grid no successful message monitor"
  type        = "map"
  default     = {}
}
|
||||
|
||||
# --- "No successful message" monitor settings --------------------------------

# Feature flag: the monitor's `count` is 1 when this is truthy, 0 otherwise.
variable "no_successful_message_rate_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable Event Grid no successful message monitor"
}

# Appended to the fixed tag list of the monitor.
variable "no_successful_message_rate_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for Event Grid no successful message monitor"
}

# When left empty, the monitor uses the module-wide `message` instead.
variable "no_successful_message_rate_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Event Grid no successful message monitor"
}

# Leading aggregation function of the monitor query.
variable "no_successful_message_rate_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Event Grid no successful message [available values: min, max or avg]"
}

# Evaluation window placed between the parentheses of the monitor query.
variable "no_successful_message_rate_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Event Grid no successful message [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
|
||||
|
||||
# --- "Too many failed messages" monitor settings -----------------------------

# Map handed to the monitor's `silenced` argument.
variable "failed_messages_rate_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Event Grid failed messages monitor"
}

# Feature flag: the monitor's `count` is 1 when this is truthy, 0 otherwise.
variable "failed_messages_rate_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable Event Grid failed messages monitor"
}

# Appended to the fixed tag list of the monitor.
variable "failed_messages_rate_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for Event Grid failed messages monitor"
}

# When left empty, the monitor uses the module-wide `message` instead.
variable "failed_messages_rate_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Event Grid failed messages monitor"
}

# Leading aggregation function of the monitor query.
variable "failed_messages_rate_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Event Grid failed messages [available values: min, max or avg]"
}

# Evaluation window placed between the parentheses of the monitor query.
variable "failed_messages_rate_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Event Grid failed messages [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# NOTE: "thresold" (sic) is part of the public variable name; renaming it would
# break existing callers of this module.
variable "failed_messages_rate_thresold_critical" {
  default     = 90
  description = "Failed messages ratio (percentage) to trigger the critical alert"
}

variable "failed_messages_rate_thresold_warning" {
  default     = 50
  description = "Failed messages ratio (percentage) to trigger a warning alert"
}
|
||||
|
||||
# --- "Too many unmatched events" monitor settings ----------------------------

# Map handed to the monitor's `silenced` argument.
variable "unmatched_events_rate_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Event Grid unmatched events monitor"
}

# Feature flag: the monitor's `count` is 1 when this is truthy, 0 otherwise.
variable "unmatched_events_rate_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable Event Grid unmatched events monitor"
}

# Appended to the fixed tag list of the monitor.
variable "unmatched_events_rate_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for Event Grid unmatched events monitor"
}

# When left empty, the monitor uses the module-wide `message` instead.
variable "unmatched_events_rate_message" {
  type        = "string"
  default     = ""
  description = "Custom message for Event Grid unmatched events monitor"
}

# Leading aggregation function of the monitor query.
variable "unmatched_events_rate_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Event Grid unmatched events [available values: min, max or avg]"
}

# Evaluation window placed between the parentheses of the monitor query.
variable "unmatched_events_rate_timeframe" {
  type        = "string"
  default     = "last_5m"
  description = "Monitor timeframe for Event Grid unmatched events [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# NOTE: "thresold" (sic) is part of the public variable name; renaming it would
# break existing callers of this module.
variable "unmatched_events_rate_thresold_critical" {
  default     = 90
  description = "Unmatched events ratio (percentage) to trigger the critical alert"
}

variable "unmatched_events_rate_thresold_warning" {
  default     = 50
  description = "Unmatched events ratio (percentage) to trigger a warning alert"
}
|
||||
9
cloud/azure/eventgrid/modules.tf
Normal file
9
cloud/azure/eventgrid/modules.tf
Normal file
@ -0,0 +1,9 @@
|
||||
# Shared helper module; the monitors in this directory read its `query_alert`
# output and splice it into their queries right after the metric name.
module "filter-tags" {
  source = "../../../common/filter-tags"

  # Fixed identity of this monitor set.
  resource    = "eventgrid"
  environment = "${var.environment}"

  # Caller-controlled filtering options, passed through untouched.
  filter_tags_use_defaults    = "${var.filter_tags_use_defaults}"
  filter_tags_custom          = "${var.filter_tags_custom}"
  filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}"
}
|
||||
101
cloud/azure/eventgrid/monitors-eventgrid.tf
Normal file
101
cloud/azure/eventgrid/monitors-eventgrid.tf
Normal file
@ -0,0 +1,101 @@
|
||||
# Monitor on the Event Grid topic publish-success metric that is meant to
# notify when the metric stops reporting (no-data).
resource "datadog_monitor" "eventgrid_no_successful_message" {
  # Created only when the feature flag evaluates to true.
  count = "${var.no_successful_message_rate_enabled ? 1 : 0}"

  type = "metric alert"
  name = "[${var.environment}] Event Grid no successful message {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # Per-monitor message when set, module-wide message otherwise.
  message = "${coalesce(var.no_successful_message_rate_message, var.message)}"

  # The comparison below is intentionally odd: only the no-data notification
  # (notify_no_data = true) is of interest here. Presumably the "< 0" condition
  # is never satisfied by a publish count -- confirm against the metric docs.
  query = <<EOQ
${var.no_successful_message_rate_time_aggregator}(${var.no_successful_message_rate_timeframe}):
avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name} < 0
EOQ

  silenced = "${var.no_successful_message_rate_silenced}"

  # Evaluation behaviour.
  notify_no_data      = true
  require_full_window = false
  evaluation_delay    = "${var.evaluation_delay}"
  new_host_delay      = "${var.new_host_delay}"

  # Notification behaviour.
  renotify_interval = 0
  timeout_h         = 0
  notify_audit      = false
  include_tags      = true
  locked            = false

  tags = [
    "env:${var.environment}",
    "type:cloud",
    "provider:azure",
    "resource:eventgrid",
    "team:claranet",
    "created-by:terraform",
    "${var.no_successful_message_rate_extra_tags}",
  ]
}
|
||||
|
||||
# Alerts when the share of failed publishes among all counted events
# (success + fail + unmatched) exceeds the configured percentage.
resource "datadog_monitor" "eventgrid_failed_messages" {
  # Created only when the feature flag evaluates to true.
  count = "${var.failed_messages_rate_enabled ? 1 : 0}"

  type = "metric alert"
  name = "[${var.environment}] Event Grid too many failed messages {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # Per-monitor message when set, module-wide message otherwise.
  message = "${coalesce(var.failed_messages_rate_message, var.message)}"

  # fail / (success + fail + unmatched) * 100, wrapped in default(..., 0).
  # The value after ">" is the same variable as thresholds.critical below.
  query = <<EOQ
${var.failed_messages_rate_time_aggregator}(${var.failed_messages_rate_timeframe}): (default(
avg:azure.eventgrid_topics.publish_fail_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
(avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
avg:azure.eventgrid_topics.publish_fail_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
) * 100, 0)
) > ${var.failed_messages_rate_thresold_critical}
EOQ

  thresholds {
    warning  = "${var.failed_messages_rate_thresold_warning}"
    critical = "${var.failed_messages_rate_thresold_critical}"
  }

  silenced = "${var.failed_messages_rate_silenced}"

  # Evaluation behaviour.
  notify_no_data      = false
  require_full_window = false
  evaluation_delay    = "${var.evaluation_delay}"
  new_host_delay      = "${var.new_host_delay}"

  # Notification behaviour.
  renotify_interval = 0
  timeout_h         = 0
  notify_audit      = false
  include_tags      = true
  locked            = false

  tags = [
    "env:${var.environment}",
    "type:cloud",
    "provider:azure",
    "resource:eventgrid",
    "team:claranet",
    "created-by:terraform",
    "${var.failed_messages_rate_extra_tags}",
  ]
}
|
||||
|
||||
# Alerts when the share of unmatched events among all counted events
# (success + fail + unmatched) exceeds the configured percentage.
resource "datadog_monitor" "eventgrid_unmatched_events" {
  # Created only when the feature flag evaluates to true.
  count = "${var.unmatched_events_rate_enabled ? 1 : 0}"

  type = "metric alert"
  name = "[${var.environment}] Event Grid too many unmatched events {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # Per-monitor message when set, module-wide message otherwise.
  message = "${coalesce(var.unmatched_events_rate_message, var.message)}"

  # unmatched / (success + fail + unmatched) * 100, wrapped in default(..., 0).
  # The value after ">" is the same variable as thresholds.critical below.
  query = <<EOQ
${var.unmatched_events_rate_time_aggregator}(${var.unmatched_events_rate_timeframe}): (default(
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
(avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
avg:azure.eventgrid_topics.publish_fail_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
) * 100, 0)
) > ${var.unmatched_events_rate_thresold_critical}
EOQ

  thresholds {
    warning  = "${var.unmatched_events_rate_thresold_warning}"
    critical = "${var.unmatched_events_rate_thresold_critical}"
  }

  silenced = "${var.unmatched_events_rate_silenced}"

  # Evaluation behaviour.
  notify_no_data      = false
  require_full_window = false
  evaluation_delay    = "${var.evaluation_delay}"
  new_host_delay      = "${var.new_host_delay}"

  # Notification behaviour.
  renotify_interval = 0
  timeout_h         = 0
  notify_audit      = false
  include_tags      = true
  locked            = false

  tags = [
    "env:${var.environment}",
    "type:cloud",
    "provider:azure",
    "resource:eventgrid",
    "team:claranet",
    "created-by:terraform",
    "${var.unmatched_events_rate_extra_tags}",
  ]
}
|
||||
14
cloud/azure/eventgrid/outputs.tf
Normal file
14
cloud/azure/eventgrid/outputs.tf
Normal file
@ -0,0 +1,14 @@
|
||||
# Each monitor is declared with `count`, so its id is exported through a
# splat expression (`.*.id`).

output "eventgrid_no_successful_message_id" {
  value       = "${datadog_monitor.eventgrid_no_successful_message.*.id}"
  description = "id for monitor eventgrid_no_successful_message"
}

output "eventgrid_failed_messages_id" {
  value       = "${datadog_monitor.eventgrid_failed_messages.*.id}"
  description = "id for monitor eventgrid_failed_messages"
}

output "eventgrid_unmatched_events_id" {
  value       = "${datadog_monitor.eventgrid_unmatched_events.*.id}"
  description = "id for monitor eventgrid_unmatched_events"
}
|
||||
Loading…
x
Reference in New Issue
Block a user