diff --git a/cloud/azure/README.md b/cloud/azure/README.md index e6785e7..fd33024 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -206,10 +206,24 @@ Inputs | redis_status_silenced | Groups to mute for Redis status monitor | map | `` | no | | redis_status_time_aggregator | Monitor aggregator for Redis status [available values: min, max or avg] | string | `max` | no | | redis_status_timeframe | Monitor timeframe for Redis status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| servicebus_no_active_connections_message | Custom message for Service Bus no active connections monitor | string | `` | no | +| servicebus_no_active_connections_silenced | Groups to mute for Service Bus no active connections monitor | map | `` | no | +| servicebus_no_active_connections_time_aggregator | Monitor aggregator for Service Bus no active connections [available values: min, max or avg] | string | `max` | no | +| servicebus_no_active_connections_timeframe | Monitor timeframe for Service Bus no active connections [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| servicebus_server_errors_message | Custom message for Service Bus server errors monitor | string | `` | no | +| servicebus_server_errors_silenced | Groups to mute for Service Bus server errors monitor | map | `` | no | +| servicebus_server_errors_threshold_critical | Critical threshold for Service Bus server errors monitor | string | `90` | no | +| servicebus_server_errors_threshold_warning | Warning threshold for Service Bus server errors monitor | string | `50` | no | +| servicebus_server_errors_timeframe | Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | servicebus_status_message | Custom message for Service Bus status monitor | string | `` | no | | servicebus_status_silenced | Groups to mute for Service Bus 
status monitor | map | `` | no | | servicebus_status_time_aggregator | Monitor aggregator for Service Bus status [available values: min, max or avg] | string | `max` | no | | servicebus_status_timeframe | Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| servicebus_user_errors_message | Custom message for Service Bus user errors monitor | string | `` | no | +| servicebus_user_errors_silenced | Groups to mute for Service Bus user errors monitor | map | `` | no | +| servicebus_user_errors_threshold_critical | Critical threshold for Service Bus user errors monitor | string | `90` | no | +| servicebus_user_errors_threshold_warning | Warning threshold for Service Bus user errors monitor | string | `50` | no | +| servicebus_user_errors_timeframe | Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | sqldatabase_cpu_message | Custom message for SQL CPU monitor | string | `` | no | | sqldatabase_cpu_silenced | Groups to mute for SQL CPU monitor | map | `` | no | | sqldatabase_cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | diff --git a/cloud/azure/cosmosdb/inputs.tf b/cloud/azure/cosmosdb/inputs.tf index 40a94cf..e8d04bf 100644 --- a/cloud/azure/cosmosdb/inputs.tf +++ b/cloud/azure/cosmosdb/inputs.tf @@ -22,11 +22,6 @@ variable "delay" { default = 900 } -variable "subscription_id" { - description = "ID of the subscription" - type = "string" -} - # Azure CosmosDB specific variables variable "cosmos_db_4xx_requests_message" { description = "Custom message for Cosmos DB 4xx requests monitor" diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index 5237c0a..fb4b916 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf 
@@ -12,17 +12,17 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" { query = < ${var.cosmos_db_4xx_request_rate_threshold_critical} EOF @@ -55,9 +55,9 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { query = < ${var.cosmos_db_5xx_request_rate_threshold_critical} EOF @@ -90,7 +90,7 @@ resource "datadog_monitor" "cosmos_db_success_no_data" { query = < ${var.cosmos_db_ru_utilization_rate_threshold_critical} EOF diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index b477cd1..8227fa2 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -930,6 +930,86 @@ variable "servicebus_status_timeframe" { default = "last_15m" } +variable "servicebus_no_active_connections_silenced" { + description = "Groups to mute for Service Bus no active connections monitor" + type = "map" + default = {} +} + +variable "servicebus_no_active_connections_message" { + description = "Custom message for Service Bus no active connections monitor" + type = "string" + default = "" +} + +variable "servicebus_no_active_connections_time_aggregator" { + description = "Monitor aggregator for Service Bus no active connections [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "servicebus_no_active_connections_timeframe" { + description = "Monitor timeframe for Service Bus no active connections [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_15m" +} + +variable "servicebus_server_errors_message" { + description = "Custom message for Service Bus server errors monitor" + type = "string" + default = "" +} + +variable "servicebus_server_errors_silenced" { + description = "Groups to mute for Service Bus server errors monitor" + type = "map" + default = {} +} + +variable "servicebus_server_errors_timeframe" { + description = "Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable 
"servicebus_server_errors_threshold_critical" { + description = "Critical threshold for Service Bus server errors monitor" + default = 90 +} + +variable "servicebus_server_errors_threshold_warning" { + description = "Warning threshold for Service Bus server errors monitor" + default = 50 +} + +variable "servicebus_user_errors_message" { + description = "Custom message for Service Bus user errors monitor" + type = "string" + default = "" +} + +variable "servicebus_user_errors_silenced" { + description = "Groups to mute for Service Bus user errors monitor" + type = "map" + default = {} +} + +variable "servicebus_user_errors_timeframe" { + description = "Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "servicebus_user_errors_threshold_critical" { + description = "Critical threshold for Service Bus user errors monitor" + default = 90 +} + +variable "servicebus_user_errors_threshold_warning" { + description = "Warning threshold for Service Bus user errors monitor" + default = 50 +} + # Azure SQL Database specific variables variable "sqldatabase_cpu_silenced" { description = "Groups to mute for SQL CPU monitor" diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index 6738aa6..0716157 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -230,6 +230,23 @@ module "servicebus" { status_message = "${var.servicebus_status_message}" status_timeframe = "${var.servicebus_status_timeframe}" status_time_aggregator = "${var.servicebus_status_time_aggregator}" + + no_active_connections_silenced = "${var.servicebus_no_active_connections_silenced}" + no_active_connections_message = "${var.servicebus_no_active_connections_message}" + no_active_connections_timeframe = "${var.servicebus_no_active_connections_timeframe}" + no_active_connections_time_aggregator = "${var.servicebus_no_active_connections_time_aggregator}" 
+ + server_errors_silenced = "${var.servicebus_server_errors_silenced}" + server_errors_message = "${var.servicebus_server_errors_message}" + server_errors_timeframe = "${var.servicebus_server_errors_timeframe}" + server_errors_threshold_critical = "${var.servicebus_server_errors_threshold_critical}" + server_errors_threshold_warning = "${var.servicebus_server_errors_threshold_warning}" + + user_errors_silenced = "${var.servicebus_user_errors_silenced}" + user_errors_message = "${var.servicebus_user_errors_message}" + user_errors_timeframe = "${var.servicebus_user_errors_timeframe}" + user_errors_threshold_critical = "${var.servicebus_user_errors_threshold_critical}" + user_errors_threshold_warning = "${var.servicebus_user_errors_threshold_warning}" } module "sqldatabase" { diff --git a/cloud/azure/servicebus/README.md b/cloud/azure/servicebus/README.md index 052aab1..8c3fa7e 100644 --- a/cloud/azure/servicebus/README.md +++ b/cloud/azure/servicebus/README.md @@ -16,7 +16,10 @@ module "datadog-monitors-cloud-azure-servicebus" { Creates DataDog monitors with the following checks: -- Service Bus is down +- Service status check +- No active connection +- Server errors rate +- User errors rate ## Inputs @@ -27,12 +30,26 @@ Creates DataDog monitors with the following checks: | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | +| no_active_connections_message | Custom message for Service Bus no active connections monitor | string | `` | no | +| no_active_connections_silenced | Groups to mute for Service Bus no active connections monitor | map | `` | no | +| no_active_connections_time_aggregator | Monitor aggregator for Service Bus no active connections [available values: min, max or avg] | string | `max` | no | +| no_active_connections_timeframe | Monitor timeframe for Service Bus no active connections 
[available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| server_errors_message | Custom message for Service Bus server errors monitor | string | `` | no | +| server_errors_silenced | Groups to mute for Service Bus server errors monitor | map | `` | no | +| server_errors_threshold_critical | Critical threshold for Service Bus server errors monitor | string | `90` | no | +| server_errors_threshold_warning | Warning threshold for Service Bus server errors monitor | string | `50` | no | +| server_errors_timeframe | Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | | status_extra_tags | Extra tags for Service Bus status monitor | list | `` | no | | status_message | Custom message for Service Bus status monitor | string | `` | no | | status_silenced | Groups to mute for Service Bus status monitor | map | `` | no | | status_time_aggregator | Monitor aggregator for Service Bus status [available values: min, max or avg] | string | `max` | no | | status_timeframe | Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| user_errors_message | Custom message for Service Bus user errors monitor | string | `` | no | +| user_errors_silenced | Groups to mute for Service Bus user errors monitor | map | `` | no | +| user_errors_threshold_critical | Critical threshold for Service Bus user errors monitor | string | `90` | no | +| user_errors_threshold_warning | Warning threshold for Service Bus user errors monitor | string | `50` | no | +| user_errors_timeframe | Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | 
`last_5m` | no | ## Outputs diff --git a/cloud/azure/servicebus/inputs.tf b/cloud/azure/servicebus/inputs.tf index 680b606..ca8bbc4 100644 --- a/cloud/azure/servicebus/inputs.tf +++ b/cloud/azure/servicebus/inputs.tf @@ -58,3 +58,83 @@ variable "status_timeframe" { description = "Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" default = "last_15m" } + +variable "no_active_connections_silenced" { + description = "Groups to mute for Service Bus no active connections monitor" + type = "map" + default = {} +} + +variable "no_active_connections_message" { + description = "Custom message for Service Bus no active connections monitor" + type = "string" + default = "" +} + +variable "no_active_connections_time_aggregator" { + description = "Monitor aggregator for Service Bus no active connections [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "no_active_connections_timeframe" { + description = "Monitor timeframe for Service Bus no active connections [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_15m" +} + +variable "server_errors_message" { + description = "Custom message for Service Bus server errors monitor" + type = "string" + default = "" +} + +variable "server_errors_silenced" { + description = "Groups to mute for Service Bus server errors monitor" + type = "map" + default = {} +} + +variable "server_errors_timeframe" { + description = "Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "server_errors_threshold_critical" { + description = "Critical threshold for Service Bus server errors monitor" + default = 90 +} + +variable "server_errors_threshold_warning" { + description = "Warning threshold for Service Bus server errors monitor" + default = 50 +} + +variable "user_errors_message" 
{ + description = "Custom message for Service Bus user errors monitor" + type = "string" + default = "" +} + +variable "user_errors_silenced" { + description = "Groups to mute for Service Bus user errors monitor" + type = "map" + default = {} +} + +variable "user_errors_timeframe" { + description = "Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "user_errors_threshold_critical" { + description = "Critical threshold for Service Bus user errors monitor" + default = 90 +} + +variable "user_errors_threshold_warning" { + description = "Warning threshold for Service Bus user errors monitor" + default = 50 +} diff --git a/cloud/azure/servicebus/monitors-service-bus.tf b/cloud/azure/servicebus/monitors-service-bus.tf index dc9d1ea..79acaf2 100644 --- a/cloud/azure/servicebus/monitors-service-bus.tf +++ b/cloud/azure/servicebus/monitors-service-bus.tf @@ -24,3 +24,98 @@ EOF tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:servicebus", "team:claranet", "created-by:terraform", "${var.status_extra_tags}"] } + +resource "datadog_monitor" "service_bus_no_active_connections" { + name = "[${var.environment}] Service Bus has no active connection" + message = "${coalesce(var.no_active_connections_message, var.message)}" + + query = < ${var.user_errors_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.user_errors_threshold_critical}" + warning = "${var.user_errors_threshold_warning}" + } + + silenced = "${var.user_errors_silenced}" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + + tags = ["env:${var.environment}", "resource:servicebus", "team:azure", "provider:azure"] +} + +resource 
"datadog_monitor" "service_bus_server_errors" { + name = "[${var.environment}] Service Bus server errors rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.server_errors_message, var.message)}" + + query = < ${var.server_errors_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.server_errors_threshold_critical}" + warning = "${var.server_errors_threshold_warning}" + } + + silenced = "${var.server_errors_silenced}" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + + tags = ["env:${var.environment}", "resource:servicebus", "team:azure", "provider:azure"] +}