diff --git a/cloud/azure/storage/README.md b/cloud/azure/storage/README.md new file mode 100644 index 0000000..0849152 --- /dev/null +++ b/cloud/azure/storage/README.md @@ -0,0 +1,58 @@ +Azure Storage DataDog monitors +============================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-storage" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/storage?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates DataDog monitors with the following checks: + +* Service availability +* End to end latency +* Minimum successful requests +* Maximum timeout error requests +* Maximum network error requests +* Maximum throttling error requests +* Maximum server other error requests +* Maximum client other error requests +* Maximum authorization error requests + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| message | Message sent when a monitor is triggered | string | - | yes | +| filter_tags_use_defaults | Use default tagging convention | string | `true` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| availability_threshold_critical | Minimum threshold of availability | string | `90` | no | +| successful_requests_threshold_critical | Minimum threshold of successful requests | string | `90` | no | +| latency_threshold_critical | Maximum threshold of latency in ms | string | `1000` | no | +| timeout_error_requests_threshold_critical | Maximum threshold of timeout error requests in percent | string | `5` | no | +| network_error_requests_threshold_critical | Maximum threshold of network error
requests in percent | string | `5` | no | +| throttling_error_requests_threshold_critical | Maximum threshold of throttling error requests in percent | string | `10` | no | +| server_other_error_requests_threshold_critical | Maximum threshold of server other error requests in percent | string | `10` | no | +| client_other_error_requests_threshold_critical | Maximum threshold of client other error requests in percent | string | `15` | no | +| authorization_error_requests_threshold_critical | Maximum threshold of authorization error requests in percent | string | `15` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_storage/](https://docs.datadoghq.com/integrations/azure_storage/) + +DataDog blog: [https://www.datadoghq.com/blog/monitor-azure-storage-datadog/](https://www.datadoghq.com/blog/monitor-azure-storage-datadog/) + +Azure Storage metrics documentation: [https://docs.microsoft.com/en-us/azure/storage/common/storage-monitor-storage-account](https://docs.microsoft.com/en-us/azure/storage/common/storage-monitor-storage-account) + diff --git a/cloud/azure/storage/inputs.tf b/cloud/azure/storage/inputs.tf new file mode 100644 index 0000000..5c512b5 --- /dev/null +++ b/cloud/azure/storage/inputs.tf @@ -0,0 +1,72 @@ +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} + +# Global DataDog +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Azure Storage specific +variable "availability_threshold_critical" { + description = "Minimum
acceptable percent of availability for a storage" + default = 90 +} + +variable "successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests for a storage" + default = 90 +} + +variable "latency_threshold_critical" { + description = "Maximum acceptable end to end latency (ms) for a storage" + default = 1000 +} + +variable "timeout_error_requests_threshold_critical" { + description = "Maximum acceptable percent of timeout error requests for a storage" + default = 5 +} + +variable "network_error_requests_threshold_critical" { + description = "Maximum acceptable percent of network error requests for a storage" + default = 5 +} + +variable "throttling_error_requests_threshold_critical" { + description = "Maximum acceptable percent of throttling error requests for a storage" + default = 10 +} + +variable "server_other_error_requests_threshold_critical" { + description = "Maximum acceptable percent of server other error requests for a storage" + default = 10 +} + +variable "client_other_error_requests_threshold_critical" { + description = "Maximum acceptable percent of client other error requests for a storage" + default = 15 +} + +variable "authorization_error_requests_threshold_critical" { + description = "Maximum acceptable percent of authorization error requests for a storage" + default = 15 +} + diff --git a/cloud/azure/storage/monitors-azure-storage.tf b/cloud/azure/storage/monitors-azure-storage.tf new file mode 100644 index 0000000..7466798 --- /dev/null +++ b/cloud/azure/storage/monitors-azure-storage.tf @@ -0,0 +1,273 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "availability" { + name = "[${var.environment}] Azure Storage {{name}} unvailability detected" + message = "${var.message}" + + query = < ${var.latency_threshold_critical} +EOF + + thresholds { + critical = "${var.latency_threshold_critical}" + } + + type = "metric alert" + notify_no_data = true + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "timeout_error_requests" { + name = "[${var.environment}] Azure Storage {{value}}% of timeout error requests on {{name}}" + message = "${var.message}" + + query = < ${var.timeout_error_requests_threshold_critical} +EOF + + thresholds { + critical = "${var.timeout_error_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = true + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] +} + + +resource "datadog_monitor" "network_error_requests" { + name = "[${var.environment}] Azure Storage {{value}}% of network error requests on {{name}}" + message = "${var.message}" + + query = < ${var.network_error_requests_threshold_critical} +EOF + + thresholds { + critical = "${var.network_error_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = true + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + 
evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] +} + + +resource "datadog_monitor" "throttling_error_requests" { + name = "[${var.environment}] Azure Storage {{value}}% of throttling error requests on {{name}}" + message = "${var.message}" + + query = < ${var.throttling_error_requests_threshold_critical} +EOF + + thresholds { + critical = "${var.throttling_error_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = true + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] +} + + +resource "datadog_monitor" "server_other_error_requests" { + name = "[${var.environment}] Azure Storage {{value}}% of server_other error requests on {{name}}" + message = "${var.message}" + + query = < ${var.server_other_error_requests_threshold_critical} +EOF + + thresholds { + critical = "${var.server_other_error_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = true + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] +} + + +resource "datadog_monitor" "client_other_error_requests" { + name = "[${var.environment}] Azure Storage {{value}}% of client_other error requests on {{name}}" + message = "${var.message}" + + query = < ${var.client_other_error_requests_threshold_critical} +EOF + + thresholds { + critical = "${var.client_other_error_requests_threshold_critical}" + } + + type = "metric alert" 
+ notify_no_data = true + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] +} + + +resource "datadog_monitor" "authorization_error_requests" { + name = "[${var.environment}] Azure Storage {{value}}% of authorization error requests on {{name}}" + message = "${var.message}" + + query = < ${var.authorization_error_requests_threshold_critical} +EOF + + thresholds { + critical = "${var.authorization_error_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = true + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] +}