diff --git a/cloud/azure/README.md b/cloud/azure/README.md new file mode 100644 index 0000000..5d0cac8 --- /dev/null +++ b/cloud/azure/README.md @@ -0,0 +1,118 @@ +Azure monitors +============== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a set of Azure DataDog monitors for the following components : + +* Azure App Services monitors +* Azure SQL monitors +* Azure Redis monitors +* Azure Event Hub monitors +* Azure Stream Analytics monitors +* Azure Storage monitors +* Azure IOT Hub monitors +* Azure API Management monitors + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | +| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | +| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | +| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | +| appservices_http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | +| appservices_http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | +| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | +| appservices_http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | +| 
appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | +| appservices_http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | +| appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | +| appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | +| appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | +| appservices_response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | +| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | +| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | +| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| iothub_dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | +| iothub_dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | +| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate 
limit (warning threshold) | string | `0` | no | +| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | +| 
iothub_fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| iothub_invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | +| iothub_invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | +| iothub_orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | +| iothub_orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| non_taggable_filter_tags | Tags used for filtering for components without tag support | string | `*` | no | +| redis_evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | +| redis_evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | +| redis_percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | +| redis_percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| redis_server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | +| redis_server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | +| sqldatabase_cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | +| sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | +| sqldatabase_deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | +| sqldatabase_diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | +| 
sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | +| sqldatabase_dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | +| sqldatabase_dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | +| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `15` | no | +| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no | +| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` | no | +| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `1000` | no | +| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no | +| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no | +| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no | +| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no | +| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no | +| streamanalytics_conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | +| streamanalytics_conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | +| streamanalytics_failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no 
| +| streamanalytics_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | +| streamanalytics_runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no | +| streamanalytics_runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no | +| streamanalytics_su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no | +| streamanalytics_su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) + +Azure metrics documentation: [https://docs.microsoft.com/en-us/azure/monitoring-and-diagnostics/monitoring-overview-metrics](https://docs.microsoft.com/en-us/azure/monitoring-and-diagnostics/monitoring-overview-metrics) diff --git a/cloud/azure/apimanagement/README.md b/cloud/azure/apimanagement/README.md new file mode 100644 index 0000000..e59e81a --- /dev/null +++ b/cloud/azure/apimanagement/README.md @@ -0,0 +1,43 @@ +Azure API Management Datadog monitors +===================================== + +How to use this module +---------------------- +``` +module "datadog-monitors-azure-apimanagement" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/apimanagement?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates Datadog monitors with the following checks : + +* Service status +* Failed requests ratio +* Other requests ratio +* Unauthorized requests ratio +* Successful requests ratio + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in 
seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when an API Management monitor is triggered | string | - | yes | +| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | +| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | + +Related documentation +--------------------- + +Azure API Management metrics documentation: [https://docs.microsoft.com/en-us/azure/api-management/api-management-howto-use-azure-monitor](https://docs.microsoft.com/en-us/azure/api-management/api-management-howto-use-azure-monitor) diff --git a/cloud/azure/apimanagement/inputs.tf b/cloud/azure/apimanagement/inputs.tf new file mode 100644 index 0000000..002593e --- /dev/null +++ b/cloud/azure/apimanagement/inputs.tf @@ -0,0 +1,46 @@ +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} + +# Global DataDog +variable "message" { + description = "Message sent when an API Management monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Azure API Management specific +variable 
"failed_requests_threshold_critical" { + description = "Maximum acceptable percent of failed requests" + default = 5 +} + +variable "other_requests_threshold_critical" { + description = "Maximum acceptable percent of other requests" + default = 5 +} + +variable "unauthorized_requests_threshold_critical" { + description = "Maximum acceptable percent of unauthorized requests" + default = 5 +} + +variable "successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests" + default = 90 +} diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf new file mode 100644 index 0000000..2a23126 --- /dev/null +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -0,0 +1,156 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? + format("dd_monitoring:enabled,dd_azure_apimanagement:enabled,env:%s", var.environment) : + "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "apimgt_status" { + name = "[${var.environment}] API Management status is not ok on {{name}}" + message = "${var.message}" + + query = < ${var.failed_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.failed_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_other_requests" { + name = "[${var.environment}] API Management {{name}} too much other requests" + message = "${var.message}" + + query = < ${var.other_requests_threshold_critical} + EOF + + thresholds { + critical = 
"${var.other_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_unauthorized_requests" { + name = "[${var.environment}] API Management {{name}} too much unauthorized requests" + message = "${var.message}" + + query = < ${var.unauthorized_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.unauthorized_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_successful_requests" { + name = "[${var.environment}] API Management {{name}} successful requests rate too low" + message = "${var.message}" + + query = < ${var.response_time_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + warning = "${var.response_time_threshold_warning}" + critical = "${var.response_time_threshold_critical}" + } + + notify_no_data = true # Will notify when no data is received + renotify_interval = 0 + require_full_window = true + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] +} + +# Monitoring App Services memory usage +resource "datadog_monitor" "appservices_memory_usage_count" { + name = "[${var.environment}] App Services memory usage > 
${ceil(var.memory_usage_threshold_critical/1000000)}MiB on {{name}}" + type = "metric alert" + message = "${var.message}" + + query = < ${var.memory_usage_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + warning = "${var.memory_usage_threshold_warning}" + critical = "${var.memory_usage_threshold_critical}" + } + + notify_no_data = true # Will notify when no data is received + renotify_interval = 0 + require_full_window = true + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] +} + +# Monitoring App Services 5xx errors percent +resource "datadog_monitor" "appservices_http_5xx_errors_count" { + name = "[${var.environment}] App Services HTTP 5xx errors is {{value}}% above the limit on {{name}}" + type = "metric alert" + message = "${var.message}" + + query = < ${var.http_5xx_requests_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + warning = "${var.http_5xx_requests_threshold_warning}" + critical = "${var.http_5xx_requests_threshold_critical}" + } + + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 + require_full_window = true + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] +} + +# Monitoring App Services 4xx errors percent +resource "datadog_monitor" "appservices_http_4xx_errors_count" { + name = "[${var.environment}] App Services HTTP 4xx errors is {{value}}% above the limit on {{name}}" + type = "metric alert" + message = "${var.message}" + + query = < ${var.http_4xx_requests_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + warning = "${var.http_4xx_requests_threshold_warning}" + critical = "${var.http_4xx_requests_threshold_critical}" + } + + notify_no_data = 
false # Will NOT notify when no data is received + renotify_interval = 0 + require_full_window = true + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] +} + +# Monitoring App Services HTTP 2xx status pages percent +resource "datadog_monitor" "appservices_http_2xx_status_rate" { + name = "[${var.environment}] App Services HTTP 2xx responses is {{value}}% below the limit on {{name}}" + type = "metric alert" + message = "${var.message}" + + query = < ${var.failed_requests_rate_thresold_critical} + EOF + type = "metric alert" + + thresholds { + critical = "${var.failed_requests_rate_thresold_critical}" + warning = "${var.failed_requests_rate_thresold_warning}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "eventhub_errors" { + name = "[${var.environment}] Event Hub too much errors on {{name}}" + message = "${var.message}" + + query = < ${var.errors_rate_thresold_critical} + EOF + type = "metric alert" + + thresholds { + critical = "${var.errors_rate_thresold_critical}" + warning = "${var.errors_rate_thresold_warning}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"] +} diff --git a/cloud/azure/eventhub/outputs.tf b/cloud/azure/eventhub/outputs.tf new file mode 100644 index 0000000..b9d1822 --- /dev/null +++ b/cloud/azure/eventhub/outputs.tf @@ -0,0 +1,11 @@ +output 
"status_monitor_id" { + value = "${datadog_monitor.eventhub_failed_requests.id}" +} + +output "failed_requests_monitor_id" { + value = "${datadog_monitor.eventhub_status.id}" +} + +output "errors_monitor_id" { + value = "${datadog_monitor.eventhub_errors.id}" +} diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf new file mode 100644 index 0000000..775fc3e --- /dev/null +++ b/cloud/azure/inputs.tf @@ -0,0 +1,397 @@ +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "message" { + description = "Message sent when a monitor is triggered" + type = "string" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "non_taggable_filter_tags" { + description = "Tags used for filtering for components without tag support" + default = "*" +} + +# Azure API Management specific variables +variable "apimanagement_failed_requests_threshold_critical" { + description = "Maximum acceptable percent of failed requests" + default = 5 +} + +variable "apimanagement_other_requests_threshold_critical" { + description = "Maximum acceptable percent of other requests" + default = 5 +} + +variable "apimanagement_unauthorized_requests_threshold_critical" { + description = "Maximum acceptable percent of unauthorized requests" + default = 5 +} + +variable "apimanagement_successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests" + default = 90 +} + +# Azure App Services specific variables +variable "appservices_response_time_threshold_critical" { + default = 0.8 + description = "Alerting threshold in seconds" +} + +variable "appservices_response_time_threshold_warning" { + 
default = 0.4 + description = "Warning threshold in seconds" +} + +variable "appservices_memory_usage_threshold_critical" { + default = 52430000 + description = "Alerting threshold in Mib" +} + +variable "appservices_memory_usage_threshold_warning" { + default = 33550000 + description = "Warning threshold in MiB" +} + +variable "appservices_http_4xx_requests_threshold_critical" { + default = 30 + description = "Maximum critical acceptable percent of 4xx errors" +} + +variable "appservices_http_4xx_requests_threshold_warning" { + default = 15 + description = "Maximum warning acceptable percent of 4xx errors" +} + +variable "appservices_http_5xx_requests_threshold_critical" { + default = 20 + description = "Maximum critical acceptable percent of 5xx errors" +} + +variable "appservices_http_5xx_requests_threshold_warning" { + default = 10 + description = "Maximum warning acceptable percent of 5xx errors" +} + +variable "appservices_http_2xx_requests_threshold_critical" { + default = 90 + description = "Minimum critical acceptable percent of 2xx requests" +} + +variable "appservices_http_2xx_requests_threshold_warning" { + default = 95 + description = "Minimum warning acceptable percent of 2xx requests" +} + +# Azure Event Hub specific variables +variable "eventhub_failed_requests_rate_thresold_critical" { + description = "Failed requests ratio (percentage) to trigger the critical alert" + default = 3 +} + +variable "eventhub_failed_requests_rate_thresold_warning" { + description = "Failed requests ratio (percentage) to trigger a warning alert" + default = 1 +} + +variable "eventhub_errors_rate_thresold_critical" { + description = "Errors ratio (percentage) to trigger the critical alert" + default = 3 +} + +variable "eventhub_errors_rate_thresold_warning" { + description = "Errors ratio (percentage) to trigger a warning alert" + default = 1 +} + +# IOT Hub specific variables +variable "iothub_failed_jobs_rate_threshold_warning" { + description = "Jobs Failed rate limit 
(warning threshold)" + default = 0 +} + +variable "iothub_failed_jobs_rate_threshold_critical" { + description = "Jobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_listjobs_rate_threshold_warning" { + description = "ListJobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_listjobs_rate_threshold_critical" { + description = "ListJobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_queryjobs_rate_threshold_warning" { + description = "QueryJobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_queryjobs_rate_threshold_critical" { + description = "QueryJobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_c2d_methods_rate_threshold_warning" { + description = "C2D Methods Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_c2d_methods_rate_threshold_critical" { + description = "C2D Methods Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_c2d_twin_read_rate_threshold_warning" { + description = "C2D Twin Read Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_c2d_twin_read_rate_threshold_critical" { + description = "C2D Twin Read Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_c2d_twin_update_rate_threshold_warning" { + description = "C2D Twin Update Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_c2d_twin_update_rate_threshold_critical" { + description = "C2D Twin Update Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_d2c_twin_read_rate_threshold_warning" { + description = "D2C Twin Read Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_d2c_twin_read_rate_threshold_critical" { + description = "D2C Twin Read Failed rate limit (critical threshold)" + default 
= 10 +} + +variable "iothub_failed_d2c_twin_update_rate_threshold_warning" { + description = "D2C Twin Update Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_d2c_twin_update_rate_threshold_critical" { + description = "D2C Twin Update Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_dropped_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Dropped limit (warning threshold)" + default = 500 +} + +variable "iothub_dropped_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Dropped limit (critical threshold)" + default = 1000 +} + +variable "iothub_orphaned_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Orphaned limit (warning threshold)" + default = 500 +} + +variable "iothub_orphaned_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Orphaned limit (critical threshold)" + default = 1000 +} + +variable "iothub_invalid_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Invalid limit (warning threshold)" + default = 500 +} + +variable "iothub_invalid_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Invalid limit (critical threshold)" + default = 1000 +} + +variable "iothub_fallback_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Fallback limit (warning threshold)" + default = 500 +} + +variable "iothub_fallback_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Fallback limit (critical threshold)" + default = 1000 +} + +# Azure Redis specific variables +variable "redis_evictedkeys_limit_threshold_warning" { + description = "Evicted keys limit (warning threshold)" + default = 0 +} + +variable "redis_evictedkeys_limit_threshold_critical" { + description = "Evicted keys limit (critical threshold)" + default = 100 +} + +variable "redis_percent_processor_time_threshold_critical" { + description = "Processor time percent 
(critical threshold)" + default = 80 +} + +variable "redis_percent_processor_time_threshold_warning" { + description = "Processor time percent (warning threshold)" + default = 60 +} + +variable "redis_server_load_rate_threshold_critical" { + description = "Server CPU load rate (critical threshold)" + default = 90 +} + +variable "redis_server_load_rate_threshold_warning" { + description = "Server CPU load rate (warning threshold)" + default = 70 +} + +# Azure SQL Database specific variables +variable "sqldatabase_cpu_threshold_warning" { + description = "CPU usage in percent (warning threshold)" + default = "80" +} + +variable "sqldatabase_cpu_threshold_critical" { + description = "CPU usage in percent (critical threshold)" + default = "90" +} + +variable "sqldatabase_diskspace_threshold_warning" { + description = "Disk space used in percent (warning threshold)" + default = "80" +} + +variable "sqldatabase_diskspace_threshold_critical" { + description = "Disk space used in percent (critical threshold)" + default = "90" +} + +variable "sqldatabase_dtu_threshold_warning" { + description = "Amount of DTU used (warning threshold)" + default = "85" +} + +variable "sqldatabase_dtu_threshold_critical" { + description = "Amount of DTU used (critical threshold)" + default = "90" +} + +variable "sqldatabase_deadlock_threshold_critical" { + description = "Amount of Deadlocks (critical threshold)" + default = "1" +} + +# Azure Storage specific variables +variable "storage_availability_threshold_critical" { + description = "Minimum acceptable percent of availability for a storage" + default = 90 +} + +variable "storage_successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests for a storage" + default = 90 +} + +variable "storage_latency_threshold_critical" { + description = "Maximum acceptable end to end latency (ms) for a storage" + default = 1000 +} + +variable "storage_timeout_error_requests_threshold_critical" { + 
description = "Maximum acceptable percent of timeout error requests for a storage" + default = 5 +} + +variable "storage_network_error_requests_threshold_critical" { + description = "Maximum acceptable percent of network error requests for a storage" + default = 5 +} + +variable "storage_throttling_error_requests_threshold_critical" { + description = "Maximum acceptable percent of throttling error requests for a storage" + default = 10 +} + +variable "storage_server_other_error_requests_threshold_critical" { + description = "Maximum acceptable percent of server other error requests for a storage" + default = 10 +} + +variable "storage_client_other_error_requests_threshold_critical" { + description = "Maximum acceptable percent of client other error requests for a storage" + default = 15 +} + +variable "storage_authorization_error_requests_threshold_critical" { + description = "Maximum acceptable percent of authorization error requests for a storage" + default = 15 +} + +# Azure Stream Analytics specific variables +variable "streamanalytics_su_utilization_threshold_warning" { + description = "Streaming Unit utilization rate limit (warning threshold)" + default = 60 +} + +variable "streamanalytics_su_utilization_threshold_critical" { + description = "Streaming Unit utilization rate limit (critical threshold)" + default = 80 +} + +variable "streamanalytics_function_requests_threshold_warning" { + description = "Failed Function Request rate limit (warning threshold)" + default = 0 +} + +variable "streamanalytics_failed_function_requests_threshold_critical" { + description = "Failed Function Request rate limit (critical threshold)" + default = 10 +} + +variable "streamanalytics_conversion_errors_threshold_warning" { + description = "Conversion errors limit (warning threshold)" + default = 0 +} + +variable "streamanalytics_conversion_errors_threshold_critical" { + description = "Conversion errors limit (critical threshold)" + default = 10 +} + +variable 
"streamanalytics_runtime_errors_threshold_warning" { + description = "Runtime errors limit (warning threshold)" + default = 0 +} + +variable "streamanalytics_runtime_errors_threshold_critical" { + description = "Runtime errors limit (critical threshold)" + default = 10 +} diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md new file mode 100644 index 0000000..e594a65 --- /dev/null +++ b/cloud/azure/iothubs/README.md @@ -0,0 +1,75 @@ +Azure IOT Hubs DataDog monitors +=============================== + +How to use this module +---------------------- + +``` +module "iothubs" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Service status check +* Jobs failed average check +* Query Jobs failed average check +* List Jobs failed average check +* Total devices count check +* C2D methods failed average check +* C2D twin read failed average check +* C2D twin update failed average check +* D2C twin read failed average check +* D2C twin update failed average check +* D2C telemetry egress dropped count check +* D2C telemetry egress orphaned count check +* D2C telemetry egress invalid count check +* D2C telemetry egress fallback count check +* D2C telemetry ingress no sent count check + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | +| dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | +| environment | Architecture Environment | string | - | 
yes | +| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| fallback_d2c_telemetry_egress_threshold_critical | D2C 
Telemetry Fallback limit (critical threshold) | string | `1000` | no | +| fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| filter_tags | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | +| invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | +| message | Message sent when an alert is triggered | string | - | yes | +| orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | +| orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub) + +Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf new file mode 100644 index 0000000..68c9965 --- /dev/null +++ b/cloud/azure/iothubs/inputs.tf @@ -0,0 +1,141 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + +# Global DataDog +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags" { + description = "Tags used for filtering" + default = "*" +} + +# Azure IOT hubs specific +variable "failed_jobs_rate_threshold_warning" { + description = "Jobs Failed rate limit (warning 
threshold)" + default = 0 +} + +variable "failed_jobs_rate_threshold_critical" { + description = "Jobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_listjobs_rate_threshold_warning" { + description = "ListJobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_listjobs_rate_threshold_critical" { + description = "ListJobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_queryjobs_rate_threshold_warning" { + description = "QueryJobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_queryjobs_rate_threshold_critical" { + description = "QueryJobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_c2d_methods_rate_threshold_warning" { + description = "C2D Methods Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_c2d_methods_rate_threshold_critical" { + description = "C2D Methods Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_c2d_twin_read_rate_threshold_warning" { + description = "C2D Twin Read Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_c2d_twin_read_rate_threshold_critical" { + description = "C2D Twin Read Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_c2d_twin_update_rate_threshold_warning" { + description = "C2D Twin Update Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_c2d_twin_update_rate_threshold_critical" { + description = "C2D Twin Update Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_d2c_twin_read_rate_threshold_warning" { + description = "D2C Twin Read Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_d2c_twin_read_rate_threshold_critical" { + description = "D2C Twin Read Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_d2c_twin_update_rate_threshold_warning" { + description = "D2C Twin 
Update Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_d2c_twin_update_rate_threshold_critical" { + description = "D2C Twin Update Failed rate limit (critical threshold)" + default = 10 +} + +variable "dropped_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Dropped limit (warning threshold)" + default = 500 +} + +variable "dropped_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Dropped limit (critical threshold)" + default = 1000 +} + +variable "orphaned_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Orphaned limit (warning threshold)" + default = 500 +} + +variable "orphaned_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Orphaned limit (critical threshold)" + default = 1000 +} + +variable "invalid_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Invalid limit (warning threshold)" + default = 500 +} + +variable "invalid_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Invalid limit (critical threshold)" + default = 1000 +} + +variable "fallback_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Fallback limit (warning threshold)" + default = 500 +} + +variable "fallback_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Fallback limit (critical threshold)" + default = 1000 +} diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf new file mode 100644 index 0000000..5040c58 --- /dev/null +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -0,0 +1,462 @@ +resource "datadog_monitor" "too_many_jobs_failed" { + name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" + message = "${var.message}" + + query = < ${var.failed_jobs_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_jobs_rate_threshold_warning}" + critical = 
"${var.failed_jobs_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_list_jobs_failed" { + name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" + message = "${var.message}" + + query = < ${var.failed_listjobs_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_listjobs_rate_threshold_warning}" + critical = "${var.failed_listjobs_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_query_jobs_failed" { + name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" + message = "${var.message}" + + query = < ${var.failed_queryjobs_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_queryjobs_rate_threshold_warning}" + critical = "${var.failed_queryjobs_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "status" { + name = "[${var.environment}] IOT Hub Status is not ok on 
{{name}}" + message = "${var.message}" + + query = < ${var.failed_c2d_methods_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_c2d_methods_rate_threshold_warning}" + critical = "${var.failed_c2d_methods_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_c2d_twin_read_failed" { + name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}" + message = "${var.message}" + + query = < ${var.failed_c2d_twin_read_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_c2d_twin_read_rate_threshold_warning}" + critical = "${var.failed_c2d_twin_read_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_c2d_twin_update_failed" { + name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}" + message = "${var.message}" + + query = < ${var.failed_c2d_twin_update_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_c2d_twin_update_rate_threshold_warning}" + critical = "${var.failed_c2d_twin_update_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = 
false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_twin_read_failed" { + name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}" + message = "${var.message}" + + query = < ${var.failed_d2c_twin_read_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_d2c_twin_read_rate_threshold_warning}" + critical = "${var.failed_d2c_twin_read_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_twin_update_failed" { + name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}" + message = "${var.message}" + + query = < ${var.failed_d2c_twin_update_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_d2c_twin_update_rate_threshold_warning}" + critical = "${var.failed_d2c_twin_update_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}" + message = "${var.message}" + + query = < ${var.dropped_d2c_telemetry_egress_threshold_critical} 
+ EOF + + type = "metric alert" + + thresholds { + warning = "${var.dropped_d2c_telemetry_egress_threshold_warning}" + critical = "${var.dropped_d2c_telemetry_egress_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}" + message = "${var.message}" + + query = < ${var.orphaned_d2c_telemetry_egress_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.orphaned_d2c_telemetry_egress_threshold_warning}" + critical = "${var.orphaned_d2c_telemetry_egress_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}" + message = "${var.message}" + + query = < ${var.invalid_d2c_telemetry_egress_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.invalid_d2c_telemetry_egress_threshold_warning}" + critical = "${var.invalid_d2c_telemetry_egress_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay 
= "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}" + message = "${var.message}" + + query = < ${var.fallback_d2c_telemetry_egress_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.fallback_d2c_telemetry_egress_threshold_warning}" + critical = "${var.fallback_d2c_telemetry_egress_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}" + message = "${var.message}" + + query = < 0 + EOF + + type = "metric alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf new file mode 100644 index 0000000..fc9aeee --- /dev/null +++ b/cloud/azure/monitors.tf @@ -0,0 +1,166 @@ +module "apimanagement" { + source = "./apimanagement" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + failed_requests_threshold_critical = 
"${var.apimanagement_failed_requests_threshold_critical}" + other_requests_threshold_critical = "${var.apimanagement_other_requests_threshold_critical}" + successful_requests_threshold_critical = "${var.apimanagement_successful_requests_threshold_critical}" + unauthorized_requests_threshold_critical = "${var.apimanagement_unauthorized_requests_threshold_critical}" +} + +module "appservices" { + source = "./app-services" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + http_2xx_requests_threshold_critical = "${var.appservices_http_2xx_requests_threshold_critical}" + http_2xx_requests_threshold_warning = "${var.appservices_http_2xx_requests_threshold_warning}" + http_5xx_requests_threshold_critical = "${var.appservices_http_5xx_requests_threshold_critical}" + http_5xx_requests_threshold_warning = "${var.appservices_http_5xx_requests_threshold_warning}" + http_4xx_requests_threshold_critical = "${var.appservices_http_4xx_requests_threshold_critical}" + http_4xx_requests_threshold_warning = "${var.appservices_http_4xx_requests_threshold_warning}" + memory_usage_threshold_critical = "${var.appservices_memory_usage_threshold_critical}" + memory_usage_threshold_warning = "${var.appservices_memory_usage_threshold_warning}" + response_time_threshold_critical = "${var.appservices_response_time_threshold_critical}" + response_time_threshold_warning = "${var.appservices_response_time_threshold_warning}" +} + +module "eventhub" { + source = "./eventhub" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + errors_rate_thresold_critical = "${var.eventhub_errors_rate_thresold_critical}" + errors_rate_thresold_warning = 
"${var.eventhub_errors_rate_thresold_warning}" + failed_requests_rate_thresold_critical = "${var.eventhub_failed_requests_rate_thresold_critical}" + failed_requests_rate_thresold_warning = "${var.eventhub_failed_requests_rate_thresold_warning}" +} + +module "iothub" { + source = "./iothubs" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags = "${var.non_taggable_filter_tags}" + + dropped_d2c_telemetry_egress_threshold_critical = "${var.iothub_dropped_d2c_telemetry_egress_threshold_critical}" + dropped_d2c_telemetry_egress_threshold_warning = "${var.iothub_dropped_d2c_telemetry_egress_threshold_warning}" + failed_c2d_methods_rate_threshold_critical = "${var.iothub_failed_c2d_methods_rate_threshold_critical}" + failed_c2d_methods_rate_threshold_warning = "${var.iothub_failed_c2d_methods_rate_threshold_warning}" + failed_c2d_twin_read_rate_threshold_critical = "${var.iothub_failed_c2d_twin_read_rate_threshold_critical}" + failed_c2d_twin_read_rate_threshold_warning = "${var.iothub_failed_c2d_twin_read_rate_threshold_warning}" + failed_c2d_twin_update_rate_threshold_critical = "${var.iothub_failed_c2d_twin_update_rate_threshold_critical}" + failed_c2d_twin_update_rate_threshold_warning = "${var.iothub_failed_c2d_twin_update_rate_threshold_warning}" + failed_d2c_twin_read_rate_threshold_critical = "${var.iothub_failed_d2c_twin_read_rate_threshold_critical}" + failed_d2c_twin_read_rate_threshold_warning = "${var.iothub_failed_d2c_twin_read_rate_threshold_warning}" + failed_d2c_twin_update_rate_threshold_critical = "${var.iothub_failed_d2c_twin_update_rate_threshold_critical}" + failed_d2c_twin_update_rate_threshold_warning = "${var.iothub_failed_d2c_twin_update_rate_threshold_warning}" + failed_jobs_rate_threshold_critical = "${var.iothub_failed_jobs_rate_threshold_critical}" + failed_jobs_rate_threshold_warning = "${var.iothub_failed_jobs_rate_threshold_warning}" + failed_listjobs_rate_threshold_critical = 
"${var.iothub_failed_listjobs_rate_threshold_critical}" + failed_listjobs_rate_threshold_warning = "${var.iothub_failed_listjobs_rate_threshold_warning}" + failed_queryjobs_rate_threshold_critical = "${var.iothub_failed_queryjobs_rate_threshold_critical}" + failed_queryjobs_rate_threshold_warning = "${var.iothub_failed_queryjobs_rate_threshold_warning}" + fallback_d2c_telemetry_egress_threshold_critical = "${var.iothub_fallback_d2c_telemetry_egress_threshold_critical}" + fallback_d2c_telemetry_egress_threshold_warning = "${var.iothub_fallback_d2c_telemetry_egress_threshold_warning}" + invalid_d2c_telemetry_egress_threshold_critical = "${var.iothub_invalid_d2c_telemetry_egress_threshold_critical}" + invalid_d2c_telemetry_egress_threshold_warning = "${var.iothub_invalid_d2c_telemetry_egress_threshold_warning}" + orphaned_d2c_telemetry_egress_threshold_critical = "${var.iothub_orphaned_d2c_telemetry_egress_threshold_critical}" + orphaned_d2c_telemetry_egress_threshold_warning = "${var.iothub_orphaned_d2c_telemetry_egress_threshold_warning}" +} + +module "redis" { + source = "./redis" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + evictedkeys_limit_threshold_critical = "${var.redis_evictedkeys_limit_threshold_critical}" + evictedkeys_limit_threshold_warning = "${var.redis_evictedkeys_limit_threshold_warning}" + percent_processor_time_threshold_critical = "${var.redis_percent_processor_time_threshold_critical}" + percent_processor_time_threshold_warning = "${var.redis_percent_processor_time_threshold_warning}" + server_load_rate_threshold_critical = "${var.redis_server_load_rate_threshold_critical}" + server_load_rate_threshold_warning = "${var.redis_server_load_rate_threshold_warning}" +} + +module "sqldatabase" { + source = "./sql-database" + + environment = "${var.environment}" + message = 
"${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + cpu_threshold_critical = "${var.sqldatabase_cpu_threshold_critical}" + cpu_threshold_warning = "${var.sqldatabase_cpu_threshold_warning}" + deadlock_threshold_critical = "${var.sqldatabase_deadlock_threshold_critical}" + diskspace_threshold_critical = "${var.sqldatabase_diskspace_threshold_critical}" + diskspace_threshold_warning = "${var.sqldatabase_diskspace_threshold_warning}" + dtu_threshold_critical = "${var.sqldatabase_dtu_threshold_critical}" + dtu_threshold_warning = "${var.sqldatabase_dtu_threshold_warning}" +} + +module "storage" { + source = "./storage" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + authorization_error_requests_threshold_critical = "${var.storage_authorization_error_requests_threshold_critical}" + availability_threshold_critical = "${var.storage_availability_threshold_critical}" + client_other_error_requests_threshold_critical = "${var.storage_client_other_error_requests_threshold_critical}" + latency_threshold_critical = "${var.storage_latency_threshold_critical}" + network_error_requests_threshold_critical = "${var.storage_network_error_requests_threshold_critical}" + server_other_error_requests_threshold_critical = "${var.storage_server_other_error_requests_threshold_critical}" + successful_requests_threshold_critical = "${var.storage_successful_requests_threshold_critical}" + throttling_error_requests_threshold_critical = "${var.storage_throttling_error_requests_threshold_critical}" + timeout_error_requests_threshold_critical = "${var.storage_timeout_error_requests_threshold_critical}" +} + +module "streamanalytics" { + source = "./stream-analytics" + + environment = "${var.environment}" + message = 
"${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + conversion_errors_threshold_critical = "${var.streamanalytics_conversion_errors_threshold_critical}" + conversion_errors_threshold_warning = "${var.streamanalytics_conversion_errors_threshold_warning}" + failed_function_requests_threshold_critical = "${var.streamanalytics_failed_function_requests_threshold_critical}" + function_requests_threshold_warning = "${var.streamanalytics_function_requests_threshold_warning}" + runtime_errors_threshold_critical = "${var.streamanalytics_runtime_errors_threshold_critical}" + runtime_errors_threshold_warning = "${var.streamanalytics_runtime_errors_threshold_warning}" + su_utilization_threshold_critical = "${var.streamanalytics_su_utilization_threshold_critical}" + su_utilization_threshold_warning = "${var.streamanalytics_su_utilization_threshold_warning}" +} diff --git a/cloud/azure/redis/README.md b/cloud/azure/redis/README.md new file mode 100644 index 0000000..4cd7a51 --- /dev/null +++ b/cloud/azure/redis/README.md @@ -0,0 +1,47 @@ +Azure Redis DataDog monitors +============================ + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-redis" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/redis?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Service status check +* Evicted keys count check +* Processor time (percent) threshold +* Server CPU load threshold + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| 
evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | +| evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | +| percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | +| server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_redis_cache/](https://docs.datadoghq.com/integrations/azure_redis_cache/) + +Azure Redis metrics documentation: [https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor](https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor) diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf new file mode 100644 index 0000000..49750fa --- /dev/null +++ b/cloud/azure/redis/inputs.tf @@ -0,0 +1,56 @@ +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} + +# Global DataDog +variable "message" { + description = "Message sent when a Redis monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom 
filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Azure Redis specific +variable "evictedkeys_limit_threshold_warning" { + description = "Evicted keys limit (warning threshold)" + default = 0 +} + +variable "evictedkeys_limit_threshold_critical" { + description = "Evicted keys limit (critical threshold)" + default = 100 +} + +variable "percent_processor_time_threshold_critical" { + description = "Processor time percent (critical threshold)" + default = 80 +} + +variable "percent_processor_time_threshold_warning" { + description = "Processor time percent (warning threshold)" + default = 60 +} + +variable "server_load_rate_threshold_critical" { + description = "Server CPU load rate (critical threshold)" + default = 90 +} + +variable "server_load_rate_threshold_warning" { + description = "Server CPU load rate (warning threshold)" + default = 70 +} diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf new file mode 100644 index 0000000..8e68558 --- /dev/null +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -0,0 +1,124 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_redis:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "status" { + name = "[${var.environment}] Redis {{name}} is down" + message = "${var.message}" + + query = < ${var.evictedkeys_limit_threshold_critical} +EOF + + type = "metric alert" + + thresholds { + warning = "${var.evictedkeys_limit_threshold_warning}" + critical = "${var.evictedkeys_limit_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "percent_processor_time" { + name = "[${var.environment}] Redis processor time {{value}}% on {{name}}" + message = "${var.message}" + + query = < ${var.percent_processor_time_threshold_critical} +EOF + + type = "metric alert" + + thresholds { + warning = "${var.percent_processor_time_threshold_warning}" + critical = "${var.percent_processor_time_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "server_load" { + name = "[${var.environment}] Redis processor server load {{value}}% on {{name}}" + message = "${var.message}" + + query = < ${var.server_load_rate_threshold_critical} +EOF + + type = "metric alert" + + thresholds { + warning = "${var.server_load_rate_threshold_warning}" + critical = "${var.server_load_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = 
"${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] +} diff --git a/cloud/azure/sql-database/README.md b/cloud/azure/sql-database/README.md new file mode 100644 index 0000000..8f42bde --- /dev/null +++ b/cloud/azure/sql-database/README.md @@ -0,0 +1,49 @@ +Azure SQL Database DataDog monitors +=================================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-sql-database" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/sql-database?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates DataDog monitors with the following checks: + +* CPU High +* Free disk space low +* DTU Consumption high +* SQL deadlocks + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | +| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | +| deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | +| diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | +| dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | +| dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | +| environment | Architecture Environment | string | - | yes | +| 
filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when an alert is triggered | string | - | yes | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_sql_database/](https://docs.datadoghq.com/integrations/azure_sql_database/) + +Azure SQL Database metrics documentation: [https://docs.microsoft.com/en-us/azure/sql-database/saas-dbpertenant-log-analytics](https://docs.microsoft.com/en-us/azure/sql-database/saas-dbpertenant-log-analytics) + diff --git a/cloud/azure/sql-database/inputs.tf b/cloud/azure/sql-database/inputs.tf new file mode 100644 index 0000000..aa81cfb --- /dev/null +++ b/cloud/azure/sql-database/inputs.tf @@ -0,0 +1,62 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + +# Global DataDog +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Azure SQL Database specific + +variable "cpu_threshold_warning" { + description = "CPU usage in percent (warning threshold)" + default = "80" +} + +variable "cpu_threshold_critical" { + description = "CPU usage in percent (critical threshold)" + default = "90" +} + +variable "diskspace_threshold_warning" { + description = "Disk space used in percent (warning threshold)" + default = "80" +} + +variable "diskspace_threshold_critical" { + description = "Disk space used in percent (critical threshold)" + default = "90" +} + 
+variable "dtu_threshold_warning" { + description = "Amount of DTU used (warning threshold)" + default = "85" +} + +variable "dtu_threshold_critical" { + description = "Amount of DTU used (critical threshold)" + default = "90" +} + +variable "deadlock_threshold_critical" { + description = "Amount of Deadlocks (critical threshold)" + default = "1" +} diff --git a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf new file mode 100644 index 0000000..337b28f --- /dev/null +++ b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -0,0 +1,129 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_sqldatabase:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "sql-database_cpu_90_15min" { + name = "[${var.environment}] SQL Database CPU high > ${var.cpu_threshold_critical}% on {{name}}" + message = "${var.message}" + + query = < ${var.cpu_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.cpu_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:sqldatabase", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "sql-database_free_space_low" { + name = "[${var.environment}] SQL Database free space < ${var.diskspace_threshold_critical}% on {{name}}" + message = "${var.message}" + + type = "metric alert" + + query = < ${var.diskspace_threshold_critical} + EOF + + thresholds { + warning = "${var.diskspace_threshold_warning}" + critical = "${var.diskspace_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = 
"${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:sqldatabase", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "sql-database_dtu_consumption_high" { + name = "[${var.environment}] SQL Database DTU Consumption on {{name}} > ${var.dtu_threshold_critical}" + message = "${var.message}" + + type = "metric alert" + + query = < ${var.dtu_threshold_critical} + EOF + + thresholds { + warning = "${var.dtu_threshold_warning}" + critical = "${var.dtu_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:sqldatabase", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "sql-database_deadlocks_count" { + name = "[${var.environment}] SQL Database Deadlocks too high on {{name}}" + message = "${var.message}" + + type = "metric alert" + + query = < ${var.deadlock_threshold_critical} + EOF + + thresholds { + critical = "${var.deadlock_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:sqldatabase", "team:azure", "provider:azure"] +} diff --git a/cloud/azure/storage/README.md b/cloud/azure/storage/README.md index 0849152..7702683 100644 --- a/cloud/azure/storage/README.md +++ b/cloud/azure/storage/README.md @@ -32,20 +32,20 @@ Inputs | Name | Description | Type | Default | Required | 
|------|-------------|:----:|:-----:|:-----:| +| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `15` | no | +| availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no | +| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| message | Message sent when a monitor is triggered | string | - | yes | -| filter_tags_use_defaults | Use default tagging convention | string | `true` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | -| availability_threshold_critical | Minimum threshold of availability | string | `90` | no | -| successful_requests_threshold_critical | Minimum threshold of successful requests | string | `90` | no | -| latency_threshold_critical | Maximum threshold of latency in ms | string | `1000` | no | -| timeout_error_requests_threshold_critical | Maximum threshold of timeout error requests in percent | string | `35` | no | -| network_error_requests_threshold_critical | Maximum threshold of network error requests in percent | string | `35` | no | -| throttling_error_requests_threshold_critical | Maximum threshold of throttling error requests in percent | string | `50` | no | -| server_other_error_requests_threshold_critical | Maximum threshold of server other error requests in percent | string | `50` | no | -| client_other_error_requests_threshold_critical | Maximum threshold of client other error requests in percent | string | `75` | no | -| authorization_error_requests_threshold_critical | Maximum threshold of authorization error requests in percent | string | `75` | no | +| filter_tags_use_defaults | Use default 
filter tags convention | string | `true` | no | +| latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `1000` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no | +| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no | +| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no | +| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no | Related documentation --------------------- diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md new file mode 100644 index 0000000..53422c8 --- /dev/null +++ b/cloud/azure/stream-analytics/README.md @@ -0,0 +1,39 @@ +Azure Stream Analytics DataDog monitors +======================================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-stream-analytics" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/stream-analytics?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" + subscription_id = "${var.subscription_id}" +} +``` + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | +| conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | +| delay 
| Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | +| message | Message sent when a Stream Analytics monitor is triggered | string | - | yes | +| runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no | +| runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no | +| su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no | +| su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf new file mode 100644 index 0000000..ae1186a --- /dev/null +++ b/cloud/azure/stream-analytics/inputs.tf @@ -0,0 +1,66 @@ +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} + +# Global DataDog +variable "message" { + description = "Message sent when a Stream Analytics monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for 
custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Azure Stream Analytics specific +variable "su_utilization_threshold_warning" { + description = "Streaming Unit utilization rate limit (warning threshold)" + default = 60 +} + +variable "su_utilization_threshold_critical" { + description = "Streaming Unit utilization rate limit (critical threshold)" + default = 80 +} + +variable "function_requests_threshold_warning" { + description = "Failed Function Request rate limit (warning threshold)" + default = 0 +} + +variable "failed_function_requests_threshold_critical" { + description = "Failed Function Request rate limit (critical threshold)" + default = 10 +} + +variable "conversion_errors_threshold_warning" { + description = "Conversion errors limit (warning threshold)" + default = 0 +} + +variable "conversion_errors_threshold_critical" { + description = "Conversion errors limit (critical threshold)" + default = 10 +} + +variable "runtime_errors_threshold_warning" { + description = "Runtime errors limit (warning threshold)" + default = 0 +} + +variable "runtime_errors_threshold_critical" { + description = "Runtime errors limit (critical threshold)" + default = 10 +} diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf new file mode 100644 index 0000000..3b1324a --- /dev/null +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -0,0 +1,147 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_streamanalytics:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "status" { + name = "[${var.environment}] Stream Analytics Status is not ok on {{name}}" + message = "${var.message}" + + query = < ${var.su_utilization_threshold_critical} + EOF + type = "metric alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.su_utilization_threshold_warning}" + critical = "${var.su_utilization_threshold_critical}" + } + + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "failed_function_requests" { + name = "[${var.environment}] Stream Analytics more than ${var.failed_function_requests_threshold_critical} failed function requests on {{name}}" + message = "${var.message}" + + query = < ${var.failed_function_requests_threshold_critical} + EOF + type = "metric alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.function_requests_threshold_warning}" + critical = "${var.failed_function_requests_threshold_critical}" + } + + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "conversion_errors" { + name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" + message = "${var.message}" + + query = < ${var.conversion_errors_threshold_critical} + EOF + type = "metric alert" + + notify_no_data = false 
+ evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.conversion_errors_threshold_warning}" + critical = "${var.conversion_errors_threshold_critical}" + } + + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "runtime_errors" { + name = "[${var.environment}] Stream Analytics more than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" + message = "${var.message}" + + query = < ${var.runtime_errors_threshold_critical} + EOF + type = "metric alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.runtime_errors_threshold_warning}" + critical = "${var.runtime_errors_threshold_critical}" + } + + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] +}