MON-90 Fix merge conflict
This commit is contained in:
commit
31546b707d
50
cloud/azure/app-services/README.md
Normal file
50
cloud/azure/app-services/README.md
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
Azure AppServices (Web, API, Functions) DataDog monitors
|
||||||
|
========================================================
|
||||||
|
|
||||||
|
How to use this module
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
```
|
||||||
|
module "datadog-monitors-azure-app-services" {
|
||||||
|
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}"
|
||||||
|
|
||||||
|
message = "${module.datadog-message-alerting.alerting-message}"
|
||||||
|
environment = "${var.environment}"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
-------
|
||||||
|
Creates DataDog monitors with the following checks:
|
||||||
|
|
||||||
|
* Response time
|
||||||
|
* Memory usage count
|
||||||
|
* HTTP 404 errors
|
||||||
|
* HTTP 50x errors
|
||||||
|
* HTTP 2xx rate
|
||||||
|
|
||||||
|
Inputs
|
||||||
|
------
|
||||||
|
|
||||||
|
| Name | Description | Type | Default | Required |
|
||||||
|
|------|-------------|:----:|:-----:|:-----:|
|
||||||
|
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||||
|
| environment | Architecture environment | string | - | yes |
|
||||||
|
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||||
|
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||||
|
| http_2xx_status_rate_limit | | string | `30` | no |
|
||||||
|
| http_2xx_status_rate_threshold_critical | Alerting threshold (ratio of 2xx responses, between 0 and 1) | string | `0.9` | no |
|
||||||
|
| http_2xx_status_rate_threshold_warning | Warning threshold (ratio of 2xx responses, between 0 and 1) | string | `0.95` | no |
|
||||||
|
| http_404_errors_count_rate_limit | | string | `30` | no |
|
||||||
|
| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no |
|
||||||
|
| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no |
|
||||||
|
| memory_usage_threshold_critical | Alerting threshold in bytes (default is about 50 MiB) | string | `52430000` | no |
|
||||||
|
| memory_usage_threshold_warning | Warning threshold in bytes (default is about 32 MiB) | string | `33550000` | no |
|
||||||
|
| message | Message sent when a monitor is triggered | string | - | yes |
|
||||||
|
| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no |
|
||||||
|
| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no |
|
||||||
|
|
||||||
|
Related documentation
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
DataDog documentation: [https://docs.datadoghq.com/integrations/azure_app_services](https://docs.datadoghq.com/integrations/azure_app_services)
|
||||||
87
cloud/azure/app-services/inputs.tf
Normal file
87
cloud/azure/app-services/inputs.tf
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
# Name of the environment; used in monitor titles, tags and the default filter.
variable "environment" {
  type        = "string"
  description = "Architecture environment"
}

# Switch between the default tag filter ("true") and filter_tags_custom.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

# Raw tag filter applied when filter_tags_use_defaults is not "true".
variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}

# Notification body attached to every monitor of this module (no default: required).
variable "message" {
  description = "Message sent when a monitor is triggered"
}

# Used for both evaluation_delay and new_host_delay on the monitors.
variable "delay" {
  default     = 600
  description = "Delay in seconds for the metric evaluation"
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Response time thresholds (seconds)
# ---------------------------------------------------------------------------

# Level at which the response time monitor goes critical.
variable "response_time_threshold_critical" {
  description = "Alerting threshold in seconds"
  default     = 0.8
}

# Level at which the response time monitor raises a warning.
variable "response_time_threshold_warning" {
  description = "Warning threshold in seconds"
  default     = 0.4
}
|
||||||
|
|
||||||
|
###################################
###   MEMORY USAGE VARIABLES    ###
###################################

# Both thresholds are raw byte counts: the memory monitor compares them
# directly to the memory_working_set metric and divides the critical value by
# 1000000 only to build its title. The previous descriptions claimed MiB,
# which would lead a caller to pass e.g. "50" and get a 50-byte threshold.

variable "memory_usage_threshold_critical" {
  default     = 52430000
  description = "Alerting threshold in bytes (default is about 50 MiB)"
}

variable "memory_usage_threshold_warning" {
  default     = 33550000
  description = "Warning threshold in bytes (default is about 32 MiB)"
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# HTTP 404 status pages
# ---------------------------------------------------------------------------

# NOTE(review): this input ships without a description; its intended meaning
# is not documented here -- confirm with the module owners.
variable "http_404_errors_count_rate_limit" {
  default = 30
}

# Number of requests above which the 404 monitor goes critical.
variable "http_404_errors_count_rate_threshold_critical" {
  description = "Alerting threshold (number of requests)"
  default     = 30
}

# Number of requests above which the 404 monitor raises a warning.
variable "http_404_errors_count_rate_threshold_warning" {
  description = "Warning threshold (number of requests)"
  default     = 10
}
|
||||||
|
|
||||||
|
#################################
### HTTP 2xx status pages     ###
#################################

# NOTE(review): this input ships without a description and is not referenced
# by any monitor visible in this module -- confirm whether it is still needed.
variable "http_2xx_status_rate_limit" {
  default = 30
}

# The 2xx monitor compares these values to a "2xx responses / requests"
# quotient, so they are ratios in the 0..1 range, not percentages.
# The monitor triggers when the ratio drops BELOW the threshold.

variable "http_2xx_status_rate_threshold_critical" {
  default     = 0.9
  description = "Alerting threshold (ratio of 2xx responses, between 0 and 1)"
}

variable "http_2xx_status_rate_threshold_warning" {
  default     = 0.95
  description = "Warning threshold (ratio of 2xx responses, between 0 and 1)"
}
|
||||||
124
cloud/azure/app-services/monitors-app_services.tf
Normal file
124
cloud/azure/app-services/monitors-app_services.tf
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
# Renders the tag filter injected into every App Services monitor query.
#
# With filter_tags_use_defaults == "true" the filter is the module tag
# convention (monitoring enabled + per-service opt-in tag + environment);
# otherwise the caller-supplied filter_tags_custom is used verbatim.
#
# The per-service opt-in tag previously read "dd_azure_storage:enabled", a
# leftover from another module that made these monitors select storage-tagged
# resources; it now follows the same convention as the sibling modules
# (dd_azure_eventhub, dd_azure_iothub).
data "template_file" "filter" {
  template = "$${filter}"

  vars {
    filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,env:%s", var.environment) : var.filter_tags_custom}"
  }
}
|
||||||
|
|
||||||
|
# Monitoring App Services response time
#
# Alerts when the 5-minute average response time reaches the critical
# threshold (warning threshold is evaluated through the thresholds block).
resource "datadog_monitor" "appservices_response_time" {
  name    = "[${var.environment}] App Services response time > ${var.response_time_threshold_critical}s on {{name}}"
  type    = "metric alert"
  message = "${var.message}"

  # Grouped by name/resource_group/region, like the sibling Event Hub module,
  # so that one alert is evaluated per App Service and the {{name}}
  # placeholder used in the title refers to an actual group.
  query = <<EOF
    avg(last_5m): (
      avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} by {name,resource_group,region}
    ) >= ${var.response_time_threshold_critical}
  EOF

  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  thresholds {
    warning  = "${var.response_time_threshold_warning}"
    critical = "${var.response_time_threshold_critical}"
  }

  notify_no_data      = true # Will notify when no data is received
  renotify_interval   = 0
  require_full_window = true
  timeout_h           = 0
  include_tags        = true

  tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Monitoring App Services memory usage
#
# Alerts when the 5-minute average working set reaches the critical
# threshold. Thresholds are compared to the metric as-is; the title divides
# the critical value by 1000000 purely for display.
resource "datadog_monitor" "appservices_memory_usage_count" {
  name    = "[${var.environment}] App Services memory usage > ${ceil(var.memory_usage_threshold_critical/1000000)}MiB on {{name}}"
  type    = "metric alert"
  message = "${var.message}"

  # Grouped by name/resource_group/region, like the sibling Event Hub module,
  # so that one alert is evaluated per App Service and the {{name}}
  # placeholder used in the title refers to an actual group.
  query = <<EOF
    avg(last_5m): (
      avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} by {name,resource_group,region}
    ) >= ${var.memory_usage_threshold_critical}
  EOF

  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  thresholds {
    warning  = "${var.memory_usage_threshold_warning}"
    critical = "${var.memory_usage_threshold_critical}"
  }

  notify_no_data      = true # Will notify when no data is received
  renotify_interval   = 0
  require_full_window = true
  timeout_h           = 0
  include_tags        = true

  tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Monitoring App Services 404 errors rate
#
# Alerts when the per-minute rate of HTTP 404 responses exceeds the critical
# threshold at any point of the last 5 minutes.
resource "datadog_monitor" "appservices_http_404_errors_count" {
  # The title now prints the threshold the query actually alerts on; it used
  # to print http_404_errors_count_rate_limit, which is unrelated to the
  # query and therefore went stale as soon as the threshold was overridden.
  name    = "[${var.environment}] App Services HTTP 404 errors > ${var.http_404_errors_count_rate_threshold_critical} limit on {{name}}"
  type    = "metric alert"
  message = "${var.message}"

  # Grouped by name/resource_group/region, like the sibling Event Hub module,
  # so that one alert is evaluated per App Service and the {{name}}
  # placeholder used in the title refers to an actual group.
  query = <<EOF
    max(last_5m): (
      per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}} by {name,resource_group,region}.as_rate())
    ) > ${var.http_404_errors_count_rate_threshold_critical}
  EOF

  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  thresholds {
    warning  = "${var.http_404_errors_count_rate_threshold_warning}"
    critical = "${var.http_404_errors_count_rate_threshold_critical}"
  }

  notify_no_data      = false # Will NOT notify when no data is received
  renotify_interval   = 0
  require_full_window = true
  timeout_h           = 0
  include_tags        = true

  tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Monitoring App Services HTTP 2xx status pages rate
#
# Alerts when the share of 2xx responses among all requests drops below the
# critical threshold (a ratio between 0 and 1) over the last 5 minutes.
resource "datadog_monitor" "appservices_http_2xx_status_rate" {
  name    = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}"
  type    = "metric alert"
  message = "${var.message}"

  # The denominator must be the total request count. It used to be the
  # http2xx metric itself, which made the quotient constantly 1 so the
  # "< threshold" condition could never be met.
  # Grouped by name/resource_group/region, like the sibling Event Hub module,
  # so that one alert is evaluated per App Service and the {{name}}
  # placeholder used in the title refers to an actual group.
  query = <<EOF
    avg(last_5m): (
      avg:azure.app_services.http2xx{${data.template_file.filter.rendered}} by {name,resource_group,region}.as_count() /
      avg:azure.app_services.requests{${data.template_file.filter.rendered}} by {name,resource_group,region}.as_count()
    ) < ${var.http_2xx_status_rate_threshold_critical}
  EOF

  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  thresholds {
    warning  = "${var.http_2xx_status_rate_threshold_warning}"
    critical = "${var.http_2xx_status_rate_threshold_critical}"
  }

  notify_no_data      = true # Will notify when no data is received
  renotify_interval   = 0
  require_full_window = true
  timeout_h           = 0
  include_tags        = true

  tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"]
}
|
||||||
54
cloud/azure/eventhub/README.md
Normal file
54
cloud/azure/eventhub/README.md
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
Event Hub Datadog monitor
|
||||||
|
=========================
|
||||||
|
|
||||||
|
How to use this module
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
```
|
||||||
|
module "datadog-monitors-azure-eventhub" {
|
||||||
|
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/eventhub?ref={revision}"
|
||||||
|
|
||||||
|
message = "${module.datadog-message-alerting.alerting-message}"
|
||||||
|
environment = "${var.environment}"
|
||||||
|
subscription_id = "${var.subscription_id}"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
-------
|
||||||
|
Creates Datadog monitors with the following checks:
|
||||||
|
|
||||||
|
* Service status check
|
||||||
|
* Failed request ratio
|
||||||
|
* Erroneous requests ratio
|
||||||
|
|
||||||
|
Inputs
|
||||||
|
------
|
||||||
|
|
||||||
|
| Name | Description | Type | Default | Required |
|
||||||
|
|------|-------------|:----:|:-----:|:-----:|
|
||||||
|
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||||
|
| environment | Architecture environment | string | - | yes |
|
||||||
|
| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no |
|
||||||
|
| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no |
|
||||||
|
| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no |
|
||||||
|
| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no |
|
||||||
|
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||||
|
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||||
|
| message | Message sent when an alert is triggered | string | - | yes |
|
||||||
|
|
||||||
|
Outputs
|
||||||
|
-------
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| errors_monitor_id | Id of the `errors` monitor |
|
||||||
|
| failed_requests_monitor_id | Id of the `failed requests` monitor |
|
||||||
|
| status_monitor_id | Id of the `status` monitor |
|
||||||
|
|
||||||
|
Related documentation
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
Datadog documentation : [https://docs.datadoghq.com/integrations/azure_event_hub/](https://docs.datadoghq.com/integrations/azure_event_hub/)
|
||||||
|
|
||||||
|
Azure metrics documentation : [https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-metrics-azure-monitor](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-metrics-azure-monitor)
|
||||||
45
cloud/azure/eventhub/inputs.tf
Normal file
45
cloud/azure/eventhub/inputs.tf
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# ---------------------------------------------------------------------------
# Global Terraform
# ---------------------------------------------------------------------------

# Name of the environment; used in monitor titles, tags and the default filter.
variable "environment" {
  type        = "string"
  description = "Architecture environment"
}

# ---------------------------------------------------------------------------
# Global DataDog
# ---------------------------------------------------------------------------

# Notification body attached to every monitor of this module (no default: required).
variable "message" {
  description = "Message sent when an alert is triggered"
}

# Used for both evaluation_delay and new_host_delay on the monitors.
variable "delay" {
  default     = 600
  description = "Delay in seconds for the metric evaluation"
}

# Switch between the default tag filter ("true") and filter_tags_custom.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

# Raw tag filter applied when filter_tags_use_defaults is not "true".
variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}
|
||||||
|
|
||||||
|
# Thresholds of the "failed requests" monitor, expressed as percentages
# (the query multiplies the ratio by 100).
variable "failed_requests_rate_thresold_critical" {
  default     = 3
  description = "Failed requests ratio (percentage) to trigger the critical alert"
}

variable "failed_requests_rate_thresold_warning" {
  default     = 1
  description = "Failed requests ratio (percentage) to trigger a warning alert"
}

# Thresholds of the "errors" monitor, expressed as percentages
# (the query multiplies the ratio by 100).
variable "errors_rate_thresold_critical" {
  default     = 3
  description = "Errors ratio (percentage) to trigger the critical alert"
}

variable "errors_rate_thresold_warning" {
  default     = 1
  description = "Errors ratio (percentage) to trigger a warning alert"
}
|
||||||
100
cloud/azure/eventhub/monitors-eventhub.tf
Normal file
100
cloud/azure/eventhub/monitors-eventhub.tf
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
# Renders the tag filter injected into every Event Hub monitor query:
# the module tag convention when filter_tags_use_defaults is "true",
# the caller-supplied filter_tags_custom otherwise.
data "template_file" "filter" {
  template = "$${filter}"

  vars {
    filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : var.filter_tags_custom}"
  }
}
|
||||||
|
|
||||||
|
# Event Hub namespace status: alerts, per name/resource_group/region group,
# when the 5-minute average of the status metric differs from 1.
resource "datadog_monitor" "eventhub_status" {
  type    = "metric alert"
  name    = "[${var.environment}] Event Hub status is not ok on {{name}}"
  message = "${var.message}"

  query = <<EOF
    avg(last_5m): avg:azure.eventhub_namespaces.status{${data.template_file.filter.rendered}} by {name,resource_group,region} != 1
  EOF

  # Timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Missing data is reported for this monitor
  notify_no_data    = true
  no_data_timeframe = 20

  # Notification behaviour
  renotify_interval = 0
  timeout_h         = 0
  notify_audit      = false
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Event Hub failed requests: alerts, per name/resource_group/region group,
# when failed requests exceed the critical percentage of all requests
# (failed / (successful + failed) * 100) over the last 5 minutes.
resource "datadog_monitor" "eventhub_failed_requests" {
  type    = "metric alert"
  name    = "[${var.environment}] Event Hub too much failed requests on {{name}}"
  message = "${var.message}"

  query = <<EOF
    avg(last_5m): (
      avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {name,resource_group,region}
    ) * 100 / (
      avg:azure.eventhub_namespaces.successful_requests{${data.template_file.filter.rendered}} by {name,resource_group,region} +
      avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {name,resource_group,region}
    ) > ${var.failed_requests_rate_thresold_critical}
  EOF

  thresholds {
    warning  = "${var.failed_requests_rate_thresold_warning}"
    critical = "${var.failed_requests_rate_thresold_critical}"
  }

  # Timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Missing data is not reported for this monitor
  notify_no_data    = false
  no_data_timeframe = 20

  # Notification behaviour
  renotify_interval = 0
  timeout_h         = 0
  notify_audit      = false
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Event Hub errors: alerts, per name/resource_group/region group, when
# internal server + server busy + other errors exceed the critical percentage
# of (successful requests + those errors) over the last 5 minutes.
resource "datadog_monitor" "eventhub_errors" {
  type    = "metric alert"
  name    = "[${var.environment}] Event Hub too much errors on {{name}}"
  message = "${var.message}"

  query = <<EOF
    avg(last_5m): (
      avg:azure.eventhub_namespaces.internal_server_errors{${data.template_file.filter.rendered}} by {name,resource_group,region} +
      avg:azure.eventhub_namespaces.server_busy_errors{${data.template_file.filter.rendered}} by {name,resource_group,region} +
      avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {name,resource_group,region}
    ) * 100 / (
      avg:azure.eventhub_namespaces.successful_requests{${data.template_file.filter.rendered}} by {name,resource_group,region} +
      avg:azure.eventhub_namespaces.internal_server_errors{${data.template_file.filter.rendered}} by {name,resource_group,region} +
      avg:azure.eventhub_namespaces.server_busy_errors{${data.template_file.filter.rendered}} by {name,resource_group,region} +
      avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {name,resource_group,region}
    ) > ${var.errors_rate_thresold_critical}
  EOF

  thresholds {
    warning  = "${var.errors_rate_thresold_warning}"
    critical = "${var.errors_rate_thresold_critical}"
  }

  # Timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Missing data is not reported for this monitor
  notify_no_data    = false
  no_data_timeframe = 20

  # Notification behaviour
  renotify_interval = 0
  timeout_h         = 0
  notify_audit      = false
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"]
}
|
||||||
11
cloud/azure/eventhub/outputs.tf
Normal file
11
cloud/azure/eventhub/outputs.tf
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# Ids of the monitors created by this module, one output per monitor.
# status_monitor_id and failed_requests_monitor_id used to point at each
# other's resource; each output now exposes the monitor it is named after.

# Id of the `status` monitor
output "status_monitor_id" {
  value = "${datadog_monitor.eventhub_status.id}"
}

# Id of the `failed requests` monitor
output "failed_requests_monitor_id" {
  value = "${datadog_monitor.eventhub_failed_requests.id}"
}

# Id of the `errors` monitor
output "errors_monitor_id" {
  value = "${datadog_monitor.eventhub_errors.id}"
}
|
||||||
76
cloud/azure/iothubs/README.md
Normal file
76
cloud/azure/iothubs/README.md
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
Azure IOT Hubs DataDog monitors
|
||||||
|
===============================
|
||||||
|
|
||||||
|
How to use this module
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
```
|
||||||
|
module "iothubs" {
|
||||||
|
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref={revision}"
|
||||||
|
|
||||||
|
message = "${module.datadog-message-alerting.alerting-message}"
|
||||||
|
environment = "${var.environment}"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
-------
|
||||||
|
Creates DataDog monitors with the following checks:
|
||||||
|
|
||||||
|
* Service status check
|
||||||
|
* Jobs failed average check
|
||||||
|
* Query Jobs failed average check
|
||||||
|
* List Jobs failed average check
|
||||||
|
* Total devices count check
|
||||||
|
* C2D methods failed average check
|
||||||
|
* C2D twin read failed average check
|
||||||
|
* C2D twin update failed average check
|
||||||
|
* D2C twin read failed average check
|
||||||
|
* D2C twin update failed average check
|
||||||
|
* D2C telemetry egress dropped count check
|
||||||
|
* D2C telemetry egress orphaned count check
|
||||||
|
* D2C telemetry egress invalid count check
|
||||||
|
* D2C telemetry egress fallback count check
|
||||||
|
* D2C telemetry ingress no sent count check
|
||||||
|
|
||||||
|
Inputs
|
||||||
|
------
|
||||||
|
|
||||||
|
| Name | Description | Type | Default | Required |
|
||||||
|
|------|-------------|:----:|:-----:|:-----:|
|
||||||
|
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||||
|
| dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no |
|
||||||
|
| dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no |
|
||||||
|
| environment | Architecture Environment | string | - | yes |
|
||||||
|
| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no |
|
||||||
|
| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no |
|
||||||
|
| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no |
|
||||||
|
| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no |
|
||||||
|
| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no |
|
||||||
|
| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no |
|
||||||
|
| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no |
|
||||||
|
| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no |
|
||||||
|
| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no |
|
||||||
|
| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no |
|
||||||
|
| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||||
|
| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||||
|
| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||||
|
| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||||
|
| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||||
|
| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||||
|
| fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no |
|
||||||
|
| fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no |
|
||||||
|
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||||
|
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||||
|
| invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no |
|
||||||
|
| invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no |
|
||||||
|
| message | Message sent when an alert is triggered | string | - | yes |
|
||||||
|
| orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no |
|
||||||
|
| orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no |
|
||||||
|
|
||||||
|
Related documentation
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub)
|
||||||
|
|
||||||
|
Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health)
|
||||||
146
cloud/azure/iothubs/inputs.tf
Normal file
146
cloud/azure/iothubs/inputs.tf
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
# ---------------------------------------------------------------------------
# Global Terraform
# ---------------------------------------------------------------------------

# Name of the environment; used in monitor titles, tags and the default filter.
variable "environment" {
  type        = "string"
  description = "Architecture Environment"
}

# ---------------------------------------------------------------------------
# Global DataDog
# ---------------------------------------------------------------------------

# Used for both evaluation_delay and new_host_delay on the monitors.
variable "delay" {
  default     = 600
  description = "Delay in seconds for the metric evaluation"
}

# Notification body attached to the monitors of this module (no default: required).
variable "message" {
  description = "Message sent when an alert is triggered"
}

# Switch between the default tag filter ("true") and filter_tags_custom.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

# Raw tag filter applied when filter_tags_use_defaults is not "true".
variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Azure IOT hubs specific
# ---------------------------------------------------------------------------

# --- Jobs ---

variable "failed_jobs_rate_threshold_warning" {
  default     = 0
  description = "Jobs Failed rate limit (warning threshold)"
}

variable "failed_jobs_rate_threshold_critical" {
  default     = 10
  description = "Jobs Failed rate limit (critical threshold)"
}

# --- List jobs ---

variable "failed_listjobs_rate_threshold_warning" {
  default     = 0
  description = "ListJobs Failed rate limit (warning threshold)"
}

variable "failed_listjobs_rate_threshold_critical" {
  default     = 10
  description = "ListJobs Failed rate limit (critical threshold)"
}

# --- Query jobs ---

variable "failed_queryjobs_rate_threshold_warning" {
  default     = 0
  description = "QueryJobs Failed rate limit (warning threshold)"
}

variable "failed_queryjobs_rate_threshold_critical" {
  default     = 10
  description = "QueryJobs Failed rate limit (critical threshold)"
}
|
||||||
|
|
||||||
|
# --- C2D methods ---

variable "failed_c2d_methods_rate_threshold_warning" {
  default     = 0
  description = "C2D Methods Failed rate limit (warning threshold)"
}

variable "failed_c2d_methods_rate_threshold_critical" {
  default     = 10
  description = "C2D Methods Failed rate limit (critical threshold)"
}

# --- C2D twin read ---

variable "failed_c2d_twin_read_rate_threshold_warning" {
  default     = 0
  description = "C2D Twin Read Failed rate limit (warning threshold)"
}

variable "failed_c2d_twin_read_rate_threshold_critical" {
  default     = 10
  description = "C2D Twin Read Failed rate limit (critical threshold)"
}

# --- C2D twin update ---

variable "failed_c2d_twin_update_rate_threshold_warning" {
  default     = 0
  description = "C2D Twin Update Failed rate limit (warning threshold)"
}

variable "failed_c2d_twin_update_rate_threshold_critical" {
  default     = 10
  description = "C2D Twin Update Failed rate limit (critical threshold)"
}
|
||||||
|
|
||||||
|
# --- D2C twin read ---

variable "failed_d2c_twin_read_rate_threshold_warning" {
  default     = 0
  description = "D2C Twin Read Failed rate limit (warning threshold)"
}

variable "failed_d2c_twin_read_rate_threshold_critical" {
  default     = 10
  description = "D2C Twin Read Failed rate limit (critical threshold)"
}

# --- D2C twin update ---

variable "failed_d2c_twin_update_rate_threshold_warning" {
  default     = 0
  description = "D2C Twin Update Failed rate limit (warning threshold)"
}

variable "failed_d2c_twin_update_rate_threshold_critical" {
  default     = 10
  description = "D2C Twin Update Failed rate limit (critical threshold)"
}
|
||||||
|
|
||||||
|
# --- D2C telemetry egress: dropped ---

variable "dropped_d2c_telemetry_egress_threshold_warning" {
  default     = 500
  description = "D2C Telemetry Dropped limit (warning threshold)"
}

variable "dropped_d2c_telemetry_egress_threshold_critical" {
  default     = 1000
  description = "D2C Telemetry Dropped limit (critical threshold)"
}

# --- D2C telemetry egress: orphaned ---

variable "orphaned_d2c_telemetry_egress_threshold_warning" {
  default     = 500
  description = "D2C Telemetry Orphaned limit (warning threshold)"
}

variable "orphaned_d2c_telemetry_egress_threshold_critical" {
  default     = 1000
  description = "D2C Telemetry Orphaned limit (critical threshold)"
}

# --- D2C telemetry egress: invalid ---

variable "invalid_d2c_telemetry_egress_threshold_warning" {
  default     = 500
  description = "D2C Telemetry Invalid limit (warning threshold)"
}

variable "invalid_d2c_telemetry_egress_threshold_critical" {
  default     = 1000
  description = "D2C Telemetry Invalid limit (critical threshold)"
}

# --- D2C telemetry egress: fallback ---

variable "fallback_d2c_telemetry_egress_threshold_warning" {
  default     = 500
  description = "D2C Telemetry Fallback limit (warning threshold)"
}

variable "fallback_d2c_telemetry_egress_threshold_critical" {
  default     = 1000
  description = "D2C Telemetry Fallback limit (critical threshold)"
}
|
||||||
470
cloud/azure/iothubs/monitors-iothubs.tf
Normal file
470
cloud/azure/iothubs/monitors-iothubs.tf
Normal file
@ -0,0 +1,470 @@
|
|||||||
|
# Renders the tag filter that every IoT Hub monitor query in this file embeds.
# When filter_tags_use_defaults is the string "true", the conventional tag set
# (dd_monitoring, dd_azure_iothub, env:<environment>) is used; otherwise the
# caller-supplied filter_tags_custom value is used as-is.
data "template_file" "filter" {
  # "$$" escapes the interpolation so the template itself is the literal ${filter}.
  template = "$${filter}"

  vars {
    filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_iothub:enabled,env:%s", var.environment) : var.filter_tags_custom}"
  }
}
|
||||||
|
|
||||||
|
# Percentage of failed jobs, failed / (failed + completed) * 100, averaged over
# the last 5 minutes and grouped by IoT Hub name and resource group.
resource "datadog_monitor" "too_many_jobs_failed" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.jobs.failed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.jobs.failed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.jobs.completed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() )
) * 100 > ${var.failed_jobs_rate_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.failed_jobs_rate_threshold_critical}"
    warning  = "${var.failed_jobs_rate_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Percentage of failed list_jobs calls, failure / (success + failure) * 100,
# averaged over the last 5 minutes and grouped by resource group and IoT Hub name.
resource "datadog_monitor" "too_many_list_jobs_failed" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.jobs.list_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() /
( avg:azure.devices_iothubs.jobs.list_jobs.success{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() +
avg:azure.devices_iothubs.jobs.list_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() )
) * 100 > ${var.failed_listjobs_rate_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.failed_listjobs_rate_threshold_critical}"
    warning  = "${var.failed_listjobs_rate_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Percentage of failed query_jobs calls, failure / (success + failure) * 100,
# averaged over the last 5 minutes and grouped by resource group and IoT Hub name.
resource "datadog_monitor" "too_many_query_jobs_failed" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.jobs.query_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() /
( avg:azure.devices_iothubs.jobs.query_jobs.success{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() +
avg:azure.devices_iothubs.jobs.query_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() )
) * 100 > ${var.failed_queryjobs_rate_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.failed_queryjobs_rate_threshold_critical}"
    warning  = "${var.failed_queryjobs_rate_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Triggers when the 5-minute average of the IoT Hub status metric drops below 1.
# Missing data is also reported (notify_no_data = true, after 20 minutes).
resource "datadog_monitor" "status" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Status is not ok on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):avg:azure.devices_iothubs.status{${data.template_file.filter.rendered}} by {name,resource_group} < 1
EOF

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = true
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Triggers when the 5-minute average of registered devices on an IoT Hub equals 0.
# Missing data is also reported (notify_no_data = true, after 20 minutes).
resource "datadog_monitor" "total_devices" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Total devices is wrong on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{${data.template_file.filter.rendered}} by {name,resource_group} == 0
EOF

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = true
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Percentage of failed cloud-to-device method calls,
# failure / (failure + success) * 100, averaged over the last 5 minutes.
resource "datadog_monitor" "too_many_c2d_methods_failed" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many c2d methods failure on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.c2d.methods.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.c2d.methods.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.c2d.methods.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() )
) * 100 > ${var.failed_c2d_methods_rate_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.failed_c2d_methods_rate_threshold_critical}"
    warning  = "${var.failed_c2d_methods_rate_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Percentage of failed cloud-to-device twin reads,
# failure / (failure + success) * 100, averaged over the last 5 minutes.
resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.c2d.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.c2d.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.c2d.twin.read.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() )
) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.failed_c2d_twin_read_rate_threshold_critical}"
    warning  = "${var.failed_c2d_twin_read_rate_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Percentage of failed cloud-to-device twin updates,
# failure / (failure + success) * 100, averaged over the last 5 minutes.
resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.c2d.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.c2d.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.c2d.twin.update.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() )
) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.failed_c2d_twin_update_rate_threshold_critical}"
    warning  = "${var.failed_c2d_twin_update_rate_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Percentage of failed device-to-cloud twin reads,
# failure / (failure + success) * 100, averaged over the last 5 minutes.
resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.d2c.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.d2c.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.d2c.twin.read.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() )
) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.failed_d2c_twin_read_rate_threshold_critical}"
    warning  = "${var.failed_d2c_twin_read_rate_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Percentage of failed device-to-cloud twin updates,
# failure / (failure + success) * 100, averaged over the last 5 minutes.
resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.d2c.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.d2c.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.d2c.twin.update.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() )
) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.failed_d2c_twin_update_rate_threshold_critical}"
    warning  = "${var.failed_d2c_twin_update_rate_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Sum over the last 5 minutes of dropped device-to-cloud telemetry egress,
# grouped by IoT Hub name and resource group.
resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
sum(last_5m): (
avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${data.template_file.filter.rendered}} by {name,resource_group}.as_count()
) > ${var.dropped_d2c_telemetry_egress_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.dropped_d2c_telemetry_egress_threshold_critical}"
    warning  = "${var.dropped_d2c_telemetry_egress_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Sum over the last 5 minutes of orphaned device-to-cloud telemetry egress,
# grouped by IoT Hub name and resource group.
resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
sum(last_5m): (
avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${data.template_file.filter.rendered}} by {name,resource_group}.as_count()
) > ${var.orphaned_d2c_telemetry_egress_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.orphaned_d2c_telemetry_egress_threshold_critical}"
    warning  = "${var.orphaned_d2c_telemetry_egress_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Sum over the last 5 minutes of invalid device-to-cloud telemetry egress,
# grouped by IoT Hub name and resource group.
resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
sum(last_5m): (
avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${data.template_file.filter.rendered}} by {name,resource_group}.as_count()
) > ${var.invalid_d2c_telemetry_egress_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.invalid_d2c_telemetry_egress_threshold_critical}"
    warning  = "${var.invalid_d2c_telemetry_egress_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Sum over the last 5 minutes of fallback device-to-cloud telemetry egress,
# grouped by IoT Hub name and resource group.
resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" {
  type    = "metric alert"
  name    = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
sum(last_5m): (
avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{${data.template_file.filter.rendered}} by {name,resource_group}.as_count()
) > ${var.fallback_d2c_telemetry_egress_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.fallback_d2c_telemetry_egress_threshold_critical}"
    warning  = "${var.fallback_d2c_telemetry_egress_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# Triggers when, summed over the last 5 minutes, the number of attempted
# device-to-cloud telemetry sends (ingress.all_protocol) exceeds the number of
# successful ones (ingress.success), i.e. at least one message was not sent.
# The threshold is fixed at 0 in the query; there is no warning level.
resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" {
  type = "metric alert"

  # Fixed the user-facing typo "no sent" -> "not sent".
  name    = "[${var.environment}] IOT Hub Too many d2c telemetry ingress not sent on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"]

  query = <<EOF
sum(last_5m): (
avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() -
avg:azure.devices_iothubs.d2c.telemetry.ingress.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count()
) > 0
EOF

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
47
cloud/azure/redis/README.md
Normal file
47
cloud/azure/redis/README.md
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
Azure Redis DataDog monitors
|
||||||
|
============================
|
||||||
|
|
||||||
|
How to use this module
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
```
|
||||||
|
module "datadog-monitors-azure-redis" {
|
||||||
|
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/redis?ref={revision}"
|
||||||
|
|
||||||
|
message = "${module.datadog-message-alerting.alerting-message}"
|
||||||
|
environment = "${var.environment}"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
-------
|
||||||
|
Creates DataDog monitors with the following checks:
|
||||||
|
|
||||||
|
* Service status check
|
||||||
|
* Evicted keys count check
|
||||||
|
* Processor time (percent) threshold
|
||||||
|
* Server CPU load threshold
|
||||||
|
|
||||||
|
Inputs
|
||||||
|
------
|
||||||
|
|
||||||
|
| Name | Description | Type | Default | Required |
|
||||||
|
|------|-------------|:----:|:-----:|:-----:|
|
||||||
|
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||||
|
| environment | Architecture environment | string | - | yes |
|
||||||
|
| evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no |
|
||||||
|
| evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no |
|
||||||
|
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||||
|
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||||
|
| message | Message sent when a Redis monitor is triggered | string | - | yes |
|
||||||
|
| percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no |
|
||||||
|
| percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no |
|
||||||
|
| server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no |
|
||||||
|
| server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no |
|
||||||
|
|
||||||
|
Related documentation
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
DataDog documentation: [https://docs.datadoghq.com/integrations/azure_redis_cache/](https://docs.datadoghq.com/integrations/azure_redis_cache/)
|
||||||
|
|
||||||
|
Azure Redis metrics documentation: [https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor](https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor)
|
||||||
56
cloud/azure/redis/inputs.tf
Normal file
56
cloud/azure/redis/inputs.tf
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
# Global Terraform
|
||||||
|
# Required. Used in monitor names, in the "env:" monitor tag and in the default tag filter.
variable "environment" {
  type        = "string"
  description = "Architecture environment"
}
|
||||||
|
|
||||||
|
# Global DataDog
|
||||||
|
# Required (no default). Passed unchanged as the message of every Redis monitor.
variable "message" {
  description = "Message sent when a Redis monitor is triggered"
}
|
||||||
|
|
||||||
|
# Applied to both evaluation_delay and new_host_delay on every Redis monitor.
variable "delay" {
  description = "Delay in seconds for the metric evaluation"
  default     = 600
}
|
||||||
|
|
||||||
|
# Compared against the string "true" by the "filter" template; any other value
# switches the monitors to filter_tags_custom.
variable "filter_tags_use_defaults" {
  description = "Use default filter tags convention"
  default     = "true"
}
|
||||||
|
|
||||||
|
# Only used when filter_tags_use_defaults is not "true"; the default "*" applies no tag restriction.
variable "filter_tags_custom" {
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
  default     = "*"
}
|
||||||
|
|
||||||
|
# Azure Redis specific
|
||||||
|
# Warning level for the "evictedkeys" monitor (5-minute average of azure.cache_redis.evictedkeys).
variable "evictedkeys_limit_threshold_warning" {
  description = "Evicted keys limit (warning threshold)"
  default     = 0
}
|
||||||
|
|
||||||
|
# Critical level for the "evictedkeys" monitor; also interpolated into its query.
variable "evictedkeys_limit_threshold_critical" {
  description = "Evicted keys limit (critical threshold)"
  default     = 100
}
|
||||||
|
|
||||||
|
# Critical level for the "percent_processor_time" monitor; also interpolated into its query.
variable "percent_processor_time_threshold_critical" {
  description = "Processor time percent (critical threshold)"
  default     = 80
}
|
||||||
|
|
||||||
|
# Warning level for the "percent_processor_time" monitor
# (5-minute average of azure.cache_redis.percent_processor_time).
variable "percent_processor_time_threshold_warning" {
  description = "Processor time percent (warning threshold)"
  default     = 60
}
|
||||||
|
|
||||||
|
# Critical level for the "server_load" monitor; also interpolated into its query.
variable "server_load_rate_threshold_critical" {
  description = "Server CPU load rate (critical threshold)"
  default     = 90
}
|
||||||
|
|
||||||
|
# Warning level for the "server_load" monitor (5-minute average of azure.cache_redis.server_load).
variable "server_load_rate_threshold_warning" {
  description = "Server CPU load rate (warning threshold)"
  default     = 70
}
|
||||||
124
cloud/azure/redis/monitors-azure-redis.tf
Normal file
124
cloud/azure/redis/monitors-azure-redis.tf
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
# Renders the tag filter that every Redis monitor query in this file embeds.
# When filter_tags_use_defaults is the string "true", the conventional tag set
# (dd_monitoring, dd_azure_redis, env:<environment>) is used; otherwise the
# caller-supplied filter_tags_custom value is used as-is.
data "template_file" "filter" {
  # "$$" escapes the interpolation so the template itself is the literal ${filter}.
  template = "$${filter}"

  vars {
    filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_redis:enabled,env:%s", var.environment) : var.filter_tags_custom}"
  }
}
|
||||||
|
|
||||||
|
# Triggers when the 5-minute average of the Redis status metric differs from 1.
# Missing data is also reported (notify_no_data = true, after 20 minutes).
resource "datadog_monitor" "status" {
  type    = "metric alert"
  name    = "[${var.environment}] Redis {{name}} is down"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m):avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {name,resource_group} != 1
EOF

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = true
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# 5-minute average of evicted keys per Redis instance, grouped by name and resource group.
resource "datadog_monitor" "evictedkeys" {
  type    = "metric alert"
  name    = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m): (
avg:azure.cache_redis.evictedkeys{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.evictedkeys_limit_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.evictedkeys_limit_threshold_critical}"
    warning  = "${var.evictedkeys_limit_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# 5-minute average of the Redis processor time percentage, grouped by name and resource group.
resource "datadog_monitor" "percent_processor_time" {
  type    = "metric alert"
  name    = "[${var.environment}] Redis processor time {{value}}% on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m): (
avg:azure.cache_redis.percent_processor_time{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.percent_processor_time_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.percent_processor_time_threshold_critical}"
    warning  = "${var.percent_processor_time_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
|
|
||||||
|
# 5-minute average of the Redis server load metric, grouped by name and resource group.
resource "datadog_monitor" "server_load" {
  type    = "metric alert"
  name    = "[${var.environment}] Redis processor server load {{value}}% on {{name}}"
  message = "${var.message}"
  tags    = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"]

  query = <<EOF
avg(last_5m): (
avg:azure.cache_redis.server_load{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.server_load_rate_threshold_critical}
EOF

  # The critical value must match the one interpolated into the query above.
  thresholds {
    critical = "${var.server_load_rate_threshold_critical}"
    warning  = "${var.server_load_rate_threshold_warning}"
  }

  # Evaluation settings
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification settings
  notify_no_data    = false
  no_data_timeframe = 20
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false
}
|
||||||
49
cloud/azure/sql-database/README.md
Normal file
49
cloud/azure/sql-database/README.md
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
Azure SQL Database DataDog monitors
|
||||||
|
===================================
|
||||||
|
|
||||||
|
How to use this module
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
```
|
||||||
|
module "datadog-monitors-azure-sql-database" {
|
||||||
|
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/sql-database?ref={revision}"
|
||||||
|
|
||||||
|
message = "${module.datadog-message-alerting.alerting-message}"
|
||||||
|
environment = "${var.environment}"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
-------
|
||||||
|
Creates DataDog monitors with the following checks:
|
||||||
|
|
||||||
|
* CPU High
|
||||||
|
* Free disk space low
|
||||||
|
* DTU Consumption high
|
||||||
|
* SQL deadlocks
|
||||||
|
|
||||||
|
Inputs
|
||||||
|
------
|
||||||
|
|
||||||
|
| Name | Description | Type | Default | Required |
|
||||||
|
|------|-------------|:----:|:-----:|:-----:|
|
||||||
|
| cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no |
|
||||||
|
| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `` | no |
|
||||||
|
| deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no |
|
||||||
|
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||||
|
| diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no |
|
||||||
|
| diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no |
|
||||||
|
| dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no |
|
||||||
|
| dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no |
|
||||||
|
| environment | Architecture Environment | string | - | yes |
|
||||||
|
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||||
|
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||||
|
| message | Message sent when an alert is triggered | string | - | yes |
|
||||||
|
|
||||||
|
Related documentation
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
DataDog documentation: [https://docs.datadoghq.com/integrations/azure_sql_database/](https://docs.datadoghq.com/integrations/azure_sql_database/)
|
||||||
|
|
||||||
|
Azure SQL Database metrics documentation: [https://docs.microsoft.com/en-us/azure/sql-database/saas-dbpertenant-log-analytics](https://docs.microsoft.com/en-us/azure/sql-database/saas-dbpertenant-log-analytics)
|
||||||
|
|
||||||
62
cloud/azure/sql-database/inputs.tf
Normal file
62
cloud/azure/sql-database/inputs.tf
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
# Global Terraform

# Environment name. The monitors of this module interpolate it into their
# titles, their "env:" tag and the default tag filter.
variable "environment" {
  type        = "string"
  description = "Architecture Environment"
}
|
||||||
|
|
||||||
|
# Global DataDog

# Delay in seconds; the monitors pass it to both evaluation_delay and
# new_host_delay.
variable "delay" {
  default     = 600
  description = "Delay in seconds for the metric evaluation"
}

# Notification body shared by every monitor. No default, so callers must set it.
variable "message" {
  description = "Message sent when an alert is triggered"
}

# The string "true" selects the conventional tag filter; any other value makes
# the module fall back to filter_tags_custom.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

# Raw Datadog tag filter, only used when the default convention is disabled.
variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}
|
||||||
|
|
||||||
|
# Azure SQL Database specific

# --- CPU ---------------------------------------------------------------------

# NOTE(review): the CPU monitor in monitors-sql-database-basics.tf only reads
# cpu_threshold_critical; this warning value is currently not consumed there.
variable "cpu_threshold_warning" {
  default     = ""
  description = "CPU usage in percent (warning threshold)"
}

variable "cpu_threshold_critical" {
  default     = "90"
  description = "CPU usage in percent (critical threshold)"
}

# --- Disk space --------------------------------------------------------------

variable "diskspace_threshold_warning" {
  default     = "80"
  description = "Disk space used in percent (warning threshold)"
}

variable "diskspace_threshold_critical" {
  default     = "90"
  description = "Disk space used in percent (critical threshold)"
}

# --- DTU ---------------------------------------------------------------------

variable "dtu_threshold_warning" {
  default     = "85"
  description = "Amount of DTU used (warning threshold)"
}

variable "dtu_threshold_critical" {
  default     = "90"
  description = "Amount of DTU used (critical threshold)"
}

# --- Deadlocks ---------------------------------------------------------------

# Only a critical level exists for deadlocks; there is no warning counterpart.
variable "deadlock_threshold_critical" {
  default     = "1"
  description = "Amount of Deadlocks (critical threshold)"
}
|
||||||
129
cloud/azure/sql-database/monitors-sql-database-basics.tf
Normal file
129
cloud/azure/sql-database/monitors-sql-database-basics.tf
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
# Renders the tag filter that every monitor query of this module embeds between
# the metric's curly braces. With filter_tags_use_defaults == "true" the
# conventional tag set (scoped to the environment) is used; otherwise the
# caller-supplied filter_tags_custom is passed through unchanged.
data "template_file" "filter" {
  # "$$" escapes the interpolation so the template itself is the literal ${filter}.
  template = "$${filter}"

  vars {
    filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_sqldatabase:enabled,env:%s", var.environment) : var.filter_tags_custom}"
  }
}
|
||||||
|
|
||||||
|
# CPU monitor: goes critical when the 15-minute average of cpu_percent, grouped
# by database name and resource group, exceeds cpu_threshold_critical.
resource "datadog_monitor" "sql-database_cpu_90_15min" {
  type    = "metric alert"
  name    = "[${var.environment}] SQL Database CPU high > ${var.cpu_threshold_critical}% on {{name}}"
  message = "${var.message}"

  # Heredoc text is kept flush-left: with <<EOF any leading whitespace would
  # become part of the query string.
  query = <<EOF
avg(last_15m): (
avg:azure.sql_servers_databases.cpu_percent{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.cpu_threshold_critical}
EOF

  # Must match the value compared against in the query above.
  thresholds {
    critical = "${var.cpu_threshold_critical}"
  }

  # No-data handling
  notify_no_data    = true
  no_data_timeframe = 20

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour
  renotify_interval = 0
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:sqldatabase", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Disk usage monitor: goes critical when the 15-minute average of
# storage_percent (space USED, in percent), grouped by database name and
# resource group, exceeds diskspace_threshold_critical; warns above
# diskspace_threshold_warning.
#
# The resource address keeps its historical "free_space_low" name so existing
# state is not disturbed, but the monitor title now describes the condition that
# is actually evaluated (used space above the threshold). The previous title
# claimed "free space < X%", which is the opposite of what the query checks.
resource "datadog_monitor" "sql-database_free_space_low" {
  type    = "metric alert"
  name    = "[${var.environment}] SQL Database disk space usage > ${var.diskspace_threshold_critical}% on {{name}}"
  message = "${var.message}"

  # Heredoc text is kept flush-left: with <<EOF any leading whitespace would
  # become part of the query string.
  query = <<EOF
avg(last_15m): (
avg:azure.sql_servers_databases.storage_percent{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.diskspace_threshold_critical}
EOF

  # The critical value must match the one compared against in the query above.
  thresholds {
    warning  = "${var.diskspace_threshold_warning}"
    critical = "${var.diskspace_threshold_critical}"
  }

  # No-data handling
  notify_no_data    = true
  no_data_timeframe = 20

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour
  renotify_interval = 0
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:sqldatabase", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# DTU monitor: goes critical when the 15-minute average of
# dtu_consumption_percent, grouped by database name and resource group, exceeds
# dtu_threshold_critical; warns above dtu_threshold_warning.
resource "datadog_monitor" "sql-database_dtu_consumption_high" {
  type    = "metric alert"
  name    = "[${var.environment}] SQL Database DTU Consumption on {{name}} > ${var.dtu_threshold_critical}"
  message = "${var.message}"

  # The metric is prefixed with the "avg:" space aggregator, as in every other
  # query of this module; the metric name on its own is not a complete Datadog
  # metric query.
  # Heredoc text is kept flush-left: with <<EOF any leading whitespace would
  # become part of the query string.
  query = <<EOF
avg(last_15m): (
avg:azure.sql_servers_databases.dtu_consumption_percent{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.dtu_threshold_critical}
EOF

  # The critical value must match the one compared against in the query above.
  thresholds {
    warning  = "${var.dtu_threshold_warning}"
    critical = "${var.dtu_threshold_critical}"
  }

  # No-data handling
  notify_no_data    = true
  no_data_timeframe = 20

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour
  renotify_interval = 0
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:sqldatabase", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Deadlock monitor: goes critical when the deadlock count summed over the last
# 5 minutes, grouped by database name and resource group, exceeds
# deadlock_threshold_critical. Unlike the other SQL Database monitors it does
# not notify on missing data.
resource "datadog_monitor" "sql-database_deadlocks_count" {
  type    = "metric alert"
  name    = "[${var.environment}] SQL Database Deadlocks too high on {{name}}"
  message = "${var.message}"

  # Heredoc text is kept flush-left: with <<EOF any leading whitespace would
  # become part of the query string.
  query = <<EOF
sum(last_5m): (
avg:azure.sql_servers_databases.deadlock{${data.template_file.filter.rendered}} by {name,resource_group}.as_count()
) > ${var.deadlock_threshold_critical}
EOF

  # Must match the value compared against in the query above.
  thresholds {
    critical = "${var.deadlock_threshold_critical}"
  }

  # No-data handling
  notify_no_data    = false
  no_data_timeframe = 20

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour
  renotify_interval = 0
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:sqldatabase", "team:azure", "provider:azure"]
}
|
||||||
41
cloud/azure/stream-analytics/README.md
Normal file
41
cloud/azure/stream-analytics/README.md
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
Azure Stream Analytics DataDog monitors
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
How to use this module
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
```
|
||||||
|
module "datadog-monitors-azure-stream-analytics" {
|
||||||
|
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/stream-analytics?ref={revision}"
|
||||||
|
|
||||||
|
message = "${module.datadog-message-alerting.alerting-message}"
|
||||||
|
environment = "${var.environment}"
|
||||||
|
subscription_id = "${var.subscription_id}"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Inputs
|
||||||
|
------
|
||||||
|
|
||||||
|
| Name | Description | Type | Default | Required |
|
||||||
|
|------|-------------|:----:|:-----:|:-----:|
|
||||||
|
| conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no |
|
||||||
|
| conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no |
|
||||||
|
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||||
|
| environment | Architecture environment | string | - | yes |
|
||||||
|
| failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no |
|
||||||
|
| function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no |
|
||||||
|
| message | Message sent when a monitor is triggered | string | - | yes |
|
||||||
|
| provider | What is the monitored provider | string | azure | no |
|
||||||
|
| runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no |
|
||||||
|
| runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no |
|
||||||
|
| su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no |
|
||||||
|
| su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no |
|
||||||
|
| service | What is the monitored service | string | storage | no |
|
||||||
|
| subscription_id | Azure account id used as filter for monitors | string | - | yes |
|
||||||
|
| use_filter_tags | Filter the data with service tags if true | string | `true` | no |
|
||||||
|
|
||||||
|
Related documentation
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
DataDog documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/)
|
||||||
66
cloud/azure/stream-analytics/inputs.tf
Normal file
66
cloud/azure/stream-analytics/inputs.tf
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
# Global Terraform

# Environment name. The monitors of this module interpolate it into their
# titles, their "env:" tag and the default tag filter.
variable "environment" {
  type        = "string"
  description = "Architecture environment"
}
|
||||||
|
|
||||||
|
# Global DataDog

# Notification body shared by every Stream Analytics monitor. No default, so
# callers must set it. (The description previously mentioned a "Redis monitor",
# a leftover from the module this file was copied from.)
variable "message" {
  description = "Message sent when a monitor is triggered"
}

# Delay in seconds; the monitors pass it to both evaluation_delay and
# new_host_delay.
variable "delay" {
  description = "Delay in seconds for the metric evaluation"
  default     = 600
}

# The string "true" selects the conventional tag filter; any other value makes
# the module fall back to filter_tags_custom.
variable "filter_tags_use_defaults" {
  description = "Use default filter tags convention"
  default     = "true"
}

# Raw Datadog tag filter, only used when the default convention is disabled.
variable "filter_tags_custom" {
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
  default     = "*"
}
|
||||||
|
|
||||||
|
# Azure Stream Analytics specific

# --- Streaming Unit utilization ------------------------------------------------

variable "su_utilization_threshold_warning" {
  default     = 60
  description = "Streaming Unit utilization rate limit (warning threshold)"
}

variable "su_utilization_threshold_critical" {
  default     = 80
  description = "Streaming Unit utilization rate limit (critical threshold)"
}

# --- Failed function requests ----------------------------------------------------
# NOTE(review): the warning variable has no "failed_" prefix while the critical
# one does. The names are this module's public interface, so they are kept as
# they are.

variable "function_requests_threshold_warning" {
  default     = 0
  description = "Failed Function Request rate limit (warning threshold)"
}

variable "failed_function_requests_threshold_critical" {
  default     = 10
  description = "Failed Function Request rate limit (critical threshold)"
}

# --- Conversion errors -----------------------------------------------------------

variable "conversion_errors_threshold_warning" {
  default     = 0
  description = "Conversion errors limit (warning threshold)"
}

variable "conversion_errors_threshold_critical" {
  default     = 10
  description = "Conversion errors limit (critical threshold)"
}

# --- Runtime errors --------------------------------------------------------------

variable "runtime_errors_threshold_warning" {
  default     = 0
  description = "Runtime errors limit (warning threshold)"
}

variable "runtime_errors_threshold_critical" {
  default     = 10
  description = "Runtime errors limit (critical threshold)"
}
|
||||||
147
cloud/azure/stream-analytics/monitors-stream-analytics.tf
Normal file
147
cloud/azure/stream-analytics/monitors-stream-analytics.tf
Normal file
@ -0,0 +1,147 @@
|
|||||||
|
# Renders the tag filter that every monitor query of this module embeds between
# the metric's curly braces. With filter_tags_use_defaults == "true" the
# conventional tag set (scoped to the environment) is used; otherwise the
# caller-supplied filter_tags_custom is passed through unchanged.
data "template_file" "filter" {
  # "$$" escapes the interpolation so the template itself is the literal ${filter}.
  template = "$${filter}"

  vars {
    filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_streamanalytics:enabled,env:%s", var.environment) : var.filter_tags_custom}"
  }
}
|
||||||
|
|
||||||
|
# Status monitor: alerts when the 5-minute average of the streaming job status
# metric, grouped by job name and resource group, drops below 1. It also
# notifies when the metric stops reporting. The threshold is hard-coded in the
# query, so there is no thresholds block.
resource "datadog_monitor" "status" {
  type    = "metric alert"
  name    = "[${var.environment}] Stream Analytics Status is not ok on {{name}}"
  message = "${var.message}"

  # Heredoc text is kept flush-left: with <<EOF any leading whitespace would
  # become part of the query string.
  query = <<EOF
avg(last_5m):avg:azure.streamanalytics_streamingjobs.status{${data.template_file.filter.rendered}} by {name,resource_group} < 1
EOF

  # No-data handling
  notify_no_data    = true
  no_data_timeframe = 20

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour
  renotify_interval = 0
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Streaming Unit utilization monitor: goes critical when the 5-minute average of
# resource_utilization, grouped by job name and resource group, exceeds
# su_utilization_threshold_critical; warns above the matching warning threshold.
resource "datadog_monitor" "su_utilization" {
  type    = "metric alert"
  name    = "[${var.environment}] Stream Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}"
  message = "${var.message}"

  # Heredoc text is kept flush-left: with <<EOF any leading whitespace would
  # become part of the query string.
  query = <<EOF
avg(last_5m): (
avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.su_utilization_threshold_critical}
EOF

  # The critical value must match the one compared against in the query above.
  thresholds {
    warning  = "${var.su_utilization_threshold_warning}"
    critical = "${var.su_utilization_threshold_critical}"
  }

  # No-data handling
  notify_no_data    = false
  no_data_timeframe = 20

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour
  renotify_interval = 0
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Failed function requests monitor: computes failed callouts divided by total
# callouts (both as counts, grouped by job name and resource group), times 100,
# over the last 5 minutes, and goes critical above
# failed_function_requests_threshold_critical. This is the only monitor of the
# module with a non-zero renotify_interval.
resource "datadog_monitor" "failed_function_requests" {
  type    = "metric alert"
  name    = "[${var.environment}] Stream Analytics more than ${var.failed_function_requests_threshold_critical} failed function requests on {{name}}"
  message = "${var.message}"

  # Heredoc text is kept flush-left: with <<EOF any leading whitespace would
  # become part of the query string.
  query = <<EOF
avg(last_5m): (
avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() /
avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count()
) * 100 > ${var.failed_function_requests_threshold_critical}
EOF

  # The warning variable has no "failed_" prefix; that is its declared name.
  thresholds {
    warning  = "${var.function_requests_threshold_warning}"
    critical = "${var.failed_function_requests_threshold_critical}"
  }

  # No-data handling
  notify_no_data    = false
  no_data_timeframe = 20

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour
  renotify_interval = 60
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Conversion errors monitor: goes critical when the 5-minute average of
# conversion_errors, grouped by job name and resource group, exceeds
# conversion_errors_threshold_critical; warns above the matching warning
# threshold.
resource "datadog_monitor" "conversion_errors" {
  type    = "metric alert"
  name    = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}"
  message = "${var.message}"

  # Heredoc text is kept flush-left: with <<EOF any leading whitespace would
  # become part of the query string.
  query = <<EOF
avg(last_5m): (
avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.conversion_errors_threshold_critical}
EOF

  # The critical value must match the one compared against in the query above.
  thresholds {
    warning  = "${var.conversion_errors_threshold_warning}"
    critical = "${var.conversion_errors_threshold_critical}"
  }

  # No-data handling
  notify_no_data    = false
  no_data_timeframe = 20

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour
  renotify_interval = 0
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
}
|
||||||
|
|
||||||
|
# Runtime errors monitor: goes critical when the 5-minute average of the
# streaming job errors metric, grouped by job name and resource group, exceeds
# runtime_errors_threshold_critical; warns above the matching warning threshold.
resource "datadog_monitor" "runtime_errors" {
  type    = "metric alert"
  name    = "[${var.environment}] Stream Analytics more than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}"
  message = "${var.message}"

  # Heredoc text is kept flush-left: with <<EOF any leading whitespace would
  # become part of the query string.
  query = <<EOF
avg(last_5m): (
avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group}
) > ${var.runtime_errors_threshold_critical}
EOF

  # The critical value must match the one compared against in the query above.
  thresholds {
    warning  = "${var.runtime_errors_threshold_warning}"
    critical = "${var.runtime_errors_threshold_critical}"
  }

  # No-data handling
  notify_no_data    = false
  no_data_timeframe = 20

  # Evaluation timing
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  require_full_window = true

  # Notification behaviour
  renotify_interval = 0
  notify_audit      = false
  timeout_h         = 0
  include_tags      = true
  locked            = false

  tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
}
|
||||||
@ -11,5 +11,7 @@ EOF
|
|||||||
vars {
|
vars {
|
||||||
oncall_24x7 = "${var.oncall_24x7}"
|
oncall_24x7 = "${var.oncall_24x7}"
|
||||||
oncall_office_hours = "${var.oncall_office_hours}"
|
oncall_office_hours = "${var.oncall_office_hours}"
|
||||||
|
prepend_text = "${var.prepend_text}"
|
||||||
|
append_text = "${var.append_text}"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user