MON-77 Some documentation & lower threshold levels

This commit is contained in:
Laurent Piroelle 2017-10-31 08:51:34 +01:00
parent 1768c1621f
commit 0f22d51e9c
3 changed files with 74 additions and 17 deletions

View File

@ -0,0 +1,53 @@
Event Hub Datadog monitor
=========================
How to use this module
----------------------
```
module "datadog-monitors-azure-eventhub" {
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/eventhub?ref={revision}"
message = "${module.datadog-message-alerting.alerting-message}"
environment = "${var.environment}"
}
```
Purpose
-------
Creates Datadog monitors with the following checks:
* Service status check
* Failed requests ratio
* Erroneous requests ratio
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
| environment | Architecture environment | string | - | yes |
| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no |
| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no |
| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no |
| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| use_filter_tags | Filter the data with service tags if true | string | `true` | no |
Outputs
-------
| Name | Description |
|------|-------------|
| errors_monitor_id | Id of the `errors` monitor |
| failed_requests_monitor_id | Id of the `failed requests` monitor |
| status_monitor_id | Id of the `status` monitor |
Related documentation
---------------------
* Datadog documentation: [https://docs.datadoghq.com/integrations/azure_event_hub/](https://docs.datadoghq.com/integrations/azure_event_hub/)
* Azure metrics documentation: [https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-metrics-azure-monitor](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-metrics-azure-monitor)

View File

@ -1,31 +1,35 @@
# Environment name; the monitors use it as a "[${var.environment}]" prefix in their names.
# Required input: no default is provided.
variable "environment" {
  description = "Architecture environment"
}
variable "down_message" {}
variable "failed_requests_message" {}
variable "errors_message" {}
# Notification text referenced as var.message by the monitor resources.
# Required input: no default is provided.
variable "message" {
  description = "Message sent when an alert is triggered"
}
# Evaluation delay, in seconds, passed to the monitors' evaluation_delay setting.
variable "delay" {
  description = "Delay in seconds for the metric evaluation"
  default     = 600
}
variable "failed_requests_rate_thresold_critical" {
default = 5
description = "Failed requests ratio (percentage) to trigger the critical alert"
default = 3
}
variable "failed_requests_rate_thresold_warning" {
default = 3
description = "Failed requests ratio (percentage) to trigger a warning alert"
default = 1
}
variable "errors_rate_thresold_critical" {
default = 5
}
variable "errors_rate_thresold_warning" {
description = "Errors ratio (percentage) to trigger the critical alert"
default = 3
}
variable "errors_rate_thresold_warning" {
description = "Errors ratio (percentage) to trigger a warning alert"
default = 1
}
# Toggle for filtering the monitored data by service tags.
# Note the default is the string "true", not a boolean literal.
variable "use_filter_tags" {
  description = "Filter the data with service tags if true"
  default     = "true"
}

View File

@ -9,7 +9,7 @@ data "template_file" "filter" {
resource "datadog_monitor" "eventhub_status" {
name = "[${var.environment}] Event Hub status"
message = "${var.down_message}"
message = "${var.message}"
query = <<EOF
avg(last_5m): avg:azure.eventhub_namespaces.status{${data.template_file.filter.rendered}} by {name,resource_group,region} != 1
@ -18,7 +18,7 @@ resource "datadog_monitor" "eventhub_status" {
notify_no_data = true
evaluation_delay = "${var.delay}"
renotify_interval = 60
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
@ -30,7 +30,7 @@ resource "datadog_monitor" "eventhub_status" {
resource "datadog_monitor" "eventhub_failed_requests" {
name = "[${var.environment}] Event Hub failed requests"
message = "${var.failed_requests_message}"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
@ -49,7 +49,7 @@ resource "datadog_monitor" "eventhub_failed_requests" {
notify_no_data = false
evaluation_delay = "${var.delay}"
renotify_interval = 60
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
@ -61,7 +61,7 @@ resource "datadog_monitor" "eventhub_failed_requests" {
resource "datadog_monitor" "eventhub_errors" {
name = "[${var.environment}] Event Hub errors"
message = "${var.errors_message}"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
@ -84,7 +84,7 @@ resource "datadog_monitor" "eventhub_errors" {
notify_no_data = false
evaluation_delay = "${var.delay}"
renotify_interval = 60
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true