MON-74 Fix changes to fit as the other modules

This commit is contained in:
Alexandre Gaillet 2017-11-02 16:54:18 +01:00 committed by Laurent Piroelle
parent dc06fb9519
commit 7f0821b8bc
3 changed files with 86 additions and 140 deletions

View File

@ -28,52 +28,64 @@ Creates a DataDog monitors with the following checks :
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| client_name | Client Name | string | - | yes |
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
| environment | Architecture environment | string | - | yes |
| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no |
| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no |
| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no |
| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no |
| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no |
| http_2xx_status_rate_limit | | string | `30` | no |
| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no |
| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no |
| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `<list>` | no |
| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no |
| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `<list>` | no |
| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no |
| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no |
| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no |
| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no |
| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no |
| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no |
| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no |
| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no |
| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no |
| http_404_errors_count_rate_limit | | string | `30` | no |
| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no |
| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no |
| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `<list>` | no |
| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no |
| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `<list>` | no |
| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no |
| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no |
| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no |
| memory_usage_appserv_eval_delay | | string | `600` | no |
| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no |
| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no |
| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no |
| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no |
| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no |
| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `<list>` | no |
| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no |
| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no |
| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no |
| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `<list>` | no |
| memory_usage_threshold_critical | Alerting threshold in MiB | string | `52430000` | no |
| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no |
| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no |
| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no |
| message | Message sent when a monitor is triggered | string | - | yes |
| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no |
| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no |
| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no |
| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no |
| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no |
| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no |
| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `<list>` | no |
| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no |
| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no |
| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `<list>` | no |
| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no |
| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no |
| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no |
| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no |
| use_filter_tags | Filter the data with service tags if true | string | `true` | no |
Related documentation

View File

@ -3,6 +3,11 @@ variable "environment" {
type = "string"
}
# Name of the client. No default is declared, so callers of this module
# must supply a value.
variable "client_name" {
description = "Client Name"
type = "string"
}
variable "use_filter_tags" {
description = "Filter the data with service tags if true"
default = "true"
@ -12,15 +17,15 @@ variable "message" {
description = "Message sent when a monitor is triggered"
}
###################################
### RESPONSE TIME VARIABLES ###
###################################
variable "response_time_appserv_eval_delay" {
# Shared delay in seconds (default 600). The datadog_monitor resources read it
# as var.delay for both evaluation_delay and new_host_delay.
variable "delay" {
description = "Delay in seconds for the metric evaluation"
default = 600
}
###################################
### RESPONSE TIME VARIABLES ###
###################################
variable "response_time_threshold_critical" {
default = 0.8
description = "Alerting threshold in seconds"
@ -51,34 +56,15 @@ variable "response_time_timeout_h" {
description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
}
variable "response_time_include_tags" {
default = false
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
}
# variable "response_time_notify_no_data" {
# default = true
# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
# }
variable "response_time_renotify_interval" {
default = 0
description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
}
variable "response_time_escalation_message" {
default = "Escalation message @pagerduty"
description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
}
###################################
### MEMORY USAGE VARIABLES ###
###################################
variable "memory_usage_appserv_eval_delay" {
default = 600
}
variable "memory_usage_threshold_critical" {
default = 52430000
description = "Alerting threshold in Mib"
@ -109,26 +95,11 @@ variable "memory_usage_timeout_h" {
description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
}
variable "memory_usage_include_tags" {
default = false
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
}
# variable "memory_usage_notify_no_data" {
# default = true
# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
# }
variable "memory_usage_renotify_interval" {
default = 0
description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
}
variable "memory_usage_escalation_message" {
default = "Escalation message @pagerduty"
description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
}
#################################
### HTTP 404 status pages ###
#################################
@ -137,10 +108,6 @@ variable "http_404_errors_count_rate_limit" {
default = 30
}
variable "http_404_errors_count_rate_appserv_eval_delay" {
default = 600
}
variable "http_404_errors_count_rate_threshold_critical" {
default = 30
description = "Alerting threshold (number of requests)"
@ -171,26 +138,11 @@ variable "http_404_errors_count_rate_timeout_h" {
description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
}
variable "http_404_errors_count_rate_include_tags" {
default = false
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
}
# variable "http_404_errors_count_rate_notify_no_data" {
# default = true
# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
# }
variable "http_404_errors_count_rate_renotify_interval" {
default = 0
description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
}
variable "http_404_errors_count_rate_escalation_message" {
default = "Escalation message @pagerduty"
description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
}
#################################
### HTTP 2xx status pages ###
#################################
@ -199,10 +151,6 @@ variable "http_2xx_status_rate_limit" {
default = 30
}
variable "http_2xx_status_rate_appserv_eval_delay" {
default = 600
}
variable "http_2xx_status_rate_threshold_critical" {
default = 0.9
description = "Alerting threshold (percentage)"
@ -233,22 +181,8 @@ variable "http_2xx_status_rate_timeout_h" {
description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
}
variable "http_2xx_status_rate_include_tags" {
default = false
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
}
# variable "http_2xx_status_rate_notify_no_data" {
# default = true
# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
# }
variable "http_2xx_status_rate_renotify_interval" {
default = 0
description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
}
variable "http_2xx_status_rate_escalation_message" {
default = "Escalation message @pagerduty"
description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
}

View File

@ -8,10 +8,9 @@ data "template_file" "filter" {
# Monitoring App Services response time
resource "datadog_monitor" "appservices_response_time" {
name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s"
type = "query alert"
message = "${var.message}"
escalation_message = "${var.response_time_escalation_message}"
name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s"
type = "query alert"
message = "${var.message}"
query = <<EOF
avg(last_${var.response_time_last_time_window_code}): (
@ -19,7 +18,8 @@ resource "datadog_monitor" "appservices_response_time" {
) >= ${var.response_time_threshold_critical}
EOF
evaluation_delay = "${var.response_time_appserv_eval_delay}"
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
thresholds {
warning = "${var.response_time_threshold_warning}"
@ -27,21 +27,20 @@ resource "datadog_monitor" "appservices_response_time" {
}
notify_no_data = true # Will notify when no data is received
renotify_interval = "${var.response_time_renotify_interval}"
renotify_interval = 0
require_full_window = "${var.response_time_require_full_window}"
timeout_h = "${var.response_time_timeout_h}"
include_tags = "${var.response_time_include_tags}"
include_tags = true
tags = "${var.response_time_tags}"
}
# Monitoring App Services memory usage
resource "datadog_monitor" "appservices_memory_usage_count" {
name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB"
type = "query alert"
message = "${var.message}"
escalation_message = "${var.memory_usage_escalation_message}"
name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB"
type = "query alert"
message = "${var.message}"
query = <<EOF
avg(last_${var.memory_usage_last_time_window_code}): (
@ -49,7 +48,8 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
) >= ${var.memory_usage_threshold_critical}
EOF
evaluation_delay = "${var.memory_usage_appserv_eval_delay}"
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
thresholds {
warning = "${var.memory_usage_threshold_warning}"
@ -57,21 +57,20 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
}
notify_no_data = true # Will notify when no data is received
renotify_interval = "${var.memory_usage_renotify_interval}"
renotify_interval = 0
require_full_window = "${var.memory_usage_require_full_window}"
timeout_h = "${var.memory_usage_timeout_h}"
include_tags = "${var.memory_usage_include_tags}"
include_tags = true
tags = "${var.memory_usage_tags}"
}
# Monitoring App Services 404 errors rate
resource "datadog_monitor" "appservices_http_404_errors_count" {
name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit"
type = "query alert"
message = "${var.message}"
escalation_message = "${var.http_404_errors_count_rate_escalation_message}"
name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit"
type = "query alert"
message = "${var.message}"
query = <<EOF
max(last_${var.http_404_errors_count_rate_last_time_window_code}): (
@ -79,38 +78,39 @@ resource "datadog_monitor" "appservices_http_404_errors_count" {
) > ${var.http_404_errors_count_rate_threshold_critical}
EOF
evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}"
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
thresholds {
warning = "${var.http_404_errors_count_rate_threshold_warning}"
critical = "${var.http_404_errors_count_rate_threshold_critical}"
}
notify_no_data = false # Will NOT notify when no data is received
renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}"
notify_no_data = false # Will NOT notify when no data is received
renotify_interval = 0
require_full_window = true
timeout_h = "${var.http_404_errors_count_rate_timeout_h}"
include_tags = "${var.http_404_errors_count_rate_include_tags}"
include_tags = true
tags = "${var.http_404_errors_count_rate_tags}"
}
# Monitoring App Services HTTP 2xx status pages rate
resource "datadog_monitor" "appservices_http_2xx_status_rate" {
name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests"
type = "query alert"
message = "${var.message}"
escalation_message = "${var.http_2xx_status_rate_escalation_message}"
name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests"
type = "query alert"
message = "${var.message}"
query = <<EOF
query = <<EOF
avg(last_${var.http_2xx_status_rate_last_time_window_code}): (
avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() /
avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count()
) < ${var.http_2xx_status_rate_threshold_critical}
EOF
evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}"
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
thresholds {
warning = "${var.http_2xx_status_rate_threshold_warning}"
@ -119,11 +119,11 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" {
# Will notify when no data is received
notify_no_data = true
renotify_interval = "${var.http_2xx_status_rate_renotify_interval}"
renotify_interval = 0
require_full_window = true
timeout_h = "${var.http_2xx_status_rate_timeout_h}"
include_tags = "${var.http_2xx_status_rate_include_tags}"
include_tags = true
tags = "${var.http_2xx_status_rate_tags}"
}